diff --git a/Makefile.crtools b/Makefile.crtools
index 9156fe55d..650b9b0c3 100644
--- a/Makefile.crtools
+++ b/Makefile.crtools
@@ -61,6 +61,7 @@ obj-y	+= kerndat.o
 obj-y	+= stats.o
 obj-y	+= cgroup.o
 obj-y	+= timerfd.o
+obj-y	+= aio.o
 obj-y	+= string.o
 obj-y	+= sigframe.o
 ifeq ($(VDSO),y)
diff --git a/aio.c b/aio.c
new file mode 100644
index 000000000..86cb0c67c
--- /dev/null
+++ b/aio.c
@@ -0,0 +1,145 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include "vma.h"
+#include "xmalloc.h"
+#include "aio.h"
+#include "parasite.h"
+#include "parasite-syscall.h"
+#include "protobuf/mm.pb-c.h"
+
+/*
+ * Append an aio_ring_entry describing the ring mapped at @vma to
+ * the @mme image. Returns 0 on success or -1 when out of memory,
+ * in which case @mme is left as it was.
+ */
+int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
+{
+	int nr = mme->n_aios;
+	AioRingEntry **aios;
+	AioRingEntry *re;
+
+	pr_info("Dumping AIO ring @%#lx, %u reqs\n",
+			(unsigned long)vma->e->start, vma->aio_nr_req);
+
+	re = xmalloc(sizeof(*re));
+	if (!re)
+		return -1;
+
+	/* Don't lose the old array if it can't be grown */
+	aios = xrealloc(mme->aios, (nr + 1) * sizeof(*aios));
+	if (!aios) {
+		xfree(re);
+		return -1;
+	}
+
+	aio_ring_entry__init(re);
+	re->id = vma->e->start;
+	re->nr_req = vma->aio_nr_req;
+	re->ring_len = vma->e->end - vma->e->start;
+	aios[nr] = re;
+	mme->aios = aios;
+	mme->n_aios = nr + 1;
+	return 0;
+}
+
+/* Free the entries collected by dump_aio_ring() and their array */
+void free_aios(MmEntry *mme)
+{
+	int i;
+
+	if (mme->aios) {
+		for (i = 0; i < mme->n_aios; i++)
+			xfree(mme->aios[i]);
+		xfree(mme->aios);
+	}
+}
+
+static unsigned int aio_estimate_nr_reqs(unsigned int k_max_reqs)
+{
+	/*
+	 * Kernel does
+	 *
+	 * nr_reqs = max(nr_reqs, nr_cpus * 4)
+	 * nr_reqs *= 2
+	 * nr_reqs += 2
+	 * ring = roundup(sizeof(head) + nr_reqs * sizeof(req))
+	 * nr_reqs = (ring - sizeof(head)) / sizeof(req)
+	 *
+	 * And the k_max_reqs here is the resulting value.
+	 *
+	 * We need to get the initial nr_reqs that would grow
+	 * up back to the k_max_reqs.
+	 */
+
+	return (k_max_reqs - 2) / 2;
+}
+
+/* Size of the parasite args area needed to check all AIO rings in @vmas */
+unsigned long aio_rings_args_size(struct vm_area_list *vmas)
+{
+	return sizeof(struct parasite_check_aios_args) +
+		vmas->nr_aios * sizeof(struct parasite_aio);
+}
+
+/*
+ * Ask the parasite to validate every AIO ring found in @vmas and
+ * store the estimated number of requests in the ring's vma_area.
+ * Returns 0 on success (or when there are no rings), -1 on error.
+ */
+int parasite_check_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas)
+{
+	struct vma_area *vma;
+	struct parasite_check_aios_args *aa;
+	struct parasite_aio *pa;
+	unsigned int i;
+
+	if (!vmas->nr_aios)
+		return 0;
+
+	pr_info("Checking AIO rings\n");
+
+	/*
+	 * Go to parasite and
+	 * a) check that no requests are currently pending
+	 * b) get the maximum number of requests kernel handles
+	 *    to estimate what was the user request on ring
+	 *    creation.
+	 */
+
+	aa = parasite_args_s(ctl, aio_rings_args_size(vmas));
+	pa = &aa->ring[0];
+	list_for_each_entry(vma, &vmas->h, list) {
+		if (!vma_area_is(vma, VMA_AREA_AIORING))
+			continue;
+
+		pr_debug(" `- Ring #%ld @%#lx\n",
+				(long)(pa - &aa->ring[0]),
+				(unsigned long)vma->e->start);
+		pa->ctx = vma->e->start;
+		pa->max_reqs = 0;
+		pa->vma_nr_reqs = &vma->aio_nr_req;
+		pa++;
+	}
+	aa->nr_rings = vmas->nr_aios;
+
+	if (parasite_execute_daemon(PARASITE_CMD_CHECK_AIOS, ctl))
+		return -1;
+
+	for (i = 0; i < vmas->nr_aios; i++) {
+		pa = &aa->ring[i];
+
+		/* Less than 2 would wrap around in the estimation below */
+		if (pa->max_reqs < 2) {
+			pr_err("Bogus size of AIO ring #%u: %u reqs\n",
+					i, pa->max_reqs);
+			return -1;
+		}
+
+		*pa->vma_nr_reqs = aio_estimate_nr_reqs(pa->max_reqs);
+		pr_debug(" `- Ring #%u has %u reqs, estimated to %u\n", i,
+				pa->max_reqs, *pa->vma_nr_reqs);
+	}
+
+	return 0;
+}
diff --git a/cr-dump.c b/cr-dump.c
index 98db10f81..31afaa07c 100644
--- a/cr-dump.c
+++ b/cr-dump.c
@@ -72,6 +72,7 @@
 #include "irmap.h"
 #include "sysfs_parse.h"
 #include "action-scripts.h"
+#include "aio.h"
 
 #include "asm/dump.h"
 
@@ -464,6 +465,12 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat,
 			goto err;
 
 		mme.vmas[i++] = vma;
+
+		if (vma_entry_is(vma, VMA_AREA_AIORING)) {
+			ret = dump_aio_ring(&mme, vma_area);
+			if (ret)
+				goto err;
+		}
 	}
 
 	mme.mm_start_code = stat->start_code;
@@ -496,6 +503,7 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat,
 
 	ret = pb_write_one(img_from_set(imgset, CR_FD_MM), &mme, PB_MM);
 	xfree(mme.mm_saved_auxv);
+	free_aios(&mme);
 err:
 	return ret;
 }
@@ -1566,6 +1574,12 @@ static int dump_one_task(struct pstree_item *item)
 		goto err_cure_imgset;
 	}
 
+	ret = parasite_check_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */
+	if (ret) {
+		pr_err("Failed to check aio rings (pid: %d)\n", pid);
+		goto err_cure_imgset;
+	}
+
 	ret = parasite_dump_misc_seized(parasite_ctl, &misc);
 	if (ret) {
 		pr_err("Can't dump misc (pid: %d)\n", pid);
diff --git a/include/aio.h b/include/aio.h
new file mode 100644
index 000000000..af7a046e7
--- /dev/null
+++ b/include/aio.h
@@ -0,0 +1,15 @@
+#ifndef __CR_AIO_H__
+#define __CR_AIO_H__
+
+#include "protobuf/mm.pb-c.h"
+
+struct vma_area;
+struct vm_area_list;
+struct parasite_ctl;
+
+int dump_aio_ring(MmEntry *mme, struct vma_area *vma);
+void free_aios(MmEntry *mme);
+int parasite_check_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas);
+unsigned long aio_rings_args_size(struct vm_area_list *vmas);
+
+#endif /* __CR_AIO_H__ */
diff --git a/include/image.h b/include/image.h
index e02fa0e02..9c711c0a7 100644
--- a/include/image.h
+++ b/include/image.h
@@ -54,6 +54,7 @@
 #define VMA_AREA_SYSVIPC	(1 << 10)
 #define VMA_AREA_SOCKET		(1 << 11)
 #define VMA_AREA_VVAR		(1 << 12)
+#define VMA_AREA_AIORING	(1 << 13)
 
 #define VMA_UNSUPP		(1 << 31)	/* Unsupported VMA */
 
diff --git a/include/parasite.h b/include/parasite.h
index 1ff115ff7..365e6746d 100644
--- a/include/parasite.h
+++ b/include/parasite.h
@@ -47,6 +47,7 @@ enum {
 	PARASITE_CMD_GET_PROC_FD,
 	PARASITE_CMD_DUMP_TTY,
 	PARASITE_CMD_CHECK_VDSO_MARK,
+	PARASITE_CMD_CHECK_AIOS,
 
 	PARASITE_CMD_MAX,
 };
@@ -133,6 +134,22 @@ struct parasite_dump_posix_timers_args {
 	struct posix_timer timer[0];
 };
 
+/*
+ * One AIO ring to be checked by the parasite. The ctx is the ring
+ * address and max_reqs is filled by the parasite; the vma_nr_reqs
+ * is only dereferenced on the criu side.
+ */
+struct parasite_aio {
+	unsigned long ctx;
+	unsigned int max_reqs;
+	unsigned int *vma_nr_reqs;
+};
+
+struct parasite_check_aios_args {
+	unsigned nr_rings;
+	struct parasite_aio ring[0];
+};
+
 static inline int posix_timers_dump_size(int timer_n)
 {
 	return sizeof(int) + sizeof(struct posix_timer) * timer_n;
diff --git a/include/vma.h b/include/vma.h
index d2ce80c47..878658cae 100644
--- a/include/vma.h
+++ b/include/vma.h
@@ -7,6 +7,7 @@
 struct vm_area_list {
 	struct list_head	h;
 	unsigned		nr;
+	unsigned int		nr_aios;
 	unsigned long		priv_size; /* nr of pages in private VMAs */
 	unsigned long		longest; /* nr of pages in longest VMA */
 };
@@ -35,9 +36,12 @@ struct vma_area {
 		 * The file_fd is an fd for a regular file and
 		 * the socket_id is the inode number of the
 		 * mapped (PF_PACKET) socket.
+		 *
+		 * The aio_nr_req is only for aio rings.
 		 */
 		int	vm_file_fd;
 		int	vm_socket_id;
+		unsigned int aio_nr_req;
 	};
 
 	char		*aufs_rpath;	/* path from aufs root */
diff --git a/parasite-syscall.c b/parasite-syscall.c
index 96c21600e..e8eeffba4 100644
--- a/parasite-syscall.c
+++ b/parasite-syscall.c
@@ -28,6 +28,7 @@
 #include "mem.h"
 #include "vma.h"
 #include "proc_parse.h"
+#include "aio.h"
 
 #include <string.h>
 #include <stdlib.h>
@@ -1193,6 +1194,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item,
 		return NULL;
 
 	parasite_ensure_args_size(dump_pages_args_size(vma_area_list));
+	parasite_ensure_args_size(aio_rings_args_size(vma_area_list));
 
 	/*
 	 * Inject a parasite engine. Ie allocate memory inside alien
diff --git a/pie/parasite.c b/pie/parasite.c
index 9557c55ac..5aef2d55a 100644
--- a/pie/parasite.c
+++ b/pie/parasite.c
@@ -330,6 +330,79 @@ static inline int tty_ioctl(int fd, int cmd, int *arg)
 	return 0;
 }
 
+/*
+ * Stolen from kernel/fs/aio.c
+ *
+ * Is it valid to go to memory and check it? Should be,
+ * as libaio does the same.
+ */
+
+#define AIO_RING_MAGIC			0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES	1
+#define AIO_RING_INCOMPAT_FEATURES	0
+
+struct aio_ring {
+	unsigned	id;	/* kernel internal index number */
+	unsigned	nr;	/* number of io_events */
+	unsigned	head;	/* Written to by userland or under ring_lock
+				 * mutex by aio_read_events_ring(). */
+	unsigned	tail;
+
+	unsigned	magic;
+	unsigned	compat_features;
+	unsigned	incompat_features;
+	unsigned	header_length;	/* size of aio_ring */
+
+
+	/* struct io_event io_events[0]; */
+};
+
+/* Check the ring header fields against the values expected above */
+static int sane_ring(struct aio_ring *ring)
+{
+	return ring->magic == AIO_RING_MAGIC &&
+		ring->compat_features == AIO_RING_COMPAT_FEATURES &&
+		ring->incompat_features == AIO_RING_INCOMPAT_FEATURES &&
+		ring->header_length == sizeof(struct aio_ring);
+}
+
+/*
+ * Walk the rings passed in @args, refuse insane ones and those
+ * whose head differs from tail, and report the nr field of each
+ * ring back in max_reqs. Returns 0 on success, -1 otherwise.
+ */
+static int parasite_check_aios(struct parasite_check_aios_args *args)
+{
+	int i;
+
+	for (i = 0; i < args->nr_rings; i++) {
+		struct aio_ring *ring;
+
+		ring = (struct aio_ring *)args->ring[i].ctx;
+		if (!sane_ring(ring)) {
+			pr_err("Not valid ring #%d\n", i);
+			pr_info(" `- magic %x\n", ring->magic);
+			pr_info(" `- cf %d\n", ring->compat_features);
+			pr_info(" `- if %d\n", ring->incompat_features);
+			pr_info(" `- size %d (%ld)\n", ring->header_length, sizeof(struct aio_ring));
+			return -1;
+		}
+
+		/*
+		 * XXX what else can we do if there are requests
+		 * in the ring?
+		 */
+		if (ring->head != ring->tail) {
+			pr_err("Pending AIO requests in ring #%d\n", i);
+			return -1;
+		}
+
+		args->ring[i].max_reqs = ring->nr;
+	}
+
+	return 0;
+}
+
 static int parasite_dump_tty(struct parasite_tty_args *args)
 {
 	int ret;
@@ -541,6 +614,9 @@ static noinline __used int noinline parasite_daemon(void *args)
 		case PARASITE_CMD_DUMP_TTY:
 			ret = parasite_dump_tty(args);
 			break;
+		case PARASITE_CMD_CHECK_AIOS:
+			ret = parasite_check_aios(args);
+			break;
 #ifdef CONFIG_VDSO
 		case PARASITE_CMD_CHECK_VDSO_MARK:
 			ret = parasite_check_vdso_mark(args);
diff --git a/proc_parse.c b/proc_parse.c
index 7d21eb913..3da1d9eb6 100644
--- a/proc_parse.c
+++ b/proc_parse.c
@@ -43,6 +43,12 @@ static char *buf = __buf.buf;
 
 #define BUF_SIZE sizeof(__buf.buf)
 
+/*
+ * This is what AIO ring buffers look like in proc
+ */
+
+#define AIO_FNAME	"/[aio]"
+
 int parse_cpuinfo_features(int (*handler)(char *tok))
 {
 	FILE *cpuinfo;
@@ -191,7 +197,7 @@ static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b)
 			(a->dev_min ^ b->dev_min)) == 0;
 }
 
-static int vma_get_mapfile(struct vma_area *vma, DIR *mfd,
+static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd,
 		struct vma_file_info *vfi, struct vma_file_info *prev_vfi)
 {
 	char path[32];
@@ -244,13 +250,22 @@ static int vma_get_mapfile(struct vma_area *vma, DIR *mfd,
 			if (fstatat(dirfd(mfd), path, &buf, 0))
 				return -1;
 
-			if (!S_ISSOCK(buf.st_mode))
-				return -1;
+			if (S_ISSOCK(buf.st_mode)) {
+				pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start);
+				vma->vm_socket_id = buf.st_ino;
+				vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
+				return 0;
+			}
 
-			vma->vm_socket_id = buf.st_ino;
-			pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start);
-			vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
-			return 0;
+			if ((buf.st_mode & S_IFMT) == 0 && !strcmp(fname, AIO_FNAME)) {
+				/* AIO ring, let's try */
+				close(vma->vm_file_fd);
+				vma->aio_nr_req = -1;
+				vma->e->status = VMA_AREA_AIORING;
+				return 0;
+			}
+
+			pr_err("Unknown mapped file mode %o (%s)\n", buf.st_mode, fname);
 		}
 
 		return -1;
@@ -325,6 +340,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list)
 	struct bfd f;
 
 	vma_area_list->nr = 0;
+	vma_area_list->nr_aios = 0;
 	vma_area_list->longest = 0;
 	vma_area_list->priv_size = 0;
 	INIT_LIST_HEAD(&vma_area_list->h);
@@ -417,7 +433,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list)
 		vma_area->e->pgoff	= pgoff;
 		vma_area->e->prot	= PROT_NONE;
 
-		if (vma_get_mapfile(vma_area, map_files_dir, &vfi, &prev_vfi))
+		if (vma_get_mapfile(file_path, vma_area, map_files_dir, &vfi, &prev_vfi))
 			goto err_bogus_mapfile;
 
 		if (r == 'r')
@@ -437,6 +453,8 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list)
 		}
 
 		if (vma_area->e->status != 0) {
+			if (vma_area->e->status & VMA_AREA_AIORING)
+				vma_area_list->nr_aios++;
 			continue;
 		} else if (!strcmp(file_path, "[vsyscall]") ||
 			   !strcmp(file_path, "[vectors]")) {
diff --git a/protobuf/mm.proto b/protobuf/mm.proto
index 1556b602d..de2ff7436 100644
--- a/protobuf/mm.proto
+++ b/protobuf/mm.proto
@@ -1,5 +1,11 @@
 import "vma.proto";
 
+message aio_ring_entry {
+	required uint64	id		= 1;
+	required uint32	nr_req		= 2;
+	required uint32	ring_len	= 3;
+}
+
 message mm_entry {
 	required uint64	mm_start_code	= 1;
 	required uint64	mm_end_code	= 2;
@@ -19,4 +25,5 @@ message mm_entry {
 	repeated vma_entry	vmas		= 14;
 
 	optional int32		dumpable	= 15;
+	repeated aio_ring_entry	aios		= 16;
 }