From 08c204820f4f2facfad83bfa3611fc6ae3e33170 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Fri, 19 Dec 2014 16:01:54 +0300 Subject: [PATCH] aio: Dump AIO rings When an AIO context is set up, the kernel does two things: 1. creates an in-kernel aioctx object 2. maps a ring into process memory The second step gives us all the information we need about how the AIO context was set up. So, in order to dump a context, we need to pick its ring in memory and get all the information we need from it. One thing to note -- we cannot dump tasks if there are any AIO requests pending. So we also need to go to the parasite and check that the ring is empty. Signed-off-by: Pavel Emelyanov --- Makefile.crtools | 1 + aio.c | 120 +++++++++++++++++++++++++++++++++++++++++++++ cr-dump.c | 14 ++++++ include/aio.h | 9 ++++ include/image.h | 1 + include/parasite.h | 12 +++++ include/vma.h | 4 ++ parasite-syscall.c | 2 + pie/parasite.c | 70 ++++++++++++++++++++++++++ proc_parse.c | 34 ++++++++++--- protobuf/mm.proto | 7 +++ 11 files changed, 266 insertions(+), 8 deletions(-) create mode 100644 aio.c create mode 100644 include/aio.h diff --git a/Makefile.crtools b/Makefile.crtools index 9156fe55d..650b9b0c3 100644 --- a/Makefile.crtools +++ b/Makefile.crtools @@ -61,6 +61,7 @@ obj-y += kerndat.o obj-y += stats.o obj-y += cgroup.o obj-y += timerfd.o +obj-y += aio.o obj-y += string.o obj-y += sigframe.o ifeq ($(VDSO),y) diff --git a/aio.c b/aio.c new file mode 100644 index 000000000..86cb0c67c --- /dev/null +++ b/aio.c @@ -0,0 +1,120 @@ +#include +#include +#include +#include "vma.h" +#include "xmalloc.h" +#include "aio.h" +#include "parasite.h" +#include "parasite-syscall.h" +#include "protobuf/mm.pb-c.h" + +int dump_aio_ring(MmEntry *mme, struct vma_area *vma) +{ + int nr = mme->n_aios; + AioRingEntry *re; + + pr_info("Dumping AIO ring @%#lx, %u reqs\n", + vma->e->start, vma->aio_nr_req); + + mme->aios = xrealloc(mme->aios, (nr + 1) * sizeof(re)); + if (!mme->aios) + return -1; + + re = xmalloc(sizeof(*re)); + if
(!re) + return -1; + + aio_ring_entry__init(re); + re->id = vma->e->start; + re->nr_req = vma->aio_nr_req; + re->ring_len = vma->e->end - vma->e->start; + mme->aios[nr] = re; + mme->n_aios = nr + 1; + return 0; +} + +void free_aios(MmEntry *mme) +{ + int i; + + if (mme->aios) { + for (i = 0; i < mme->n_aios; i++) + xfree(mme->aios[i]); + xfree(mme->aios); + } +} + +static unsigned int aio_estimate_nr_reqs(unsigned int k_max_reqs) +{ + /* + * The kernel does + * + * nr_reqs = max(nr_reqs, nr_cpus * 4) + * nr_reqs *= 2 + * nr_reqs += 2 + * ring = roundup(sizeof(head) + nr_reqs * sizeof(req)) + * nr_reqs = (ring - sizeof(head)) / sizeof(req) + * + * And the k_max_reqs here is the resulting value. + * + * We need to get the initial nr_reqs that would grow + * back up to the k_max_reqs. + */ + + return (k_max_reqs - 2) / 2; +} + +unsigned long aio_rings_args_size(struct vm_area_list *vmas) +{ + return sizeof(struct parasite_check_aios_args) + + vmas->nr_aios * sizeof(struct parasite_aio); +} + +int parasite_check_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas) +{ + struct vma_area *vma; + struct parasite_check_aios_args *aa; + struct parasite_aio *pa; + int i; + + if (!vmas->nr_aios) + return 0; + + pr_info("Checking AIO rings\n"); + + /* + * Go to parasite and + * a) check that no requests are currently pending + * b) get the maximum number of requests kernel handles + * to estimate what was the user request on ring + * creation.
+ */ + + aa = parasite_args_s(ctl, aio_rings_args_size(vmas)); + pa = &aa->ring[0]; + list_for_each_entry(vma, &vmas->h, list) { + if (!vma_area_is(vma, VMA_AREA_AIORING)) + continue; + + pr_debug(" `- Ring #%ld @%#lx\n", + pa - &aa->ring[0], vma->e->start); + pa->ctx = vma->e->start; + pa->max_reqs = 0; + pa->vma_nr_reqs = &vma->aio_nr_req; + pa++; + } + aa->nr_rings = vmas->nr_aios; + + if (parasite_execute_daemon(PARASITE_CMD_CHECK_AIOS, ctl)) + return -1; + + pa = &aa->ring[0]; + for (i = 0; i < vmas->nr_aios; i++) { + pa = &aa->ring[i]; + *pa->vma_nr_reqs = aio_estimate_nr_reqs(pa->max_reqs); + pr_debug(" `- Ring #%d has %u reqs, estimated to %u\n", i, + pa->max_reqs, *pa->vma_nr_reqs); + } + + return 0; +} diff --git a/cr-dump.c b/cr-dump.c index 98db10f81..31afaa07c 100644 --- a/cr-dump.c +++ b/cr-dump.c @@ -72,6 +72,7 @@ #include "irmap.h" #include "sysfs_parse.h" #include "action-scripts.h" +#include "aio.h" #include "asm/dump.h" @@ -464,6 +465,12 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat, goto err; mme.vmas[i++] = vma; + + if (vma_entry_is(vma, VMA_AREA_AIORING)) { + ret = dump_aio_ring(&mme, vma_area); + if (ret) + goto err; + } } mme.mm_start_code = stat->start_code; @@ -496,6 +503,7 @@ static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat, ret = pb_write_one(img_from_set(imgset, CR_FD_MM), &mme, PB_MM); xfree(mme.mm_saved_auxv); + free_aios(&mme); err: return ret; } @@ -1566,6 +1574,12 @@ static int dump_one_task(struct pstree_item *item) goto err_cure_imgset; } + ret = parasite_check_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */ + if (ret) { + pr_err("Failed to check aio rings (pid: %d)\n", pid); + goto err_cure_imgset; + } + ret = parasite_dump_misc_seized(parasite_ctl, &misc); if (ret) { pr_err("Can't dump misc (pid: %d)\n", pid); diff --git a/include/aio.h b/include/aio.h new file mode 100644 index 000000000..af7a046e7 --- /dev/null +++ b/include/aio.h @@ -0,0 +1,9 @@ +#ifndef __CR_AIO_H__ 
+#define __CR_AIO_H__ +#include "protobuf/mm.pb-c.h" +int dump_aio_ring(MmEntry *mme, struct vma_area *vma); +void free_aios(MmEntry *mme); +struct parasite_ctl; +int parasite_check_aios(struct parasite_ctl *, struct vm_area_list *); +unsigned long aio_rings_args_size(struct vm_area_list *); +#endif /* __CR_AIO_H__ */ diff --git a/include/image.h b/include/image.h index e02fa0e02..9c711c0a7 100644 --- a/include/image.h +++ b/include/image.h @@ -54,6 +54,7 @@ #define VMA_AREA_SYSVIPC (1 << 10) #define VMA_AREA_SOCKET (1 << 11) #define VMA_AREA_VVAR (1 << 12) +#define VMA_AREA_AIORING (1 << 13) #define VMA_UNSUPP (1 << 31) /* Unsupported VMA */ diff --git a/include/parasite.h b/include/parasite.h index 1ff115ff7..365e6746d 100644 --- a/include/parasite.h +++ b/include/parasite.h @@ -47,6 +47,7 @@ enum { PARASITE_CMD_GET_PROC_FD, PARASITE_CMD_DUMP_TTY, PARASITE_CMD_CHECK_VDSO_MARK, + PARASITE_CMD_CHECK_AIOS, PARASITE_CMD_MAX, }; @@ -133,6 +134,17 @@ struct parasite_dump_posix_timers_args { struct posix_timer timer[0]; }; +struct parasite_aio { + unsigned long ctx; + unsigned int max_reqs; + unsigned int *vma_nr_reqs; +}; + +struct parasite_check_aios_args { + unsigned nr_rings; + struct parasite_aio ring[0]; +}; + static inline int posix_timers_dump_size(int timer_n) { return sizeof(int) + sizeof(struct posix_timer) * timer_n; diff --git a/include/vma.h b/include/vma.h index d2ce80c47..878658cae 100644 --- a/include/vma.h +++ b/include/vma.h @@ -7,6 +7,7 @@ struct vm_area_list { struct list_head h; unsigned nr; + unsigned int nr_aios; unsigned long priv_size; /* nr of pages in private VMAs */ unsigned long longest; /* nr of pages in longest VMA */ }; @@ -35,9 +36,12 @@ struct vma_area { * The file_fd is an fd for a regular file and * the socket_id is the inode number of the * mapped (PF_PACKET) socket. + * + * The aio_nr_req is only for aio rings. 
*/ int vm_file_fd; int vm_socket_id; + unsigned int aio_nr_req; }; char *aufs_rpath; /* path from aufs root */ diff --git a/parasite-syscall.c b/parasite-syscall.c index 96c21600e..e8eeffba4 100644 --- a/parasite-syscall.c +++ b/parasite-syscall.c @@ -28,6 +28,7 @@ #include "mem.h" #include "vma.h" #include "proc_parse.h" +#include "aio.h" #include #include @@ -1193,6 +1194,7 @@ struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item, return NULL; parasite_ensure_args_size(dump_pages_args_size(vma_area_list)); + parasite_ensure_args_size(aio_rings_args_size(vma_area_list)); /* * Inject a parasite engine. Ie allocate memory inside alien diff --git a/pie/parasite.c b/pie/parasite.c index 9557c55ac..5aef2d55a 100644 --- a/pie/parasite.c +++ b/pie/parasite.c @@ -330,6 +330,73 @@ static inline int tty_ioctl(int fd, int cmd, int *arg) return 0; } +/* + * Stolen from kernel/fs/aio.c + * + * Is it valid to go to memory and check it? Should be, + * as libaio does the same. + */ + +#define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_INCOMPAT_FEATURES 0 + +struct aio_ring { + unsigned id; /* kernel internal index number */ + unsigned nr; /* number of io_events */ + unsigned head; /* Written to by userland or under ring_lock + * mutex by aio_read_events_ring(). 
*/ + unsigned tail; + + unsigned magic; + unsigned compat_features; + unsigned incompat_features; + unsigned header_length; /* size of aio_ring */ + + + /* struct io_event io_events[0]; */ +}; + +static int sane_ring(struct aio_ring *ring) +{ + return ring->magic == AIO_RING_MAGIC && + ring->compat_features == AIO_RING_COMPAT_FEATURES && + ring->incompat_features == AIO_RING_INCOMPAT_FEATURES && + ring->header_length == sizeof(struct aio_ring); +} + +static int parasite_check_aios(struct parasite_check_aios_args *args) +{ + int i; + + for (i = 0; i < args->nr_rings; i++) { + struct aio_ring *ring; + + ring = (struct aio_ring *)args->ring[i].ctx; + if (!sane_ring(ring)) { + pr_err("Not valid ring #%d\n", i); + pr_info(" `- magic %x\n", ring->magic); + pr_info(" `- cf %d\n", ring->compat_features); + pr_info(" `- if %d\n", ring->incompat_features); + pr_info(" `- size %d (%ld)\n", ring->header_length, sizeof(struct aio_ring)); + return -1; + } + + /* + * XXX what else can we do if there are requests + * in the ring? 
+ */ + if (ring->head != ring->tail) { + pr_err("Pending AIO requests in ring #%d\n", i); + return -1; + } + + args->ring[i].max_reqs = ring->nr; + } + + return 0; +} + static int parasite_dump_tty(struct parasite_tty_args *args) { int ret; @@ -541,6 +608,9 @@ static noinline __used int noinline parasite_daemon(void *args) case PARASITE_CMD_DUMP_TTY: ret = parasite_dump_tty(args); break; + case PARASITE_CMD_CHECK_AIOS: + ret = parasite_check_aios(args); + break; #ifdef CONFIG_VDSO case PARASITE_CMD_CHECK_VDSO_MARK: ret = parasite_check_vdso_mark(args); diff --git a/proc_parse.c b/proc_parse.c index 7d21eb913..3da1d9eb6 100644 --- a/proc_parse.c +++ b/proc_parse.c @@ -43,6 +43,12 @@ static char *buf = __buf.buf; #define BUF_SIZE sizeof(__buf.buf) +/* + * This is what AIO ring buffers look like in proc + */ + +#define AIO_FNAME "/[aio]" + int parse_cpuinfo_features(int (*handler)(char *tok)) { FILE *cpuinfo; @@ -191,7 +197,7 @@ static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b) (a->dev_min ^ b->dev_min)) == 0; } -static int vma_get_mapfile(struct vma_area *vma, DIR *mfd, +static int vma_get_mapfile(char *fname, struct vma_area *vma, DIR *mfd, struct vma_file_info *vfi, struct vma_file_info *prev_vfi) { char path[32]; @@ -244,13 +250,22 @@ static int vma_get_mapfile(struct vma_area *vma, DIR *mfd, if (fstatat(dirfd(mfd), path, &buf, 0)) return -1; - if (!S_ISSOCK(buf.st_mode)) - return -1; + if (S_ISSOCK(buf.st_mode)) { + pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start); + vma->vm_socket_id = buf.st_ino; + vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR; + return 0; + } - vma->vm_socket_id = buf.st_ino; - pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start); - vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR; - return 0; + if ((buf.st_mode & S_IFMT) == 0 && !strcmp(fname, AIO_FNAME)) { + /* AIO ring, let's try */ + close(vma->vm_file_fd); + vma->aio_nr_req = -1; + vma->e->status = VMA_AREA_AIORING; + return 0; + } +
+ pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname); } return -1; @@ -325,6 +340,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list) struct bfd f; vma_area_list->nr = 0; + vma_area_list->nr_aios = 0; vma_area_list->longest = 0; vma_area_list->priv_size = 0; INIT_LIST_HEAD(&vma_area_list->h); @@ -417,7 +433,7 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list) vma_area->e->pgoff = pgoff; vma_area->e->prot = PROT_NONE; - if (vma_get_mapfile(vma_area, map_files_dir, &vfi, &prev_vfi)) + if (vma_get_mapfile(file_path, vma_area, map_files_dir, &vfi, &prev_vfi)) goto err_bogus_mapfile; if (r == 'r') @@ -437,6 +453,8 @@ int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list) } if (vma_area->e->status != 0) { + if (vma_area->e->status & VMA_AREA_AIORING) + vma_area_list->nr_aios++; continue; } else if (!strcmp(file_path, "[vsyscall]") || !strcmp(file_path, "[vectors]")) { diff --git a/protobuf/mm.proto b/protobuf/mm.proto index 1556b602d..de2ff7436 100644 --- a/protobuf/mm.proto +++ b/protobuf/mm.proto @@ -1,5 +1,11 @@ import "vma.proto"; +message aio_ring_entry { + required uint64 id = 1; + required uint32 nr_req = 2; + required uint32 ring_len = 3; +} + message mm_entry { required uint64 mm_start_code = 1; required uint64 mm_end_code = 2; @@ -19,4 +25,5 @@ message mm_entry { repeated vma_entry vmas = 14; optional int32 dumpable = 15; + repeated aio_ring_entry aios = 16; }