From 55708698c5f153f4e390175cdfc395333b2eafbd Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 16 Jul 2013 17:56:12 +0800 Subject: [PATCH 01/22] fs/anon_inode: Introduce a new lib function anon_inode_getfile_private() Introduce a new lib function anon_inode_getfile_private(), it creates a new file instance by hooking it up to an anonymous inode, and a dentry that describe the "class" of the file, similar to anon_inode_getfile(), but each file holds a single inode. Furthermore, anyone who wants to create a private anon file will benefit from this change. Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/anon_inodes.c | 66 +++++++++++++++++++++++++++++++++++++ include/linux/anon_inodes.h | 3 ++ 2 files changed, 69 insertions(+) diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 47a65df8c871..85c961849953 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -108,6 +108,72 @@ static struct file_system_type anon_inode_fs_type = { .kill_sb = kill_anon_super, }; +/** + * anon_inode_getfile_private - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * + * Similar to anon_inode_getfile, but each file holds a single inode. + * + */ +struct file *anon_inode_getfile_private(const char *name, + const struct file_operations *fops, + void *priv, int flags) +{ + struct qstr this; + struct path path; + struct file *file; + struct inode *inode; + + if (fops->owner && !try_module_get(fops->owner)) + return ERR_PTR(-ENOENT); + + inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb); + if (IS_ERR(inode)) { + file = ERR_PTR(-ENOMEM); + goto err_module; + } + + /* + * Link the inode to a directory entry by creating a unique name + * using the inode sequence number. + */ + file = ERR_PTR(-ENOMEM); + this.name = name; + this.len = strlen(name); + this.hash = 0; + path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this); + if (!path.dentry) + goto err_module; + + path.mnt = mntget(anon_inode_mnt); + + d_instantiate(path.dentry, inode); + + file = alloc_file(&path, OPEN_FMODE(flags), fops); + if (IS_ERR(file)) + goto err_dput; + + file->f_mapping = inode->i_mapping; + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); + file->private_data = priv; + + return file; + +err_dput: + path_put(&path); +err_module: + module_put(fops->owner); + return file; +} +EXPORT_SYMBOL_GPL(anon_inode_getfile_private); + /** * anon_inode_getfile - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" diff --git a/include/linux/anon_inodes.h b/include/linux/anon_inodes.h index 8013a45242fe..cf573c22b81e 100644 --- a/include/linux/anon_inodes.h +++ b/include/linux/anon_inodes.h @@ -13,6 +13,9 @@ struct file_operations; struct file *anon_inode_getfile(const char *name, const struct file_operations *fops, void *priv, int flags); +struct file *anon_inode_getfile_private(const char *name, + const struct file_operations *fops, + void *priv, int flags); int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags); From 36bc08cc01709b4a9bb563b35aa530241ddc63e3 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Tue, 16 Jul 2013 17:56:16 +0800 Subject: [PATCH 02/22] fs/aio: Add support to aio ring pages migration As the aio job will pin the ring pages, that will lead to mem migrated failed. In order to fix this problem we use an anon inode to manage the aio ring pages, and setup the migratepage callback in the anon inode's address space, so that when mem migrating the aio ring pages will be moved to other mem node safely. Signed-off-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 119 ++++++++++++++++++++++++++++++++++++---- include/linux/migrate.h | 3 + mm/migrate.c | 2 +- 3 files changed, 112 insertions(+), 12 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 9b5ca1137419..cbd0afe77273 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -35,6 +35,9 @@ #include #include #include +#include +#include +#include #include #include @@ -110,6 +113,7 @@ struct kioctx { } ____cacheline_aligned_in_smp; struct page *internal_pages[AIO_RING_PAGES]; + struct file *aio_ring_file; }; /*------ sysctl variables----*/ @@ -138,15 +142,78 @@ __initcall(aio_setup); static void aio_free_ring(struct kioctx *ctx) { - long i; + int i; + struct file *aio_ring_file = ctx->aio_ring_file; - for (i = 0; i < ctx->nr_pages; i++) + for (i = 0; i < ctx->nr_pages; i++) { + pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, + page_count(ctx->ring_pages[i])); put_page(ctx->ring_pages[i]); + } if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) kfree(ctx->ring_pages); + + if (aio_ring_file) { + truncate_setsize(aio_ring_file->f_inode, 0); + pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n", + current->pid, aio_ring_file->f_inode->i_nlink, + aio_ring_file->f_path.dentry->d_count, + d_unhashed(aio_ring_file->f_path.dentry), + atomic_read(&aio_ring_file->f_inode->i_count)); + fput(aio_ring_file); + ctx->aio_ring_file = NULL; + } } +static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) +{ + vma->vm_ops = &generic_file_vm_ops; + return 0; +} + +static const struct file_operations aio_ring_fops = { + .mmap = aio_ring_mmap, +}; + +static int aio_set_page_dirty(struct page *page) +{ + return 0; +} + +static int aio_migratepage(struct address_space *mapping, struct page *new, + struct page *old, enum migrate_mode mode) +{ + struct kioctx *ctx = mapping->private_data; + unsigned long flags; + unsigned idx = old->index; + int rc; + + /* Writeback must be complete */ + BUG_ON(PageWriteback(old)); + put_page(old); + + rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); + if (rc != MIGRATEPAGE_SUCCESS) { + get_page(old); + return rc; + } + + get_page(new); + + spin_lock_irqsave(&ctx->completion_lock, flags); + migrate_page_copy(new, old); + ctx->ring_pages[idx] = new; + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + return rc; +} + +static const struct address_space_operations aio_ctx_aops = { + .set_page_dirty = aio_set_page_dirty, + .migratepage = aio_migratepage, +}; + static int aio_setup_ring(struct kioctx *ctx) { struct aio_ring *ring; @@ -154,20 +221,45 @@ static int aio_setup_ring(struct kioctx *ctx) struct mm_struct *mm = current->mm; unsigned long size, populate; int nr_pages; + int i; + struct file *file; /* Compensate for the ring buffer's head/tail overlap entry */ nr_events += 2; /* 1 is required, 2 for good luck */ size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; - nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; + nr_pages = PFN_UP(size); if (nr_pages < 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); + file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR); + if (IS_ERR(file)) { + ctx->aio_ring_file = NULL; + return -EAGAIN; + } + + file->f_inode->i_mapping->a_ops = &aio_ctx_aops; + file->f_inode->i_mapping->private_data = ctx; + file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages; + + for (i = 0; i < nr_pages; i++) { + struct page *page; + page = find_or_create_page(file->f_inode->i_mapping, + i, GFP_HIGHUSER | __GFP_ZERO); + if (!page) + break; + pr_debug("pid(%d) page[%d]->count=%d\n", + current->pid, i, page_count(page)); + SetPageUptodate(page); + SetPageDirty(page); + unlock_page(page); + } + ctx->aio_ring_file = file; + nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) + / sizeof(struct io_event); - ctx->nr_events = 0; ctx->ring_pages = ctx->internal_pages; if (nr_pages > AIO_RING_PAGES) { ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), @@ -178,28 +270,31 @@ static int aio_setup_ring(struct kioctx *ctx) ctx->mmap_size = nr_pages * PAGE_SIZE; pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); + down_write(&mm->mmap_sem); - ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, - PROT_READ|PROT_WRITE, - MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); + ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, 0, &populate); if (IS_ERR((void *)ctx->mmap_base)) { up_write(&mm->mmap_sem); ctx->mmap_size = 0; aio_free_ring(ctx); return -EAGAIN; } + up_write(&mm->mmap_sem); + + mm_populate(ctx->mmap_base, populate); pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, 1, 0, ctx->ring_pages, NULL); - up_write(&mm->mmap_sem); + for (i = 0; i < ctx->nr_pages; i++) + put_page(ctx->ring_pages[i]); if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; } - if (populate) - mm_populate(ctx->mmap_base, populate); ctx->user_id = ctx->mmap_base; ctx->nr_events = nr_events; /* trusted copy */ @@ -399,6 +494,8 @@ out_cleanup: err = -EAGAIN; aio_free_ring(ctx); out_freectx: + if (ctx->aio_ring_file) + fput(ctx->aio_ring_file); kmem_cache_free(kioctx_cachep, ctx); pr_debug("error allocating ioctx %d\n", err); return ERR_PTR(err); diff --git a/include/linux/migrate.h b/include/linux/migrate.h index a405d3dc0f61..c407d88f5979 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -55,6 +55,9 @@ extern int migrate_vmas(struct mm_struct *mm, extern void migrate_page_copy(struct page *newpage, struct page *page); extern int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page); +extern int migrate_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page, + struct buffer_head *head, enum migrate_mode mode); #else static inline void putback_lru_pages(struct list_head *l) {} diff --git a/mm/migrate.c b/mm/migrate.c index 6f0c24438bba..1da0092561a4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -307,7 +307,7 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head, * 2 for pages with a mapping * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ -static int migrate_page_move_mapping(struct address_space *mapping, +int migrate_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page, struct buffer_head *head, enum migrate_mode mode) { From 0c45355fc7c48c82db151bf0e7ca305d513e639e Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Wed, 17 Jul 2013 09:34:24 -0400 Subject: [PATCH 03/22] aio: fix build when migration is disabled When "fs/aio: Add support to aio ring pages migration" was applied, it broke the build when CONFIG_MIGRATION was disabled. Wrap the migration code with a test for CONFIG_MIGRATION to fix this and save a few bytes when migration is disabled. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/aio.c b/fs/aio.c index cbd0afe77273..dedeea01e4e4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -181,6 +181,7 @@ static int aio_set_page_dirty(struct page *page) return 0; } +#if IS_ENABLED(CONFIG_MIGRATION) static int aio_migratepage(struct address_space *mapping, struct page *new, struct page *old, enum migrate_mode mode) { @@ -208,10 +209,13 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, return rc; } +#endif static const struct address_space_operations aio_ctx_aops = { .set_page_dirty = aio_set_page_dirty, +#if IS_ENABLED(CONFIG_MIGRATION) .migratepage = aio_migratepage, +#endif }; static int aio_setup_ring(struct kioctx *ctx) From 34e83fc618085e00dc9803286c581f51966673bd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Apr 2013 10:58:39 +1000 Subject: [PATCH 04/22] aio: reqs_active -> reqs_available The number of outstanding kiocbs is one of the few shared things left that has to be touched for every kiocb - it'd be nice to make it percpu. We can make it per cpu by treating it like an allocation problem: we have a maximum number of kiocbs that can be outstanding (i.e. slots) - then we just allocate and free slots, and we know how to write per cpu allocators. So as prep work for that, we convert reqs_active to reqs_available. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Benjamin LaHaise --- fs/aio.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index dedeea01e4e4..0e23dfa77b0e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -94,7 +94,13 @@ struct kioctx { struct work_struct rcu_work; struct { - atomic_t reqs_active; + /* + * This counts the number of available slots in the ringbuffer, + * so we avoid overflowing it: it's decremented (if positive) + * when allocating a kiocb and incremented when the resulting + * io_event is pulled off the ringbuffer. + */ + atomic_t reqs_available; } ____cacheline_aligned_in_smp; struct { @@ -404,19 +410,20 @@ static void free_ioctx(struct kioctx *ctx) head = ring->head; kunmap_atomic(ring); - while (atomic_read(&ctx->reqs_active) > 0) { + while (atomic_read(&ctx->reqs_available) < ctx->nr_events - 1) { wait_event(ctx->wait, - head != ctx->tail || - atomic_read(&ctx->reqs_active) <= 0); + (head != ctx->tail) || + (atomic_read(&ctx->reqs_available) >= + ctx->nr_events - 1)); avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; - atomic_sub(avail, &ctx->reqs_active); + atomic_add(avail, &ctx->reqs_available); head += avail; head %= ctx->nr_events; } - WARN_ON(atomic_read(&ctx->reqs_active) < 0); + WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); aio_free_ring(ctx); @@ -475,6 +482,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (aio_setup_ring(ctx) < 0) goto out_freectx; + atomic_set(&ctx->reqs_available, ctx->nr_events - 1); + /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + nr_events > aio_max_nr || @@ -586,7 +595,7 @@ void exit_aio(struct mm_struct *mm) "exit_aio:ioctx still alive: %d %d %d\n", atomic_read(&ctx->users), atomic_read(&ctx->dead), - atomic_read(&ctx->reqs_active)); + atomic_read(&ctx->reqs_available)); /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -615,12 +624,9 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (atomic_read(&ctx->reqs_active) >= ctx->nr_events) + if (atomic_dec_if_positive(&ctx->reqs_available) <= 0) return NULL; - if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1) - goto out_put; - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); if (unlikely(!req)) goto out_put; @@ -630,7 +636,7 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) return req; out_put: - atomic_dec(&ctx->reqs_active); + atomic_inc(&ctx->reqs_available); return NULL; } @@ -701,7 +707,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) /* * Take rcu_read_lock() in case the kioctx is being destroyed, as we - * need to issue a wakeup after decrementing reqs_active. + * need to issue a wakeup after incrementing reqs_available. */ rcu_read_lock(); @@ -719,7 +725,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) */ if (unlikely(xchg(&iocb->ki_cancel, KIOCB_CANCELLED) == KIOCB_CANCELLED)) { - atomic_dec(&ctx->reqs_active); + atomic_inc(&ctx->reqs_available); /* Still need the wake_up in case free_ioctx is waiting */ goto put_rq; } @@ -857,7 +863,7 @@ static long aio_read_events_ring(struct kioctx *ctx, pr_debug("%li h%u t%u\n", ret, head, ctx->tail); - atomic_sub(ret, &ctx->reqs_active); + atomic_add(ret, &ctx->reqs_available); out: mutex_unlock(&ctx->ring_lock); @@ -1241,7 +1247,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, aio_put_req(req); /* drop extra ref to req */ return 0; out_put_req: - atomic_dec(&ctx->reqs_active); + atomic_inc(&ctx->reqs_available); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; From e1bdd5f27a5b14e24a658d5511bebceb67679d83 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 26 Apr 2013 10:58:39 +1000 Subject: [PATCH 05/22] aio: percpu reqs_available See the previous patch ("aio: reqs_active -> reqs_available") for why we want to do this - this basically implements a per cpu allocator for reqs_available that doesn't actually allocate anything. Note that we need to increase the size of the ringbuffer we allocate, since a single thread won't necessarily be able to use all the reqs_available slots - some (up to about half) might be on other per cpu lists, unavailable for the current thread. We size the ringbuffer based on the nr_events userspace passed to io_setup(), so this is a slight behaviour change - but nr_events wasn't being used as a hard limit before, it was being rounded up to the next page before so this doesn't change the actual semantics. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Benjamin LaHaise --- fs/aio.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 7 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 0e23dfa77b0e..bb1a6c433110 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -64,6 +65,10 @@ struct aio_ring { #define AIO_RING_PAGES 8 +struct kioctx_cpu { + unsigned reqs_available; +}; + struct kioctx { atomic_t users; atomic_t dead; @@ -72,6 +77,13 @@ struct kioctx { unsigned long user_id; struct hlist_node list; + struct __percpu kioctx_cpu *cpu; + + /* + * For percpu reqs_available, number of slots we move to/from global + * counter at a time: + */ + unsigned req_batch; /* * This is what userspace passed to io_setup(), it's not used for * anything but counting against the global max_reqs quota. @@ -99,6 +111,8 @@ struct kioctx { * so we avoid overflowing it: it's decremented (if positive) * when allocating a kiocb and incremented when the resulting * io_event is pulled off the ringbuffer. + * + * We batch accesses to it with a percpu version. */ atomic_t reqs_available; } ____cacheline_aligned_in_smp; @@ -379,6 +393,8 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, static void free_ioctx_rcu(struct rcu_head *head) { struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); + + free_percpu(ctx->cpu); kmem_cache_free(kioctx_cachep, ctx); } @@ -392,7 +408,7 @@ static void free_ioctx(struct kioctx *ctx) struct aio_ring *ring; struct io_event res; struct kiocb *req; - unsigned head, avail; + unsigned cpu, head, avail; spin_lock_irq(&ctx->ctx_lock); @@ -406,6 +422,13 @@ static void free_ioctx(struct kioctx *ctx) spin_unlock_irq(&ctx->ctx_lock); + for_each_possible_cpu(cpu) { + struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu); + + atomic_add(kcpu->reqs_available, &ctx->reqs_available); + kcpu->reqs_available = 0; + } + ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; kunmap_atomic(ring); @@ -454,6 +477,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) struct kioctx *ctx; int err = -ENOMEM; + /* + * We keep track of the number of available ringbuffer slots, to prevent + * overflow (reqs_available), and we also use percpu counters for this. + * + * So since up to half the slots might be on other cpu's percpu counters + * and unavailable, double nr_events so userspace sees what they + * expected: additionally, we move req_batch slots to/from percpu + * counters at a time, so make sure that isn't 0: + */ + nr_events = max(nr_events, num_possible_cpus() * 4); + nr_events *= 2; + /* Prevent overflows */ if ((nr_events > (0x10000000U / sizeof(struct io_event))) || (nr_events > (0x10000000U / sizeof(struct kiocb)))) { @@ -479,10 +514,16 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (aio_setup_ring(ctx) < 0) + ctx->cpu = alloc_percpu(struct kioctx_cpu); + if (!ctx->cpu) goto out_freectx; + if (aio_setup_ring(ctx) < 0) + goto out_freepcpu; + atomic_set(&ctx->reqs_available, ctx->nr_events - 1); + ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); + BUG_ON(!ctx->req_batch); /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); @@ -506,6 +547,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) out_cleanup: err = -EAGAIN; aio_free_ring(ctx); +out_freepcpu: + free_percpu(ctx->cpu); out_freectx: if (ctx->aio_ring_file) fput(ctx->aio_ring_file); @@ -610,6 +653,52 @@ void exit_aio(struct mm_struct *mm) } } +static void put_reqs_available(struct kioctx *ctx, unsigned nr) +{ + struct kioctx_cpu *kcpu; + + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); + + kcpu->reqs_available += nr; + while (kcpu->reqs_available >= ctx->req_batch * 2) { + kcpu->reqs_available -= ctx->req_batch; + atomic_add(ctx->req_batch, &ctx->reqs_available); + } + + preempt_enable(); +} + +static bool get_reqs_available(struct kioctx *ctx) +{ + struct kioctx_cpu *kcpu; + bool ret = false; + + preempt_disable(); + kcpu = this_cpu_ptr(ctx->cpu); + + if (!kcpu->reqs_available) { + int old, avail = atomic_read(&ctx->reqs_available); + + do { + if (avail < ctx->req_batch) + goto out; + + old = avail; + avail = atomic_cmpxchg(&ctx->reqs_available, + avail, avail - ctx->req_batch); + } while (avail != old); + + kcpu->reqs_available += ctx->req_batch; + } + + ret = true; + kcpu->reqs_available--; +out: + preempt_enable(); + return ret; +} + /* aio_get_req * Allocate a slot for an aio request. Increments the ki_users count * of the kioctx so that the kioctx stays around until all requests are @@ -624,7 +713,7 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { struct kiocb *req; - if (atomic_dec_if_positive(&ctx->reqs_available) <= 0) + if (!get_reqs_available(ctx)) return NULL; req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO); @@ -633,10 +722,9 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) atomic_set(&req->ki_users, 2); req->ki_ctx = ctx; - return req; out_put: - atomic_inc(&ctx->reqs_available); + put_reqs_available(ctx, 1); return NULL; } @@ -725,6 +813,10 @@ void aio_complete(struct kiocb *iocb, long res, long res2) */ if (unlikely(xchg(&iocb->ki_cancel, KIOCB_CANCELLED) == KIOCB_CANCELLED)) { + /* + * Can't use the percpu reqs_available here - could race with + * free_ioctx() + */ atomic_inc(&ctx->reqs_available); /* Still need the wake_up in case free_ioctx is waiting */ goto put_rq; @@ -863,7 +955,7 @@ static long aio_read_events_ring(struct kioctx *ctx, pr_debug("%li h%u t%u\n", ret, head, ctx->tail); - atomic_add(ret, &ctx->reqs_available); + put_reqs_available(ctx, ret); out: mutex_unlock(&ctx->ring_lock); @@ -1247,7 +1339,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, aio_put_req(req); /* drop extra ref to req */ return 0; out_put_req: - atomic_inc(&ctx->reqs_available); + put_reqs_available(ctx, 1); aio_put_req(req); /* drop extra ref to req */ aio_put_req(req); /* drop i/o ref to req */ return ret; From 723be6e39d14254bb5bb9f422b434566d359fa6e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 28 May 2013 15:14:48 -0700 Subject: [PATCH 06/22] aio: percpu ioctx refcount This just converts the ioctx refcount to the new generic dynamic percpu refcount code. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Reviewed-by: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Benjamin LaHaise --- fs/aio.c | 66 +++++++++++++++++++++++--------------------------------- 1 file changed, 27 insertions(+), 39 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index bb1a6c433110..7b470bfbf891 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -70,7 +71,7 @@ struct kioctx_cpu { }; struct kioctx { - atomic_t users; + struct percpu_ref users; atomic_t dead; /* This needs improving */ @@ -103,7 +104,7 @@ struct kioctx { long nr_pages; struct rcu_head rcu_head; - struct work_struct rcu_work; + struct work_struct free_work; struct { /* @@ -403,8 +404,9 @@ static void free_ioctx_rcu(struct rcu_head *head) * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted - * now it's safe to cancel any that need to be. */ -static void free_ioctx(struct kioctx *ctx) +static void free_ioctx(struct work_struct *work) { + struct kioctx *ctx = container_of(work, struct kioctx, free_work); struct aio_ring *ring; struct io_event res; struct kiocb *req; @@ -462,10 +464,12 @@ static void free_ioctx(struct kioctx *ctx) call_rcu(&ctx->rcu_head, free_ioctx_rcu); } -static void put_ioctx(struct kioctx *ctx) +static void free_ioctx_ref(struct percpu_ref *ref) { - if (unlikely(atomic_dec_and_test(&ctx->users))) - free_ioctx(ctx); + struct kioctx *ctx = container_of(ref, struct kioctx, users); + + INIT_WORK(&ctx->free_work, free_ioctx); + schedule_work(&ctx->free_work); } /* ioctx_alloc @@ -505,8 +509,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->max_reqs = nr_events; - atomic_set(&ctx->users, 2); - atomic_set(&ctx->dead, 0); + if (percpu_ref_init(&ctx->users, free_ioctx_ref)) + goto out_freectx; + spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); mutex_init(&ctx->ring_lock); @@ -516,7 +521,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->cpu = alloc_percpu(struct kioctx_cpu); if (!ctx->cpu) - goto out_freectx; + goto out_freeref; if (aio_setup_ring(ctx) < 0) goto out_freepcpu; @@ -535,6 +540,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) aio_nr += ctx->max_reqs; spin_unlock(&aio_nr_lock); + percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + /* now link into global list. */ spin_lock(&mm->ioctx_lock); hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); @@ -549,6 +556,8 @@ out_cleanup: aio_free_ring(ctx); out_freepcpu: free_percpu(ctx->cpu); +out_freeref: + free_percpu(ctx->users.pcpu_count); out_freectx: if (ctx->aio_ring_file) fput(ctx->aio_ring_file); @@ -557,22 +566,6 @@ out_freectx: return ERR_PTR(err); } -static void kill_ioctx_work(struct work_struct *work) -{ - struct kioctx *ctx = container_of(work, struct kioctx, rcu_work); - - wake_up_all(&ctx->wait); - put_ioctx(ctx); -} - -static void kill_ioctx_rcu(struct rcu_head *head) -{ - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head); - - INIT_WORK(&ctx->rcu_work, kill_ioctx_work); - schedule_work(&ctx->rcu_work); -} - /* kill_ioctx * Cancels all outstanding aio requests on an aio context. Used * when the processes owning a context have all exited to encourage @@ -582,6 +575,8 @@ static void kill_ioctx(struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { hlist_del_rcu(&ctx->list); + /* percpu_ref_kill() will do the necessary call_rcu() */ + wake_up_all(&ctx->wait); /* * It'd be more correct to do this in free_ioctx(), after all @@ -598,8 +593,7 @@ static void kill_ioctx(struct kioctx *ctx) if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); - /* Between hlist_del_rcu() and dropping the initial ref */ - call_rcu(&ctx->rcu_head, kill_ioctx_rcu); + percpu_ref_kill(&ctx->users); } } @@ -633,12 +627,6 @@ void exit_aio(struct mm_struct *mm) struct hlist_node *n; hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { - if (1 != atomic_read(&ctx->users)) - printk(KERN_DEBUG - "exit_aio:ioctx still alive: %d %d %d\n", - atomic_read(&ctx->users), - atomic_read(&ctx->dead), - atomic_read(&ctx->reqs_available)); /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -757,7 +745,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { if (ctx->user_id == ctx_id) { - atomic_inc(&ctx->users); + percpu_ref_get(&ctx->users); ret = ctx; break; } @@ -1054,7 +1042,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) ret = put_user(ioctx->user_id, ctxp); if (ret) kill_ioctx(ioctx); - put_ioctx(ioctx); + percpu_ref_put(&ioctx->users); } out: @@ -1072,7 +1060,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { kill_ioctx(ioctx); - put_ioctx(ioctx); + percpu_ref_put(&ioctx->users); return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); @@ -1394,7 +1382,7 @@ long do_io_submit(aio_context_t ctx_id, long nr, } blk_finish_plug(&plug); - put_ioctx(ctx); + percpu_ref_put(&ctx->users); return i ? i : ret; } @@ -1483,7 +1471,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, ret = -EFAULT; } - put_ioctx(ctx); + percpu_ref_put(&ctx->users); return ret; } @@ -1512,7 +1500,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, if (likely(ioctx)) { if (likely(min_nr <= nr && min_nr >= 0)) ret = read_events(ioctx, min_nr, nr, events, timeout); - put_ioctx(ioctx); + percpu_ref_put(&ioctx->users); } return ret; } From bec68faaf3ba74ed0dcd5dc3a881b30aec542973 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 May 2013 14:45:08 -0700 Subject: [PATCH 07/22] aio: io_cancel() no longer returns the io_event Originally, io_event() was documented to return the io_event if cancellation succeeded - the io_event wouldn't be delivered via the ring buffer like it normally would. But this isn't what the implementation was actually doing; the only driver implementing cancellation, the usb gadget code, never returned an io_event in its cancel function. And aio_complete() was recently changed to no longer suppress event delivery if the kiocb had been cancelled. This gets rid of the unused io_event argument to kiocb_cancel() and kiocb->ki_cancel(), and changes io_cancel() to return -EINPROGRESS if kiocb->ki_cancel() returned success. Also tweak the refcounting in kiocb_cancel() to make more sense. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Signed-off-by: Benjamin LaHaise --- drivers/usb/gadget/inode.c | 3 +-- fs/aio.c | 40 ++++++++++---------------------------- include/linux/aio.h | 2 +- 3 files changed, 12 insertions(+), 33 deletions(-) diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index 570c005062ab..e02c1e04529e 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -524,7 +524,7 @@ struct kiocb_priv { unsigned actual; }; -static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e) +static int ep_aio_cancel(struct kiocb *iocb) { struct kiocb_priv *priv = iocb->private; struct ep_data *epdata; @@ -540,7 +540,6 @@ static int ep_aio_cancel(struct kiocb *iocb, struct io_event *e) // spin_unlock(&epdata->dev->lock); local_irq_enable(); - aio_put_req(iocb); return value; } diff --git a/fs/aio.c b/fs/aio.c index 7b470bfbf891..12b37689dd2c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -358,8 +358,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, - struct io_event *res) +static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; int ret = -EINVAL; @@ -381,12 +380,10 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb, atomic_inc(&kiocb->ki_users); spin_unlock_irq(&ctx->ctx_lock); - memset(res, 0, sizeof(*res)); - res->obj = (u64)(unsigned long)kiocb->ki_obj.user; - res->data = kiocb->ki_user_data; - ret = cancel(kiocb, res); + ret = cancel(kiocb); spin_lock_irq(&ctx->ctx_lock); + aio_put_req(kiocb); return ret; } @@ -408,7 +405,6 @@ static void free_ioctx(struct work_struct *work) { struct kioctx *ctx = container_of(work, struct kioctx, free_work); struct aio_ring *ring; - struct io_event res; struct kiocb *req; unsigned cpu, head, avail; @@ -419,7 +415,7 @@ static void free_ioctx(struct work_struct *work) struct kiocb, ki_list); list_del_init(&req->ki_list); - kiocb_cancel(ctx, req, &res); + kiocb_cancel(ctx, req); } spin_unlock_irq(&ctx->ctx_lock); @@ -795,21 +791,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) spin_unlock_irqrestore(&ctx->ctx_lock, flags); } - /* - * cancelled requests don't get events, userland was given one - * when the event got cancelled. - */ - if (unlikely(xchg(&iocb->ki_cancel, - KIOCB_CANCELLED) == KIOCB_CANCELLED)) { - /* - * Can't use the percpu reqs_available here - could race with - * free_ioctx() - */ - atomic_inc(&ctx->reqs_available); - /* Still need the wake_up in case free_ioctx is waiting */ - goto put_rq; - } - /* * Add a completion event to the ring buffer. Must be done holding * ctx->completion_lock to prevent other code from messing with the tail @@ -862,7 +843,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) if (iocb->ki_eventfd != NULL) eventfd_signal(iocb->ki_eventfd, 1); -put_rq: /* everything turned out well, dispose of the aiocb. */ aio_put_req(iocb); @@ -1439,7 +1419,6 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { - struct io_event res; struct kioctx *ctx; struct kiocb *kiocb; u32 key; @@ -1457,18 +1436,19 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, kiocb = lookup_kiocb(ctx, iocb, key); if (kiocb) - ret = kiocb_cancel(ctx, kiocb, &res); + ret = kiocb_cancel(ctx, kiocb); else ret = -EINVAL; spin_unlock_irq(&ctx->ctx_lock); if (!ret) { - /* Cancellation succeeded -- copy the result - * into the user's buffer. + /* + * The result argument is no longer used - the io_event is + * always delivered via the ring buffer. -EINPROGRESS indicates + * cancellation is progress: */ - if (copy_to_user(result, &res, sizeof(res))) - ret = -EFAULT; + ret = -EINPROGRESS; } percpu_ref_put(&ctx->users); diff --git a/include/linux/aio.h b/include/linux/aio.h index 1bdf965339f9..8c8dd1d84d2e 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -27,7 +27,7 @@ struct kiocb; */ #define KIOCB_CANCELLED ((void *) (~0ULL)) -typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *); +typedef int (kiocb_cancel_fn)(struct kiocb *); struct kiocb { atomic_t ki_users; From 5ffac122dbda89fbb29885f35a5d47b0edb8936d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 May 2013 15:36:07 -0700 Subject: [PATCH 08/22] aio: Don't use ctx->tail unnecessarily aio_complete() (arguably) needs to keep its own trusted copy of the tail pointer, but io_getevents() doesn't have to use it - it's already using the head pointer from the ring buffer. So convert it to use the tail from the ring buffer so it touches fewer cachelines and doesn't contend with the cacheline aio_complete() needs. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Signed-off-by: Benjamin LaHaise --- fs/aio.c | 42 +++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 12b37689dd2c..57b02791d04e 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -406,7 +406,8 @@ static void free_ioctx(struct work_struct *work) struct kioctx *ctx = container_of(work, struct kioctx, free_work); struct aio_ring *ring; struct kiocb *req; - unsigned cpu, head, avail; + unsigned cpu, avail; + DEFINE_WAIT(wait); spin_lock_irq(&ctx->ctx_lock); @@ -427,22 +428,24 @@ static void free_ioctx(struct work_struct *work) kcpu->reqs_available = 0; } - ring = kmap_atomic(ctx->ring_pages[0]); - head = ring->head; - kunmap_atomic(ring); + while (1) { + prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE); - while (atomic_read(&ctx->reqs_available) < ctx->nr_events - 1) { - wait_event(ctx->wait, - (head != ctx->tail) || - (atomic_read(&ctx->reqs_available) >= - ctx->nr_events - 1)); - - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; + ring = kmap_atomic(ctx->ring_pages[0]); + avail = (ring->head <= ring->tail) + ? ring->tail - ring->head + : ctx->nr_events - ring->head + ring->tail; atomic_add(avail, &ctx->reqs_available); - head += avail; - head %= ctx->nr_events; + ring->head = ring->tail; + kunmap_atomic(ring); + + if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1) + break; + + schedule(); } + finish_wait(&ctx->wait, &wait); WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1); @@ -869,7 +872,7 @@ static long aio_read_events_ring(struct kioctx *ctx, struct io_event __user *event, long nr) { struct aio_ring *ring; - unsigned head, pos; + unsigned head, tail, pos; long ret = 0; int copy_ret; @@ -877,11 +880,12 @@ static long aio_read_events_ring(struct kioctx *ctx, ring = kmap_atomic(ctx->ring_pages[0]); head = ring->head; + tail = ring->tail; kunmap_atomic(ring); - pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events); + pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events); - if (head == ctx->tail) + if (head == tail) goto out; while (ret < nr) { @@ -889,8 +893,8 @@ static long aio_read_events_ring(struct kioctx *ctx, struct io_event *ev; struct page *page; - avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head; - if (head == ctx->tail) + avail = (head <= tail ? tail : ctx->nr_events) - head; + if (head == tail) break; avail = min(avail, nr - ret); @@ -921,7 +925,7 @@ static long aio_read_events_ring(struct kioctx *ctx, kunmap_atomic(ring); flush_dcache_page(ctx->ring_pages[0]); - pr_debug("%li h%u t%u\n", ret, head, ctx->tail); + pr_debug("%li h%u t%u\n", ret, head, tail); put_reqs_available(ctx, ret); out: From 73a7075e3f6ec63dc359064eea6fd84f406cf2a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 9 May 2013 15:03:42 -0700 Subject: [PATCH 09/22] aio: Kill aio_rw_vect_retry() This code doesn't serve any purpose anymore, since the aio retry infrastructure has been removed. This change should be safe because aio_read/write are also used for synchronous IO, and called from do_sync_read()/do_sync_write() - and there's no looping done in the sync case (the read and write syscalls). Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Signed-off-by: Benjamin LaHaise --- drivers/staging/android/logger.c | 2 +- drivers/usb/gadget/inode.c | 6 +-- fs/aio.c | 91 +++++++------------------------- fs/block_dev.c | 2 +- fs/nfs/direct.c | 1 - fs/ocfs2/file.c | 6 +-- fs/read_write.c | 3 -- fs/udf/file.c | 2 +- include/linux/aio.h | 2 - mm/page_io.c | 1 - net/socket.c | 2 +- 11 files changed, 28 insertions(+), 90 deletions(-) diff --git a/drivers/staging/android/logger.c b/drivers/staging/android/logger.c index 080abf2faf97..d72b47195ecf 100644 --- a/drivers/staging/android/logger.c +++ b/drivers/staging/android/logger.c @@ -481,7 +481,7 @@ static ssize_t logger_aio_write(struct kiocb *iocb, const struct iovec *iov, header.sec = now.tv_sec; header.nsec = now.tv_nsec; header.euid = current_euid(); - header.len = min_t(size_t, iocb->ki_left, LOGGER_ENTRY_MAX_PAYLOAD); + header.len = min_t(size_t, iocb->ki_nbytes, LOGGER_ENTRY_MAX_PAYLOAD); header.hdr_size = sizeof(struct logger_entry); /* null writes succeed, return zero */ diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index e02c1e04529e..f255ad7f4c74 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -708,11 +708,11 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov, if (unlikely(usb_endpoint_dir_in(&epdata->desc))) return -EINVAL; - buf = kmalloc(iocb->ki_left, GFP_KERNEL); + buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL); if (unlikely(!buf)) return -ENOMEM; - return ep_aio_rwtail(iocb, buf, iocb->ki_left, epdata, iov, nr_segs); + return ep_aio_rwtail(iocb, buf, iocb->ki_nbytes, epdata, iov, nr_segs); } static ssize_t @@ -727,7 +727,7 @@ ep_aio_write(struct kiocb *iocb, const struct iovec *iov, if (unlikely(!usb_endpoint_dir_in(&epdata->desc))) return -EINVAL; - buf = kmalloc(iocb->ki_left, GFP_KERNEL); + buf = kmalloc(iocb->ki_nbytes, GFP_KERNEL); if (unlikely(!buf)) return -ENOMEM; diff --git a/fs/aio.c b/fs/aio.c index 57b02791d04e..d00904db69b7 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -707,7 +707,7 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) goto out_put; - atomic_set(&req->ki_users, 2); + atomic_set(&req->ki_users, 1); req->ki_ctx = ctx; return req; out_put: @@ -1051,75 +1051,9 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) return -EINVAL; } -static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret) -{ - struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg]; - - BUG_ON(ret <= 0); - - while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) { - ssize_t this = min((ssize_t)iov->iov_len, ret); - iov->iov_base += this; - iov->iov_len -= this; - iocb->ki_left -= this; - ret -= this; - if (iov->iov_len == 0) { - iocb->ki_cur_seg++; - iov++; - } - } - - /* the caller should not have done more io than what fit in - * the remaining iovecs */ - BUG_ON(ret > 0 && iocb->ki_left == 0); -} - typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, unsigned long, loff_t); -static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret = 0; - - /* This matches the pread()/pwrite() logic */ - if (iocb->ki_pos < 0) - return -EINVAL; - - if (rw == WRITE) - file_start_write(file); - do { - ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg], - iocb->ki_nr_segs - iocb->ki_cur_seg, - iocb->ki_pos); - if (ret > 0) - aio_advance_iovec(iocb, ret); - - /* retry all partial writes. retry partial reads as long as its a - * regular file. */ - } while (ret > 0 && iocb->ki_left > 0 && - (rw == WRITE || - (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode)))); - if (rw == WRITE) - file_end_write(file); - - /* This means we must have transferred all that we could */ - /* No need to retry anymore */ - if ((ret == 0) || (iocb->ki_left == 0)) - ret = iocb->ki_nbytes - iocb->ki_left; - - /* If we managed to write some out we return that, rather than - * the eventual error. */ - if (rw == WRITE - && ret < 0 && ret != -EIOCBQUEUED - && iocb->ki_nbytes - iocb->ki_left) - ret = iocb->ki_nbytes - iocb->ki_left; - - return ret; -} - static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) { ssize_t ret; @@ -1204,9 +1138,22 @@ rw_common: return ret; req->ki_nbytes = ret; - req->ki_left = ret; - ret = aio_rw_vect_retry(req, rw, rw_op); + /* XXX: move/kill - rw_verify_area()? */ + /* This matches the pread()/pwrite() logic */ + if (req->ki_pos < 0) { + ret = -EINVAL; + break; + } + + if (rw == WRITE) + file_start_write(file); + + ret = rw_op(req, req->ki_iovec, + req->ki_nr_segs, req->ki_pos); + + if (rw == WRITE) + file_end_write(file); break; case IOCB_CMD_FDSYNC: @@ -1301,19 +1248,17 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_pos = iocb->aio_offset; req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; - req->ki_left = req->ki_nbytes = iocb->aio_nbytes; + req->ki_nbytes = iocb->aio_nbytes; req->ki_opcode = iocb->aio_lio_opcode; ret = aio_run_iocb(req, compat); if (ret) goto out_put_req; - aio_put_req(req); /* drop extra ref to req */ return 0; out_put_req: put_reqs_available(ctx, 1); - aio_put_req(req); /* drop extra ref to req */ - aio_put_req(req); /* drop i/o ref to req */ + aio_put_req(req); return ret; } diff --git a/fs/block_dev.c b/fs/block_dev.c index c7bda5cd3da7..8772b155cb30 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1542,7 +1542,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov, return 0; size -= pos; - if (size < iocb->ki_left) + if (size < iocb->ki_nbytes) nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size); return generic_file_aio_read(iocb, iov, nr_segs, pos); } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0bd7a55a5f07..91ff089d3412 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -130,7 +130,6 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_ return -EINVAL; #else - VM_BUG_ON(iocb->ki_left != PAGE_SIZE); VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); if (rw == READ || rw == KERNEL_READ) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 41000f223ca4..dd1a4901a54b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2245,7 +2245,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)nr_segs); - if (iocb->ki_left == 0) + if (iocb->ki_nbytes == 0) return 0; appending = file->f_flags & O_APPEND ? 1 : 0; @@ -2296,7 +2296,7 @@ relock: can_do_direct = direct_io; ret = ocfs2_prepare_inode_for_write(file, ppos, - iocb->ki_left, appending, + iocb->ki_nbytes, appending, &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); @@ -2304,7 +2304,7 @@ relock: } if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left, + unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, *ppos); /* diff --git a/fs/read_write.c b/fs/read_write.c index 122a3846d9e1..e3cd280b158c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -367,7 +367,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_left = len; kiocb.ki_nbytes = len; ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); @@ -417,7 +416,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_left = len; kiocb.ki_nbytes = len; ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); @@ -599,7 +597,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_left = len; kiocb.ki_nbytes = len; ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); diff --git a/fs/udf/file.c b/fs/udf/file.c index 29569dd08168..c02a27a19c6d 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -141,7 +141,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); int err, pos; - size_t count = iocb->ki_left; + size_t count = iocb->ki_nbytes; struct udf_inode_info *iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); diff --git a/include/linux/aio.h b/include/linux/aio.h index 8c8dd1d84d2e..7bb766e73968 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -50,11 +50,9 @@ struct kiocb { unsigned short ki_opcode; size_t ki_nbytes; /* copy of iocb->aio_nbytes */ char __user *ki_buf; /* remaining iocb->aio_buf */ - size_t ki_left; /* remaining bytes */ struct iovec ki_inline_vec; /* inline vector */ struct iovec *ki_iovec; unsigned long ki_nr_segs; - unsigned long ki_cur_seg; struct list_head ki_list; /* the aio core uses this * for cancellation */ diff --git a/mm/page_io.c b/mm/page_io.c index ba05b64e5d8d..8c79a4764be0 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -266,7 +266,6 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); - kiocb.ki_left = PAGE_SIZE; kiocb.ki_nbytes = PAGE_SIZE; set_page_writeback(page); diff --git a/net/socket.c b/net/socket.c index 829b460acb87..fea902f2ba58 100644 --- a/net/socket.c +++ b/net/socket.c @@ -931,7 +931,7 @@ static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, if (pos != 0) return -ESPIPE; - if (iocb->ki_left == 0) /* Match SYS5 behaviour */ + if (iocb->ki_nbytes == 0) /* Match SYS5 behaviour */ return 0; From 8bc92afcf7f5c598001dd04e62d88f57f6e89e51 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 25 Feb 2013 16:36:27 -0800 Subject: [PATCH 10/22] aio: Kill unneeded kiocb members The old aio retry infrastucture needed to save the various arguments to to aio operations. But with the retry infrastructure gone, we can trim struct kiocb quite a bit. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Cc: Theodore Ts'o Signed-off-by: Benjamin LaHaise --- fs/aio.c | 69 ++++++++++++++++++++++++++------------------- include/linux/aio.h | 11 ++------ 2 files changed, 42 insertions(+), 38 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index d00904db69b7..09fe1f334631 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -723,8 +723,6 @@ static void kiocb_free(struct kiocb *req) eventfd_ctx_put(req->ki_eventfd); if (req->ki_dtor) req->ki_dtor(req); - if (req->ki_iovec != &req->ki_inline_vec) - kfree(req->ki_iovec); kmem_cache_free(kiocb_cachep, req); } @@ -1054,24 +1052,26 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, unsigned long, loff_t); -static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) +static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, + int rw, char __user *buf, + unsigned long *nr_segs, + struct iovec **iovec, + bool compat) { ssize_t ret; - kiocb->ki_nr_segs = kiocb->ki_nbytes; + *nr_segs = kiocb->ki_nbytes; #ifdef CONFIG_COMPAT if (compat) ret = compat_rw_copy_check_uvector(rw, - (struct compat_iovec __user *)kiocb->ki_buf, - kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + (struct compat_iovec __user *)buf, + *nr_segs, 1, *iovec, iovec); else #endif ret = rw_copy_check_uvector(rw, - (struct iovec __user *)kiocb->ki_buf, - kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec, - &kiocb->ki_iovec); + (struct iovec __user *)buf, + *nr_segs, 1, *iovec, iovec); if (ret < 0) return ret; @@ -1080,15 +1080,17 @@ static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat) return 0; } -static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) +static ssize_t aio_setup_single_vector(struct kiocb *kiocb, + int rw, char __user *buf, + unsigned long *nr_segs, + struct iovec *iovec) { - if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes))) + if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) return -EFAULT; - kiocb->ki_iovec = &kiocb->ki_inline_vec; - kiocb->ki_iovec->iov_base = kiocb->ki_buf; - kiocb->ki_iovec->iov_len = kiocb->ki_nbytes; - kiocb->ki_nr_segs = 1; + iovec->iov_base = buf; + iovec->iov_len = kiocb->ki_nbytes; + *nr_segs = 1; return 0; } @@ -1097,15 +1099,18 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb) * Performs the initial checks and aio retry method * setup for the kiocb at the time of io submission. */ -static ssize_t aio_run_iocb(struct kiocb *req, bool compat) +static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, + char __user *buf, bool compat) { struct file *file = req->ki_filp; ssize_t ret; + unsigned long nr_segs; int rw; fmode_t mode; aio_rw_op *rw_op; + struct iovec inline_vec, *iovec = &inline_vec; - switch (req->ki_opcode) { + switch (opcode) { case IOCB_CMD_PREAD: case IOCB_CMD_PREADV: mode = FMODE_READ; @@ -1126,16 +1131,21 @@ rw_common: if (!rw_op) return -EINVAL; - ret = (req->ki_opcode == IOCB_CMD_PREADV || - req->ki_opcode == IOCB_CMD_PWRITEV) - ? aio_setup_vectored_rw(rw, req, compat) - : aio_setup_single_vector(rw, req); + ret = (opcode == IOCB_CMD_PREADV || + opcode == IOCB_CMD_PWRITEV) + ? aio_setup_vectored_rw(req, rw, buf, &nr_segs, + &iovec, compat) + : aio_setup_single_vector(req, rw, buf, &nr_segs, + iovec); if (ret) return ret; ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); - if (ret < 0) + if (ret < 0) { + if (iovec != &inline_vec) + kfree(iovec); return ret; + } req->ki_nbytes = ret; @@ -1149,8 +1159,7 @@ rw_common: if (rw == WRITE) file_start_write(file); - ret = rw_op(req, req->ki_iovec, - req->ki_nr_segs, req->ki_pos); + ret = rw_op(req, iovec, nr_segs, req->ki_pos); if (rw == WRITE) file_end_write(file); @@ -1175,6 +1184,9 @@ rw_common: return -EINVAL; } + if (iovec != &inline_vec) + kfree(iovec); + if (ret != -EIOCBQUEUED) { /* * There's no easy way to restart the syscall since other AIO's @@ -1246,12 +1258,11 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_obj.user = user_iocb; req->ki_user_data = iocb->aio_data; req->ki_pos = iocb->aio_offset; - - req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf; req->ki_nbytes = iocb->aio_nbytes; - req->ki_opcode = iocb->aio_lio_opcode; - ret = aio_run_iocb(req, compat); + ret = aio_run_iocb(req, iocb->aio_lio_opcode, + (char __user *)(unsigned long)iocb->aio_buf, + compat); if (ret) goto out_put_req; diff --git a/include/linux/aio.h b/include/linux/aio.h index 7bb766e73968..b570472355d1 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -36,6 +36,7 @@ struct kiocb { struct kioctx *ki_ctx; /* NULL for sync ops */ kiocb_cancel_fn *ki_cancel; void (*ki_dtor)(struct kiocb *); + void *private; union { void __user *user; @@ -44,15 +45,7 @@ struct kiocb { __u64 ki_user_data; /* user's data for completion */ loff_t ki_pos; - - void *private; - /* State that we remember to be able to restart/retry */ - unsigned short ki_opcode; - size_t ki_nbytes; /* copy of iocb->aio_nbytes */ - char __user *ki_buf; /* remaining iocb->aio_buf */ - struct iovec ki_inline_vec; /* inline vector */ - struct iovec *ki_iovec; - unsigned long ki_nr_segs; + size_t ki_nbytes; /* copy of iocb->aio_nbytes */ struct list_head ki_list; /* the aio core uses this * for cancellation */ From 57282d8fd744072d6d6f18fa6ebe3cc1149015bf Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 13 May 2013 13:42:52 -0700 Subject: [PATCH 11/22] aio: Kill ki_users The kiocb refcount is only needed for cancellation - to ensure a kiocb isn't freed while a ki_cancel callback is running. But if we restrict ki_cancel callbacks to not block (which they currently don't), we can simply drop the refcount. Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Cc: Theodore Ts'o Signed-off-by: Benjamin LaHaise --- fs/aio.c | 47 ++++++++++++--------------------------------- include/linux/aio.h | 5 ----- 2 files changed, 12 insertions(+), 40 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 09fe1f334631..69a608a43760 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -361,7 +361,6 @@ EXPORT_SYMBOL(kiocb_set_cancel_fn); static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; - int ret = -EINVAL; /* * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it @@ -371,21 +370,13 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb) cancel = ACCESS_ONCE(kiocb->ki_cancel); do { if (!cancel || cancel == KIOCB_CANCELLED) - return ret; + return -EINVAL; old = cancel; cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); } while (cancel != old); - atomic_inc(&kiocb->ki_users); - spin_unlock_irq(&ctx->ctx_lock); - - ret = cancel(kiocb); - - spin_lock_irq(&ctx->ctx_lock); - aio_put_req(kiocb); - - return ret; + return cancel(kiocb); } static void free_ioctx_rcu(struct rcu_head *head) @@ -599,16 +590,16 @@ static void kill_ioctx(struct kioctx *ctx) /* wait_on_sync_kiocb: * Waits on the given sync kiocb to complete. */ -ssize_t wait_on_sync_kiocb(struct kiocb *iocb) +ssize_t wait_on_sync_kiocb(struct kiocb *req) { - while (atomic_read(&iocb->ki_users)) { + while (!req->ki_ctx) { set_current_state(TASK_UNINTERRUPTIBLE); - if (!atomic_read(&iocb->ki_users)) + if (req->ki_ctx) break; io_schedule(); } __set_current_state(TASK_RUNNING); - return iocb->ki_user_data; + return req->ki_user_data; } EXPORT_SYMBOL(wait_on_sync_kiocb); @@ -687,14 +678,8 @@ out: } /* aio_get_req - * Allocate a slot for an aio request. Increments the ki_users count - * of the kioctx so that the kioctx stays around until all requests are - * complete. Returns NULL if no requests are free. - * - * Returns with kiocb->ki_users set to 2. The io submit code path holds - * an extra reference while submitting the i/o. - * This prevents races between the aio code path referencing the - * req (after submitting it) and aio_complete() freeing the req. + * Allocate a slot for an aio request. + * Returns NULL if no requests are free. */ static inline struct kiocb *aio_get_req(struct kioctx *ctx) { @@ -707,7 +692,6 @@ static inline struct kiocb *aio_get_req(struct kioctx *ctx) if (unlikely(!req)) goto out_put; - atomic_set(&req->ki_users, 1); req->ki_ctx = ctx; return req; out_put: @@ -726,13 +710,6 @@ static void kiocb_free(struct kiocb *req) kmem_cache_free(kiocb_cachep, req); } -void aio_put_req(struct kiocb *req) -{ - if (atomic_dec_and_test(&req->ki_users)) - kiocb_free(req); -} -EXPORT_SYMBOL(aio_put_req); - static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct mm_struct *mm = current->mm; @@ -771,9 +748,9 @@ void aio_complete(struct kiocb *iocb, long res, long res2) * - the sync task helpfully left a reference to itself in the iocb */ if (is_sync_kiocb(iocb)) { - BUG_ON(atomic_read(&iocb->ki_users) != 1); iocb->ki_user_data = res; - atomic_set(&iocb->ki_users, 0); + smp_wmb(); + iocb->ki_ctx = ERR_PTR(-EXDEV); wake_up_process(iocb->ki_obj.tsk); return; } @@ -845,7 +822,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) eventfd_signal(iocb->ki_eventfd, 1); /* everything turned out well, dispose of the aiocb. */ - aio_put_req(iocb); + kiocb_free(iocb); /* * We have to order our ring_info tail store above and test @@ -1269,7 +1246,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, return 0; out_put_req: put_reqs_available(ctx, 1); - aio_put_req(req); + kiocb_free(req); return ret; } diff --git a/include/linux/aio.h b/include/linux/aio.h index b570472355d1..c4f07ffa1cbb 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -30,8 +30,6 @@ struct kiocb; typedef int (kiocb_cancel_fn)(struct kiocb *); struct kiocb { - atomic_t ki_users; - struct file *ki_filp; struct kioctx *ki_ctx; /* NULL for sync ops */ kiocb_cancel_fn *ki_cancel; @@ -65,7 +63,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { - .ki_users = ATOMIC_INIT(1), .ki_ctx = NULL, .ki_filp = filp, .ki_obj.tsk = current, @@ -75,7 +72,6 @@ static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) /* prototypes */ #ifdef CONFIG_AIO extern ssize_t wait_on_sync_kiocb(struct kiocb *iocb); -extern void aio_put_req(struct kiocb *iocb); extern void aio_complete(struct kiocb *iocb, long res, long res2); struct mm_struct; extern void exit_aio(struct mm_struct *mm); @@ -84,7 +80,6 @@ extern long do_io_submit(aio_context_t ctx_id, long nr, void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; } -static inline void aio_put_req(struct kiocb *iocb) { } static inline void aio_complete(struct kiocb *iocb, long res, long res2) { } struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } From d29c445b635b3a03cf683cafcbae58a4ec1e1125 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 18 Mar 2013 11:09:26 -0700 Subject: [PATCH 12/22] aio: Kill ki_dtor sock_aio_dtor() is dead code - and stuff that does need to do cleanup can simply do it before calling aio_complete(). Signed-off-by: Kent Overstreet Cc: Zach Brown Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Mark Fasheh Cc: Joel Becker Cc: Rusty Russell Cc: Jens Axboe Cc: Asai Thambi S P Cc: Selvan Mani Cc: Sam Bradshaw Cc: Jeff Moyer Cc: Al Viro Cc: Benjamin LaHaise Cc: Theodore Ts'o Signed-off-by: Benjamin LaHaise --- fs/aio.c | 2 -- include/linux/aio.h | 1 - net/socket.c | 13 ++----------- 3 files changed, 2 insertions(+), 14 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 69a608a43760..e46b1195191b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -705,8 +705,6 @@ static void kiocb_free(struct kiocb *req) fput(req->ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); - if (req->ki_dtor) - req->ki_dtor(req); kmem_cache_free(kiocb_cachep, req); } diff --git a/include/linux/aio.h b/include/linux/aio.h index c4f07ffa1cbb..d9c92daa3944 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -33,7 +33,6 @@ struct kiocb { struct file *ki_filp; struct kioctx *ki_ctx; /* NULL for sync ops */ kiocb_cancel_fn *ki_cancel; - void (*ki_dtor)(struct kiocb *); void *private; union { diff --git a/net/socket.c b/net/socket.c index fea902f2ba58..06a082e0e49c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -854,11 +854,6 @@ int kernel_recvmsg(struct socket *sock, struct msghdr *msg, } EXPORT_SYMBOL(kernel_recvmsg); -static void sock_aio_dtor(struct kiocb *iocb) -{ - kfree(iocb->private); -} - static ssize_t sock_sendpage(struct file *file, struct page *page, int offset, size_t size, loff_t *ppos, int more) { @@ -889,12 +884,8 @@ static ssize_t sock_splice_read(struct file *file, loff_t *ppos, static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, struct sock_iocb *siocb) { - if (!is_sync_kiocb(iocb)) { - siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); - if (!siocb) - return NULL; - iocb->ki_dtor = sock_aio_dtor; - } + if (!is_sync_kiocb(iocb)) + BUG(); siocb->kiocb = iocb; iocb->private = siocb; From 4cd81c3dfc4a34e4a0b6fa577860077c8e5b13af Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 30 Jul 2013 12:06:37 -0400 Subject: [PATCH 13/22] aio: double aio_max_nr in calculations With the changes to use percpu counters for aio event ring size calculation, existing increases to aio_max_nr are now insufficient to allow for the allocation of enough events. Double the value used for aio_max_nr to account for the doubling introduced by the percpu slack. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index e46b1195191b..945dd0d072f3 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -490,7 +490,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-EINVAL); } - if (!nr_events || (unsigned long)nr_events > aio_max_nr) + if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL)) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); @@ -522,7 +522,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); - if (aio_nr + nr_events > aio_max_nr || + if (aio_nr + nr_events > (aio_max_nr * 2UL) || aio_nr + nr_events < aio_nr) { spin_unlock(&aio_nr_lock); goto out_cleanup; From db446a08c23d5475e6b08c87acca79ebb20f283c Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 30 Jul 2013 12:54:40 -0400 Subject: [PATCH 14/22] aio: convert the ioctx list to table lookup v3 On Wed, Jun 12, 2013 at 11:14:40AM -0700, Kent Overstreet wrote: > On Mon, Apr 15, 2013 at 02:40:55PM +0300, Octavian Purdila wrote: > > When using a large number of threads performing AIO operations the > > IOCTX list may get a significant number of entries which will cause > > significant overhead. For example, when running this fio script: > > > > rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1 > > blocksize=1024; numjobs=512; thread; loops=100 > > > > on an EXT2 filesystem mounted on top of a ramdisk we can observe up to > > 30% CPU time spent by lookup_ioctx: > > > > 32.51% [guest.kernel] [g] lookup_ioctx > > 9.19% [guest.kernel] [g] __lock_acquire.isra.28 > > 4.40% [guest.kernel] [g] lock_release > > 4.19% [guest.kernel] [g] sched_clock_local > > 3.86% [guest.kernel] [g] local_clock > > 3.68% [guest.kernel] [g] native_sched_clock > > 3.08% [guest.kernel] [g] sched_clock_cpu > > 2.64% [guest.kernel] [g] lock_release_holdtime.part.11 > > 2.60% [guest.kernel] [g] memcpy > > 2.33% [guest.kernel] [g] lock_acquired > > 2.25% [guest.kernel] [g] lock_acquire > > 1.84% [guest.kernel] [g] do_io_submit > > > > This patchs converts the ioctx list to a radix tree. For a performance > > comparison the above FIO script was run on a 2 sockets 8 core > > machine. This are the results (average and %rsd of 10 runs) for the > > original list based implementation and for the radix tree based > > implementation: > > > > cores 1 2 4 8 16 32 > > list 109376 ms 69119 ms 35682 ms 22671 ms 19724 ms 16408 ms > > %rsd 0.69% 1.15% 1.17% 1.21% 1.71% 1.43% > > radix 73651 ms 41748 ms 23028 ms 16766 ms 15232 ms 13787 ms > > %rsd 1.19% 0.98% 0.69% 1.13% 0.72% 0.75% > > % of radix > > relative 66.12% 65.59% 66.63% 72.31% 77.26% 83.66% > > to list > > > > To consider the impact of the patch on the typical case of having > > only one ctx per process the following FIO script was run: > > > > rw=randrw; size=100m ;directory=/mnt/fio; ioengine=libaio; iodepth=1 > > blocksize=1024; numjobs=1; thread; loops=100 > > > > on the same system and the results are the following: > > > > list 58892 ms > > %rsd 0.91% > > radix 59404 ms > > %rsd 0.81% > > % of radix > > relative 100.87% > > to list > > So, I was just doing some benchmarking/profiling to get ready to send > out the aio patches I've got for 3.11 - and it looks like your patch is > causing a ~1.5% throughput regression in my testing :/ ... I've got an alternate approach for fixing this wart in lookup_ioctx()... Instead of using an rbtree, just use the reserved id in the ring buffer header to index an array pointing the ioctx. It's not finished yet, and it needs to be tidied up, but is most of the way there. -ben -- "Thought is the essence of where you are now." -- kmo> And, a rework of Ben's code, but this was entirely his idea kmo> -Kent bcrl> And fix the code to use the right mm_struct in kill_ioctx(), actually free memory. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 136 ++++++++++++++++++++++++++++++++------- include/linux/mm_types.h | 5 +- kernel/fork.c | 2 +- 3 files changed, 118 insertions(+), 25 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 945dd0d072f3..52f200ebef07 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -66,6 +66,12 @@ struct aio_ring { #define AIO_RING_PAGES 8 +struct kioctx_table { + struct rcu_head rcu; + unsigned nr; + struct kioctx *table[]; +}; + struct kioctx_cpu { unsigned reqs_available; }; @@ -74,9 +80,7 @@ struct kioctx { struct percpu_ref users; atomic_t dead; - /* This needs improving */ unsigned long user_id; - struct hlist_node list; struct __percpu kioctx_cpu *cpu; @@ -135,6 +139,8 @@ struct kioctx { struct page *internal_pages[AIO_RING_PAGES]; struct file *aio_ring_file; + + unsigned id; }; /*------ sysctl variables----*/ @@ -326,7 +332,7 @@ static int aio_setup_ring(struct kioctx *ctx) ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ - ring->id = ctx->user_id; + ring->id = ~0U; ring->head = ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; @@ -462,6 +468,58 @@ static void free_ioctx_ref(struct percpu_ref *ref) schedule_work(&ctx->free_work); } +static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) +{ + unsigned i, new_nr; + struct kioctx_table *table, *old; + struct aio_ring *ring; + + spin_lock(&mm->ioctx_lock); + table = rcu_dereference(mm->ioctx_table); + + while (1) { + if (table) + for (i = 0; i < table->nr; i++) + if (!table->table[i]) { + ctx->id = i; + table->table[i] = ctx; + spin_unlock(&mm->ioctx_lock); + + ring = kmap_atomic(ctx->ring_pages[0]); + ring->id = ctx->id; + kunmap_atomic(ring); + return 0; + } + + new_nr = (table ? table->nr : 1) * 4; + + spin_unlock(&mm->ioctx_lock); + + table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * + new_nr, GFP_KERNEL); + if (!table) + return -ENOMEM; + + table->nr = new_nr; + + spin_lock(&mm->ioctx_lock); + old = rcu_dereference(mm->ioctx_table); + + if (!old) { + rcu_assign_pointer(mm->ioctx_table, table); + } else if (table->nr > old->nr) { + memcpy(table->table, old->table, + old->nr * sizeof(struct kioctx *)); + + rcu_assign_pointer(mm->ioctx_table, table); + kfree_rcu(old, rcu); + } else { + kfree(table); + table = old; + } + } +} + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -520,6 +578,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); BUG_ON(!ctx->req_batch); + err = ioctx_add_table(ctx, mm); + if (err) + goto out_cleanup_noerr; + /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + nr_events > (aio_max_nr * 2UL) || @@ -532,17 +594,13 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ - /* now link into global list. */ - spin_lock(&mm->ioctx_lock); - hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); - spin_unlock(&mm->ioctx_lock); - pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; out_cleanup: err = -EAGAIN; +out_cleanup_noerr: aio_free_ring(ctx); out_freepcpu: free_percpu(ctx->cpu); @@ -561,10 +619,18 @@ out_freectx: * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct kioctx *ctx) +static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); + struct kioctx_table *table; + + spin_lock(&mm->ioctx_lock); + table = rcu_dereference(mm->ioctx_table); + + WARN_ON(ctx != table->table[ctx->id]); + table->table[ctx->id] = NULL; + spin_unlock(&mm->ioctx_lock); + /* percpu_ref_kill() will do the necessary call_rcu() */ wake_up_all(&ctx->wait); @@ -613,10 +679,28 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); */ void exit_aio(struct mm_struct *mm) { + struct kioctx_table *table; struct kioctx *ctx; - struct hlist_node *n; + unsigned i = 0; + + while (1) { + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + do { + if (!table || i >= table->nr) { + rcu_read_unlock(); + rcu_assign_pointer(mm->ioctx_table, NULL); + if (table) + kfree(table); + return; + } + + ctx = table->table[i++]; + } while (!ctx); + + rcu_read_unlock(); - hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -627,7 +711,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(ctx); + kill_ioctx(mm, ctx); } } @@ -710,19 +794,27 @@ static void kiocb_free(struct kiocb *req) static struct kioctx *lookup_ioctx(unsigned long ctx_id) { + struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; + struct kioctx_table *table; + unsigned id; + + if (get_user(id, &ring->id)) + return NULL; rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); - hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - if (ctx->user_id == ctx_id) { - percpu_ref_get(&ctx->users); - ret = ctx; - break; - } - } + if (!table || id >= table->nr) + goto out; + ctx = table->table[id]; + if (ctx->user_id == ctx_id) { + percpu_ref_get(&ctx->users); + ret = ctx; + } +out: rcu_read_unlock(); return ret; } @@ -998,7 +1090,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(ioctx); + kill_ioctx(current->mm, ioctx); percpu_ref_put(&ioctx->users); } @@ -1016,7 +1108,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(ioctx); + kill_ioctx(current->mm, ioctx); percpu_ref_put(&ioctx->users); return 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fb425aa16c01..da8cf5cc1aa6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -322,6 +322,7 @@ struct mm_rss_stat { atomic_long_t count[NR_MM_COUNTERS]; }; +struct kioctx_table; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -382,8 +383,8 @@ struct mm_struct { struct core_state *core_state; /* coredumping support */ #ifdef CONFIG_AIO - spinlock_t ioctx_lock; - struct hlist_head ioctx_list; + spinlock_t ioctx_lock; + struct kioctx_table __rcu *ioctx_table; #endif #ifdef CONFIG_MM_OWNER /* diff --git a/kernel/fork.c b/kernel/fork.c index 66635c80a813..db5f541c5488 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -522,7 +522,7 @@ static void mm_init_aio(struct mm_struct *mm) { #ifdef CONFIG_AIO spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); + mm->ioctx_table = NULL; #endif } From 6878ea72a5d1aa6caae86449975a50b7fe9abed5 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Wed, 31 Jul 2013 10:34:18 -0400 Subject: [PATCH 15/22] aio: be defensive to ensure request batching is non-zero instead of BUG_ON() In the event that an overflow/underflow occurs while calculating req_batch, clamp the minimum at 1 request instead of doing a BUG_ON(). Signed-off-by: Benjamin LaHaise --- fs/aio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index 52f200ebef07..588aff9dc316 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -576,7 +576,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(&ctx->reqs_available, ctx->nr_events - 1); ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); - BUG_ON(!ctx->req_batch); + if (ctx->req_batch < 1) + ctx->req_batch = 1; err = ioctx_add_table(ctx, mm); if (err) From da90382c2ec367aac88ff6aa76afb659ee0e4235 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Mon, 5 Aug 2013 13:21:43 -0400 Subject: [PATCH 16/22] aio: fix error handling and rcu usage in "convert the ioctx list to table lookup v3" In the patch "aio: convert the ioctx list to table lookup v3", incorrect handling in the ioctx_alloc() error path was introduced that lead to an ioctx being added via ioctx_add_table() while freed when the ioctx_alloc() call returned -EAGAIN due to hitting the aio_max_nr limit. Fix this by only calling ioctx_add_table() as the last step in ioctx_alloc(). Also, several unnecessary rcu_dereference() calls were added that lead to RCU warnings where the system was already protected by a spin lock for accessing mm->ioctx_table. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 588aff9dc316..3bc068c1cb9c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -475,7 +475,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) struct aio_ring *ring; spin_lock(&mm->ioctx_lock); - table = rcu_dereference(mm->ioctx_table); + table = mm->ioctx_table; while (1) { if (table) @@ -503,7 +503,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) table->nr = new_nr; spin_lock(&mm->ioctx_lock); - old = rcu_dereference(mm->ioctx_table); + old = mm->ioctx_table; if (!old) { rcu_assign_pointer(mm->ioctx_table, table); @@ -579,10 +579,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (ctx->req_batch < 1) ctx->req_batch = 1; - err = ioctx_add_table(ctx, mm); - if (err) - goto out_cleanup_noerr; - /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + nr_events > (aio_max_nr * 2UL) || @@ -595,13 +591,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ + err = ioctx_add_table(ctx, mm); + if (err) + goto out_cleanup_put; + pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; +out_cleanup_put: + percpu_ref_put(&ctx->users); out_cleanup: err = -EAGAIN; -out_cleanup_noerr: aio_free_ring(ctx); out_freepcpu: free_percpu(ctx->cpu); @@ -626,7 +627,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) struct kioctx_table *table; spin_lock(&mm->ioctx_lock); - table = rcu_dereference(mm->ioctx_table); + table = mm->ioctx_table; WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; From 0bdd5ca553ebc15060dd72684710e0a0af4481b4 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Tue, 6 Aug 2013 20:31:01 +0800 Subject: [PATCH 17/22] staging/lustre: kiocb->ki_left is removed We also missed ki_nbytes... Cc: Kent Overstreet Cc: Andreas Dilger Cc: Greg Kroah-Hartman Signed-off-by: Peng Tao Signed-off-by: Benjamin LaHaise --- drivers/staging/lustre/lustre/llite/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index ed1e3f7b4e58..444710b33ad5 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -1027,7 +1027,7 @@ static ssize_t ll_file_read(struct file *file, char *buf, size_t count, local_iov->iov_len = count; init_sync_kiocb(kiocb, file); kiocb->ki_pos = *ppos; - kiocb->ki_left = count; + kiocb->ki_nbytes = count; result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos); *ppos = kiocb->ki_pos; @@ -1088,7 +1088,7 @@ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, local_iov->iov_len = count; init_sync_kiocb(kiocb, file); kiocb->ki_pos = *ppos; - kiocb->ki_left = count; + kiocb->ki_nbytes = count; result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos); *ppos = kiocb->ki_pos; From f30d704fe1244c44a984d3d1f47bc648bcc6c9f7 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Wed, 7 Aug 2013 18:23:48 -0400 Subject: [PATCH 18/22] aio: table lookup: verify ctx pointer Another shortcoming of the table lookup patch was revealed where the pointer was not being tested before being dereferenced. Verify this to avoid the NULL pointer dereference. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/aio.c b/fs/aio.c index 3bc068c1cb9c..c3f005dc2d55 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -812,7 +812,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) goto out; ctx = table->table[id]; - if (ctx->user_id == ctx_id) { + if (ctx && ctx->user_id == ctx_id) { percpu_ref_get(&ctx->users); ret = ctx; } From 79bd1bcf1ab22ea723da7d5854a9e72a350ecbf8 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Fri, 30 Aug 2013 10:22:04 -0400 Subject: [PATCH 19/22] aio: remove unnecessary debugging from aio_free_ring() The commit 36bc08cc0170 ("fs/aio: Add support to aio ring pages migration") added some debugging code that is not required and resulted in a build error when 98474236f72e ("vfs: make the dentry cache use the lockref infrastructure") was added to the tree. The code is not required, so just delete it. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index c3f005dc2d55..d0defcb0a493 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -183,11 +183,6 @@ static void aio_free_ring(struct kioctx *ctx) if (aio_ring_file) { truncate_setsize(aio_ring_file->f_inode, 0); - pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n", - current->pid, aio_ring_file->f_inode->i_nlink, - aio_ring_file->f_path.dentry->d_count, - d_unhashed(aio_ring_file->f_path.dentry), - atomic_read(&aio_ring_file->f_inode->i_count)); fput(aio_ring_file); ctx->aio_ring_file = NULL; } From 77d30b14d24e557f89c41980011d72428514d729 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Fri, 30 Aug 2013 11:12:50 -0400 Subject: [PATCH 20/22] aio: fix rcu sparse warnings introduced by ioctx table lookup patch Sseveral sparse warnings were caused by missing rcu_dereference() annotations for dereferencing mm->ioctx_table. Thankfully, none of those were actual bugs as the deref was protected by a spin lock in all instances. Signed-off-by: Benjamin LaHaise Reported-by: Fengguang Wu --- fs/aio.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index d0defcb0a493..6e267553604c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -470,7 +470,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) struct aio_ring *ring; spin_lock(&mm->ioctx_lock); - table = mm->ioctx_table; + table = rcu_dereference(mm->ioctx_table); while (1) { if (table) @@ -498,7 +498,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) table->nr = new_nr; spin_lock(&mm->ioctx_lock); - old = mm->ioctx_table; + old = rcu_dereference(mm->ioctx_table); if (!old) { rcu_assign_pointer(mm->ioctx_table, table); @@ -622,7 +622,7 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) struct kioctx_table *table; spin_lock(&mm->ioctx_lock); - table = mm->ioctx_table; + table = rcu_dereference(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; From d6c355c7dabcd753a75bc77d150d36328a355267 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Mon, 9 Sep 2013 11:57:59 -0400 Subject: [PATCH 21/22] aio: fix race in ring buffer page lookup introduced by page migration support Prior to the introduction of page migration support in "fs/aio: Add support to aio ring pages migration" / 36bc08cc01709b4a9bb563b35aa530241ddc63e3, mapping of the ring buffer pages was done via get_user_pages() while retaining mmap_sem held for write. This avoided possible races with userland racing an munmap() or mremap(). The page migration patch, however, switched to using mm_populate() to prime the page mapping. mm_populate() cannot be called with mmap_sem held. Instead of dropping the mmap_sem, revert to the old behaviour and simply drop the use of mm_populate() since get_user_pages() will cause the pages to get mapped anyways. Thanks to Al Viro for spotting this issue. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 6e267553604c..f4a27af6c9ac 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -307,16 +307,25 @@ static int aio_setup_ring(struct kioctx *ctx) aio_free_ring(ctx); return -EAGAIN; } - up_write(&mm->mmap_sem); - - mm_populate(ctx->mmap_base, populate); pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); + + /* We must do this while still holding mmap_sem for write, as we + * need to be protected against userspace attempting to mremap() + * or munmap() the ring buffer. + */ ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, 1, 0, ctx->ring_pages, NULL); + + /* Dropping the reference here is safe as the page cache will hold + * onto the pages for us. It is also required so that page migration + * can unmap the pages and get the right reference count. + */ for (i = 0; i < ctx->nr_pages; i++) put_page(ctx->ring_pages[i]); + up_write(&mm->mmap_sem); + if (unlikely(ctx->nr_pages != nr_pages)) { aio_free_ring(ctx); return -EAGAIN; From d9b2c8714aef102dea95544a8cd9372b21af463f Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Mon, 9 Sep 2013 12:29:35 -0400 Subject: [PATCH 22/22] aio: rcu_read_lock protection for new rcu_dereference calls Patch "aio: fix rcu sparse warnings introduced by ioctx table lookup patch" (77d30b14d24e557f89c41980011d72428514d729 in linux-next.git) introduced a couple of new rcu_dereference calls which are not protected by rcu_read_lock and result in following warnings during syscall fuzzing(trinity): [ 471.646379] =============================== [ 471.649727] [ INFO: suspicious RCU usage. ] [ 471.653919] 3.11.0-next-20130906+ #496 Not tainted [ 471.657792] ------------------------------- [ 471.661235] fs/aio.c:503 suspicious rcu_dereference_check() usage! [ 471.665968] [ 471.665968] other info that might help us debug this: [ 471.665968] [ 471.672141] [ 471.672141] rcu_scheduler_active = 1, debug_locks = 1 [ 471.677549] 1 lock held by trinity-child0/3774: [ 471.681675] #0: (&(&mm->ioctx_lock)->rlock){+.+...}, at: [] SyS_io_setup+0x63a/0xc70 [ 471.688721] [ 471.688721] stack backtrace: [ 471.692488] CPU: 1 PID: 3774 Comm: trinity-child0 Not tainted 3.11.0-next-20130906+ #496 [ 471.698437] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 471.703151] 00000000 00000000 c58bbf30 c18a814b de2234c0 c58bbf58 c10a4ec6 c1b0d824 [ 471.709544] c1b0f60e 00000001 00000001 c1af61b0 00000000 cb670ac0 c3aca000 c58bbfac [ 471.716251] c119bc7c 00000002 00000001 00000000 c119b8dd 00000000 c10cf684 c58bbfb4 [ 471.722902] Call Trace: [ 471.724859] [] dump_stack+0x4b/0x66 [ 471.728772] [] lockdep_rcu_suspicious+0xc6/0x100 [ 471.733716] [] SyS_io_setup+0x89c/0xc70 [ 471.737806] [] ? SyS_io_setup+0x4fd/0xc70 [ 471.741689] [] ? __audit_syscall_entry+0x94/0xe0 [ 471.746080] [] syscall_call+0x7/0xb [ 471.749723] [] ? task_fork_fair+0x240/0x260 Signed-off-by: Artem Savkov Reviewed-by: Gu Zheng Signed-off-by: Benjamin LaHaise --- fs/aio.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/aio.c b/fs/aio.c index f4a27af6c9ac..6b868f0e0c4c 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -479,6 +479,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) struct aio_ring *ring; spin_lock(&mm->ioctx_lock); + rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); while (1) { @@ -487,6 +488,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) if (!table->table[i]) { ctx->id = i; table->table[i] = ctx; + rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); ring = kmap_atomic(ctx->ring_pages[0]); @@ -497,6 +499,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) new_nr = (table ? table->nr : 1) * 4; + rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * @@ -507,6 +510,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) table->nr = new_nr; spin_lock(&mm->ioctx_lock); + rcu_read_lock(); old = rcu_dereference(mm->ioctx_table); if (!old) { @@ -631,10 +635,12 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) struct kioctx_table *table; spin_lock(&mm->ioctx_lock); + rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; + rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); /* percpu_ref_kill() will do the necessary call_rcu() */