aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTheodore Ts'o <tytso@mit.edu>2015-01-22 15:20:55 -0500
committerMister Oyster <oysterized@gmail.com>2017-05-27 19:39:53 +0200
commit56d6480a113d3a0d8914bb29611a6564760b935e (patch)
tree5f45ae2c3b995c47bf9b234473ec9f9f88688b7e
parentd23347545e98b330e5d9a6dc88180c16e53bf405 (diff)
non-ext4 portions of "direct-io: Implement generic deferred AIO completions"
Originally from 7b7a8665edd8db73 Signed-off-by: Theodore Ts'o <tytso@mit.edu>
-rw-r--r--fs/direct-io.c85
-rw-r--r--fs/ocfs2/aops.c8
-rw-r--r--fs/super.c18
-rw-r--r--fs/xfs/xfs_aops.c28
-rw-r--r--fs/xfs/xfs_aops.h3
-rw-r--r--include/linux/buffer_head.h2
-rw-r--r--include/linux/fs.h7
7 files changed, 91 insertions, 60 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b85d5d759..3f92cba09 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -127,6 +127,7 @@ struct dio {
spinlock_t bio_lock; /* protects BIO fields below */
int page_errors; /* errno from get_user_pages() */
int is_async; /* is IO async ? */
+ bool defer_completion; /* defer AIO completion to workqueue? */
int io_error; /* IO error in completion path */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
@@ -141,7 +142,10 @@ struct dio {
* allocation time. Don't add new fields after pages[] unless you
* wish that they not be zeroed.
*/
- struct page *pages[DIO_PAGES]; /* page buffer */
+ union {
+ struct page *pages[DIO_PAGES]; /* page buffer */
+ struct work_struct complete_work;/* deferred AIO completion */
+ };
} ____cacheline_aligned_in_smp;
static struct kmem_cache *dio_cache __read_mostly;
@@ -221,16 +225,16 @@ static inline struct page *dio_get_page(struct dio *dio,
* dio_complete() - called when all DIO BIO I/O has been completed
* @offset: the byte offset in the file of the completed operation
*
- * This releases locks as dictated by the locking type, lets interested parties
- * know that a DIO operation has completed, and calculates the resulting return
- * code for the operation.
+ * This drops i_dio_count, lets interested parties know that a DIO operation
+ * has completed, and calculates the resulting return code for the operation.
*
* It lets the filesystem know if it registered an interest earlier via
* get_block. Pass the private field of the map buffer_head so that
* filesystems can use it to hold additional state between get_block calls and
* dio_complete.
*/
-static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
+ bool is_async)
{
ssize_t transferred = 0;
@@ -258,19 +262,26 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
if (ret == 0)
ret = transferred;
- if (dio->end_io && dio->result) {
- dio->end_io(dio->iocb, offset, transferred,
- dio->private, ret, is_async);
- } else {
- inode_dio_done(dio->inode);
- if (is_async)
- aio_complete(dio->iocb, ret, 0);
- }
+ if (dio->end_io && dio->result)
+ dio->end_io(dio->iocb, offset, transferred, dio->private);
+
+ inode_dio_done(dio->inode);
+ if (is_async)
+ aio_complete(dio->iocb, ret, 0);
+ kmem_cache_free(dio_cache, dio);
return ret;
}
+static void dio_aio_complete_work(struct work_struct *work)
+{
+ struct dio *dio = container_of(work, struct dio, complete_work);
+
+ dio_complete(dio, dio->iocb->ki_pos, 0, true);
+}
+
static int dio_bio_complete(struct dio *dio, struct bio *bio);
+
/*
* Asynchronous IO callback.
*/
@@ -290,8 +301,13 @@ static void dio_bio_end_aio(struct bio *bio, int error)
spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) {
- dio_complete(dio, dio->iocb->ki_pos, 0, true);
- kmem_cache_free(dio_cache, dio);
+ if (dio->result && dio->defer_completion) {
+ INIT_WORK(&dio->complete_work, dio_aio_complete_work);
+ queue_work(dio->inode->i_sb->s_dio_done_wq,
+ &dio->complete_work);
+ } else {
+ dio_complete(dio, dio->iocb->ki_pos, 0, true);
+ }
}
}
@@ -511,6 +527,41 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
}
/*
+ * Create workqueue for deferred direct IO completions. We allocate the
+ * workqueue when it's first needed. This avoids creating workqueue for
+ * filesystems that don't need it and also allows us to create the workqueue
+ * late enough so the we can include s_id in the name of the workqueue.
+ */
+static int sb_init_dio_done_wq(struct super_block *sb)
+{
+ struct workqueue_struct *wq = alloc_workqueue("dio/%s",
+ WQ_MEM_RECLAIM, 0,
+ sb->s_id);
+ if (!wq)
+ return -ENOMEM;
+ /*
+ * This has to be atomic as more DIOs can race to create the workqueue
+ */
+ cmpxchg(&sb->s_dio_done_wq, NULL, wq);
+ /* Someone created workqueue before us? Free ours... */
+ if (wq != sb->s_dio_done_wq)
+ destroy_workqueue(wq);
+ return 0;
+}
+
+static int dio_set_defer_completion(struct dio *dio)
+{
+ struct super_block *sb = dio->inode->i_sb;
+
+ if (dio->defer_completion)
+ return 0;
+ dio->defer_completion = true;
+ if (!sb->s_dio_done_wq)
+ return sb_init_dio_done_wq(sb);
+ return 0;
+}
+
+/*
* Call into the fs to map some more disk blocks. We record the current number
* of available blocks at sdio->blocks_available. These are in units of the
* fs blocksize, (1 << inode->i_blkbits).
@@ -581,6 +632,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
/* Store for completion */
dio->private = map_bh->b_private;
+
+ if (ret == 0 && buffer_defer_completion(map_bh))
+ ret = dio_set_defer_completion(dio);
}
return ret;
}
@@ -1300,7 +1354,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
if (drop_refcount(dio) == 0) {
retval = dio_complete(dio, offset, retval, false);
- kmem_cache_free(dio_cache, dio);
} else
BUG_ON(retval != -EIOCBQUEUED);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index f998c6009..c92061ef9 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -565,9 +565,7 @@ bail:
static void ocfs2_dio_end_io(struct kiocb *iocb,
loff_t offset,
ssize_t bytes,
- void *private,
- int ret,
- bool is_async)
+ void *private)
{
struct inode *inode = file_inode(iocb->ki_filp);
int level;
@@ -592,10 +590,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
level = ocfs2_iocb_rw_locked_level(iocb);
ocfs2_rw_unlock(inode, level);
-
- inode_dio_done(inode);
- if (is_async)
- aio_complete(iocb, ret, 0);
}
/*
diff --git a/fs/super.c b/fs/super.c
index 388f3832c..581c3925b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -156,15 +156,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
static const struct super_operations default_op;
if (s) {
- if (security_sb_alloc(s)) {
- /*
- * We cannot call security_sb_free() without
- * security_sb_alloc() succeeding. So bail out manually
- */
- kfree(s);
- s = NULL;
- goto out;
- }
+ if (security_sb_alloc(s))
+ goto out_free_sb;
+
if (init_sb_writers(s, type))
goto err_out;
s->s_flags = flags;
@@ -215,6 +209,7 @@ out:
err_out:
security_sb_free(s);
destroy_sb_writers(s);
+out_free_sb:
kfree(s);
s = NULL;
goto out;
@@ -398,6 +393,11 @@ void generic_shutdown_super(struct super_block *sb)
evict_inodes(sb);
+ if (sb->s_dio_done_wq) {
+ destroy_workqueue(sb->s_dio_done_wq);
+ sb->s_dio_done_wq = NULL;
+ }
+
if (sop->put_super)
sop->put_super(sb);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index cfbb4c1b2..f57232578 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -86,14 +86,6 @@ xfs_destroy_ioend(
bh->b_end_io(bh, !ioend->io_error);
}
- if (ioend->io_iocb) {
- inode_dio_done(ioend->io_inode);
- if (ioend->io_isasync) {
- aio_complete(ioend->io_iocb, ioend->io_error ?
- ioend->io_error : ioend->io_result, 0);
- }
- }
-
mempool_free(ioend, xfs_ioend_pool);
}
@@ -281,7 +273,6 @@ xfs_alloc_ioend(
* all the I/O from calling the completion routine too early.
*/
atomic_set(&ioend->io_remaining, 1);
- ioend->io_isasync = 0;
ioend->io_isdirect = 0;
ioend->io_error = 0;
ioend->io_list = NULL;
@@ -291,8 +282,6 @@ xfs_alloc_ioend(
ioend->io_buffer_tail = NULL;
ioend->io_offset = 0;
ioend->io_size = 0;
- ioend->io_iocb = NULL;
- ioend->io_result = 0;
ioend->io_append_trans = NULL;
INIT_WORK(&ioend->io_work, xfs_end_io);
@@ -1290,8 +1279,10 @@ __xfs_get_blocks(
if (create || !ISUNWRITTEN(&imap))
xfs_map_buffer(inode, bh_result, &imap, offset);
if (create && ISUNWRITTEN(&imap)) {
- if (direct)
+ if (direct) {
bh_result->b_private = inode;
+ set_buffer_defer_completion(bh_result);
+ }
set_buffer_unwritten(bh_result);
}
}
@@ -1388,9 +1379,7 @@ xfs_end_io_direct_write(
struct kiocb *iocb,
loff_t offset,
ssize_t size,
- void *private,
- int ret,
- bool is_async)
+ void *private)
{
struct xfs_ioend *ioend = iocb->private;
@@ -1412,17 +1401,10 @@ xfs_end_io_direct_write(
ioend->io_offset = offset;
ioend->io_size = size;
- ioend->io_iocb = iocb;
- ioend->io_result = ret;
if (private && size > 0)
ioend->io_type = XFS_IO_UNWRITTEN;
- if (is_async) {
- ioend->io_isasync = 1;
- xfs_finish_ioend(ioend);
- } else {
- xfs_finish_ioend_sync(ioend);
- }
+ xfs_finish_ioend_sync(ioend);
}
STATIC ssize_t
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index c325abb8d..f94dd459d 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -45,7 +45,6 @@ typedef struct xfs_ioend {
unsigned int io_type; /* delalloc / unwritten */
int io_error; /* I/O error code */
atomic_t io_remaining; /* hold count */
- unsigned int io_isasync : 1; /* needs aio_complete */
unsigned int io_isdirect : 1;/* direct I/O */
struct inode *io_inode; /* file being written to */
struct buffer_head *io_buffer_head;/* buffer linked list head */
@@ -54,8 +53,6 @@ typedef struct xfs_ioend {
xfs_off_t io_offset; /* offset in the file */
struct work_struct io_work; /* xfsdatad work queue */
struct xfs_trans *io_append_trans;/* xact. for size update */
- struct kiocb *io_iocb;
- int io_result;
} xfs_ioend_t;
extern const struct address_space_operations xfs_address_space_operations;
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index a66bbbf79..c065c02af 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -36,6 +36,7 @@ enum bh_state_bits {
BH_Quiet, /* Buffer Error Prinks to be quiet */
BH_Meta, /* Buffer contains metadata */
BH_Prio, /* Buffer should be submitted with REQ_PRIO */
+ BH_Defer_Completion, /* Defer AIO completion to workqueue */
BH_PrivateStart,/* not a state bit, but the first bit available
* for private allocation by other entities
@@ -128,6 +129,7 @@ BUFFER_FNS(Write_EIO, write_io_error)
BUFFER_FNS(Unwritten, unwritten)
BUFFER_FNS(Meta, meta)
BUFFER_FNS(Prio, prio)
+BUFFER_FNS(Defer_Completion, defer_completion)
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9bfff5d2b..16c7d1778 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -46,6 +46,7 @@ struct vfsmount;
struct cred;
struct swap_info_struct;
struct seq_file;
+struct workqueue_struct;
struct fscrypt_info;
struct fscrypt_operations;
@@ -65,8 +66,7 @@ struct buffer_head;
typedef int (get_block_t)(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create);
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
- ssize_t bytes, void *private, int ret,
- bool is_async);
+ ssize_t bytes, void *private);
#define MAY_EXEC 0x00000001
#define MAY_WRITE 0x00000002
@@ -1343,6 +1343,9 @@ struct super_block {
/* Being remounted read-only */
int s_readonly_remount;
+
+ /* AIO completions deferred from interrupt context */
+ struct workqueue_struct *s_dio_done_wq;
};
/* superblock cache pruning functions */