Signed-off-by: jim owens <[email protected]>
Signed-off-by: jim owens <[email protected]>
---

V2 is a merge of my original file:
http://article.gmane.org/gmane.comp.file-systems.btrfs/4530

and the fixes produced from Josef Bacik's fsx testing:
http://article.gmane.org/gmane.comp.file-systems.btrfs/4612

I included my new email address as a second sign-off.

 fs/btrfs/dio.c | 1945 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 1945 insertions(+), 0 deletions(-)
 create mode 100644 fs/btrfs/dio.c

diff --git a/fs/btrfs/dio.c b/fs/btrfs/dio.c
new file mode 100644
index 0000000..3315cc9
--- /dev/null
+++ b/fs/btrfs/dio.c
@@ -0,0 +1,1945 @@
+/*
+ * (c) Copyright Hewlett-Packard Development Company, L.P., 2009
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/bio.h>
+#include <linux/mm.h>
+#include <linux/mmu_context.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/page-flags.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/blkdev.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+
+#include "extent_io.h"
+#include "extent_map.h"
+#include "compat.h"
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "compression.h"
+#include "disk-io.h"
+
+
/* per-stripe working info while building and submitting I/O */
struct btrfs_dio_dev {
	u64 physical;		/* byte number on device */
	int vecs;		/* number of unused bio_vecs in bio */
	int unplug;		/* bios were submitted so issue unplug */
	struct bio *bio;	/* bio currently being filled for this stripe */
};
+
/* modified working copy that describes current state of user memory
 * remaining to submit I/O on, or on I/O completion the area of user
 * memory that applies to the uncompressed extent.
 * the "gup" fields track a batch of pages pinned via get_user_pages.
 */
struct btrfs_dio_user_mem_control {
	const struct iovec *user_iov;	/* user input vector being processed */
	struct iovec work_iov;		/* updated base/len for part not done */
	long remaining;			/* total user input memory left */
	long todo;			/* user mem applicable to extent part */
	int next_user_page;		/* next unconsumed entry in pagelist */
	int user_pages_left;		/* pinned pages not yet consumed */
	int gup_max;			/* capacity of pagelist array */
	struct page **pagelist;		/* pages pinned by get_user_pages */
};
+
+/* max bios that we can process in one extent - minimum 32 for compression */
+#define MAX_STRIPE_SEGMENTS 32
+#define CSUM_RESERVE_SEGMENTS 1
+
/* per-physical-extent submit/completion processing info */
struct btrfs_dio_extcb {
	struct btrfs_dio_extcb *next;	/* links extcb on diocb->done_extcbs */
	struct btrfs_diocb *diocb;	/* owning request control block */

	struct extent_map *em;		/* chunk stripe map for this extent */
	/* active_umc points at diocb.umc in submit and extcb.umc in completion */
	struct btrfs_dio_user_mem_control *active_umc;
	struct btrfs_dio_user_mem_control umc;
	struct extent_buffer *leaf;

	struct btrfs_inflate icb;	/* extent decompression processing */

	u64 filestart;			/* file position when extent submit began */
	u64 iostart;			/* logical disk byte where I/O starts */
	u32 iolen;
	u32 filetail;			/* unaligned tail bytes read into temp
					 * page, copied to user on completion */
	u32 beforetail;			/* user bytes transferred before tail */

	u64 lockstart;			/* io_tree range owned by this extcb, */
	u64 lockend;			/* unlocked when its completion runs */

	int compressed;			/* whole extent read into temp pages */
	int stripes;			/* entries in diodev[] */
	int error;			/* first error seen on a bio */
	int pending_bios;		/* in-flight bios + one submit-side hold */
	int shortread;
	int retry_mirror;		/* state for rereading a failed block */
	u32 retry_len;
	u32 retry_csum;
	u64 retry_start;
	struct bio *retry_bio;

	char *tmpbuf;			/* for fetching range of checksums */
	int tmpbuf_size;

	int bo_used;			/* order[] bio entries in use */
	int bo_now;			/* order[bo_now] being completed */
	int bo_bvn;			/* order[bo_now] bi_io_vec being completed */
	int bo_frag;			/* bv_len unfinished on error */

	struct page *csum_pg1;		/* temp read area for unaligned I/O */
	struct page *csum_pg2;		/* may need two for head and tail */
	struct bio *order[MAX_STRIPE_SEGMENTS + CSUM_RESERVE_SEGMENTS];
	struct btrfs_dio_dev diodev[];	/* array size based on stripes */
};
+
+#define GUP_IOSUBMIT_MAX 64            /* same as fs/direct-io.c */
+#define GUP_IODONE_MAX 33              /* unaligned inflate 128k + 1 page */
+
/* single master control for user's directIO request */
struct btrfs_diocb {
	spinlock_t diolock;		/* guards completion/reaper/waiter state */
	struct kiocb *kiocb;
	struct inode *inode;
	u64 start;			/* current submit file position */
	u64 end;
	u64 lockstart;			/* io_tree range still owned by submit */
	u64 lockend;
	u64 begin;			/* original beginning file position */
	u64 terminate;			/* fpos after failed submit/completion */

	struct btrfs_dio_user_mem_control umc;	/* submit-side user mem cursor */
	struct workspace *workspace;	/* decompression workspace, freed with diocb */
	char *csum_buf;

	u32 blocksize;			/* root->sectorsize */
	int rw;				/* READ or WRITE */
	int error;			/* first error from submit or completion */
	int sleeping;			/* waiter blocked in btrfs_dio_wait */
	int reaping;			/* reaper worker queued or running */
	int pending_extcbs;		/* extents submitted, not yet reaped */
	struct btrfs_dio_extcb *done_extcbs;	/* extents with all bios done */

	struct mm_struct *user_mm;	/* workers assume state of user task */
	struct task_struct *waiter;	/* final completion processing */
	struct btrfs_work submit;	/* submit and finish thread for aio */
	struct btrfs_work reaper;	/* completion handling during submit */

	struct page *gup_iosubmit_pages[GUP_IOSUBMIT_MAX];	/* submit gup batch */
	struct page *gup_iodone_pages[GUP_IODONE_MAX];		/* completion gup batch */
};
+
+static void btrfs_dio_reaper(struct btrfs_work *work);
+static void btrfs_dio_aio_submit(struct btrfs_work *work);
+static ssize_t btrfs_dio_wait(struct btrfs_diocb *diocb);
+static void btrfs_dio_free_diocb(struct btrfs_diocb *diocb);
+static void btrfs_dio_extcb_biodone(struct btrfs_dio_extcb *extcb);
+static void btrfs_dio_bi_end_io(struct bio *bio, int error);
+static void btrfs_dio_write(struct btrfs_diocb *diocb);
+static void btrfs_dio_read(struct btrfs_diocb *diocb);
+static int btrfs_dio_new_extcb(struct btrfs_dio_extcb **alloc_extcb,
+                               struct btrfs_diocb *diocb, struct extent_map 
*em);
+static void btrfs_dio_eof_tail(u32 *filetail, int eof, struct btrfs_diocb 
*diocb);
+static int btrfs_dio_compressed_read(struct btrfs_diocb *diocb,
+                               struct extent_map *lem, u64 data_len);
+static int btrfs_dio_extent_read(struct btrfs_diocb *diocb,
+                               struct extent_map *lem, u64 data_len, int eof);
+static void btfrs_dio_unplug(struct btrfs_dio_extcb *extcb);
+static int btrfs_dio_read_stripes(struct btrfs_dio_extcb *extcb,
+                               u64 *rd_start, u64 *rd_len, int temp_pages);
+static void btrfs_dio_reset_next_in(struct btrfs_dio_extcb *extcb);
+static void btrfs_dio_get_next_in(struct bio_vec *vec,
+                               struct btrfs_dio_extcb *extcb);
+static void btrfs_dio_put_next_in(struct bio_vec *vec,
+                               struct btrfs_dio_extcb *extcb);
+static int btrfs_dio_inflate_next_in(struct bio_vec *ivec,
+                               struct btrfs_inflate *icb);
+static int btrfs_dio_inline_next_in(struct bio_vec *ivec,
+                               struct btrfs_inflate *icb);
+static int btrfs_dio_get_user_bvec(struct bio_vec *uv,
+                               struct btrfs_dio_user_mem_control *umc);
+static int btrfs_dio_not_aligned(unsigned long iomask, u32 testlen,
+                               struct btrfs_dio_user_mem_control *umc);
+static void btrfs_dio_put_user_bvec(struct bio_vec *uv,
+                               struct btrfs_dio_user_mem_control *umc);
+static void btrfs_dio_release_unused_pages(struct btrfs_dio_user_mem_control 
*umc);
+static void btrfs_dio_skip_user_mem(struct btrfs_dio_user_mem_control *umc,
+                               u32 skip_len);
+static int btrfs_dio_get_next_out(struct bio_vec *ovec,
+                               struct btrfs_inflate *icb);
+static void btrfs_dio_done_with_out(struct bio_vec *ovec,
+                               struct btrfs_inflate *icb);
+static void btrfs_dio_release_bios(struct btrfs_dio_extcb *extcb, int dirty);
+static void btrfs_dio_read_done(struct btrfs_dio_extcb *extcb);
+static void btrfs_dio_decompress(struct btrfs_dio_extcb *extcb);
+static void btrfs_dio_free_extcb(struct btrfs_dio_extcb *extcb);
+static int btrfs_dio_get_workbuf(struct btrfs_dio_extcb *extcb);
+static int btrfs_dio_drop_workbuf(struct btrfs_dio_extcb *extcb);
+static void btrfs_dio_complete_bios(struct btrfs_diocb *diocb);
+static int btrfs_dio_new_bio(struct btrfs_dio_extcb *extcb, int dvn);
+static void btrfs_dio_submit_bio(struct btrfs_dio_extcb *extcb, int dvn);
+static int btrfs_dio_add_user_pages(u64 *dev_left, struct btrfs_dio_extcb 
*extcb, int dvn);
+static int btrfs_dio_add_temp_pages(u64 *dev_left, struct btrfs_dio_extcb 
*extcb, int dvn);
+static int btrfs_dio_hole_read(struct btrfs_diocb *diocb, u64 hole_len);
+static int btrfs_dio_inline_read(struct btrfs_diocb *diocb, u64 *data_len);
+static int btrfs_dio_read_csum(struct btrfs_dio_extcb *extcb);
+static void btrfs_dio_free_retry(struct btrfs_dio_extcb *extcb);
+static int btrfs_dio_retry_block(struct btrfs_dio_extcb *extcb);
+static int btrfs_dio_read_retry(struct btrfs_dio_extcb *extcb);
+
+
+ssize_t btrfs_direct_IO(int rw, struct kiocb *kiocb,
+                       const struct iovec *iov, loff_t offset,
+                       unsigned long nr_segs)
+{
+       int seg;
+       ssize_t done = 0;
+       struct btrfs_diocb *diocb;
+       struct inode *inode = kiocb->ki_filp->f_mapping->host;
+
+       /* traditional 512-byte device sector alignment is the
+        * minimum required. if they have a larger sector disk
+        * (possibly multiple sizes in the filesystem) and need
+        * a larger alignment for this I/O, we just fail later. 
+        */
+       if (offset & 511)
+               return -EINVAL;
+
+       /* check memory alignment, blocks cannot straddle pages.
+        * allow 0-length vectors which are questionable but seem legal.
+        */
+       for (seg = 0; seg < nr_segs; seg++) {
+               if (iov[seg].iov_len && ((unsigned long)iov[seg].iov_base & 
511))
+                       return -EINVAL;
+               if (iov[seg].iov_len & 511)
+                       return -EINVAL;
+               done += iov[seg].iov_len;
+       }
+
+       /* limit request size to available memory */
+       done = min_t(ssize_t, done, kiocb->ki_left);
+
+       /* no write code here so fall back to buffered writes */
+       if (rw == WRITE)
+               return 0;
+
+       diocb = kzalloc(sizeof(*diocb), GFP_NOFS);
+       if (!diocb)
+               return -ENOMEM;
+
+       diocb->rw = rw;
+       diocb->kiocb = kiocb;
+       diocb->start = offset;
+       diocb->begin = offset;
+       diocb->terminate = offset + done;
+       diocb->inode = inode;
+       diocb->blocksize = BTRFS_I(diocb->inode)->root->sectorsize;
+
+       diocb->umc.user_iov = iov;
+       diocb->umc.work_iov = *iov;
+       diocb->umc.remaining = done;
+       diocb->umc.gup_max = GUP_IOSUBMIT_MAX;
+       diocb->umc.pagelist = diocb->gup_iosubmit_pages;
+
+       spin_lock_init(&diocb->diolock);
+
+       diocb->user_mm = current->mm;
+       diocb->reaper.func = btrfs_dio_reaper;
+       btrfs_set_work_high_prio(&diocb->reaper);
+
+       if (is_sync_kiocb(diocb->kiocb)) {
+               if (diocb->rw == READ)
+                       btrfs_dio_read(diocb);
+               else
+                       btrfs_dio_write(diocb);
+               done = btrfs_dio_wait(diocb);
+
+               btrfs_dio_free_diocb(diocb);
+               return done;
+       } else {
+               diocb->submit.func = btrfs_dio_aio_submit;
+               btrfs_queue_worker(&BTRFS_I(diocb->inode)->root->fs_info->
+                               submit_workers, &diocb->submit);
+               return -EIOCBQUEUED;
+       }
+}
+
/* process context worker routine to handle bio completion
 * for extents that finish while submitting other extents,
 * limited to one thread for a dio so we don't hog the cpus
 */
static void btrfs_dio_reaper(struct btrfs_work *work)
{
	struct btrfs_diocb *diocb =
		container_of(work, struct btrfs_diocb, reaper);

	/* completion copies data into the caller's buffers, so this
	 * worker temporarily adopts the submitting task's mm
	 */
	use_mm(diocb->user_mm);

	btrfs_dio_complete_bios(diocb);

	/* clear reaping under the lock; if every extent finished while
	 * we ran and the submitter is asleep, hand control back to it
	 */
	spin_lock_irq(&diocb->diolock);
	diocb->reaping = 0;
	if (!diocb->pending_extcbs && diocb->sleeping) {
		diocb->sleeping = 0;
		wake_up_process(diocb->waiter);
	}
	spin_unlock_irq(&diocb->diolock);

	unuse_mm(diocb->user_mm);

	/* return control to btrfs worker pool */
}
+
/* process context worker routine to handle aio submit
 * and final completion callback
 */
static void btrfs_dio_aio_submit(struct btrfs_work *work)
{
	struct btrfs_diocb *diocb =
		container_of(work, struct btrfs_diocb, submit);
	ssize_t done;

	/* adopt the issuing task's address space so user buffers
	 * can be pinned and copied to from this worker thread
	 */
	use_mm(diocb->user_mm);

	if (diocb->rw == READ)
		btrfs_dio_read(diocb);
	else
		btrfs_dio_write(diocb);

	/* blocks until every submitted extent has completed */
	done = btrfs_dio_wait(diocb);

	aio_complete(diocb->kiocb, done, 0);

	unuse_mm(diocb->user_mm);

	btrfs_dio_free_diocb(diocb);

	/* return control to btrfs worker pool */
}
+
/* wait in the submitting task for every extent to finish.
 * once the reaper is idle, remaining bio completion work is
 * performed here.  returns bytes transferred, or the recorded
 * error when nothing was transferred.
 */
static ssize_t btrfs_dio_wait(struct btrfs_diocb *diocb)
{
	ssize_t done;

	spin_lock_irq(&diocb->diolock);
	diocb->waiter = current;

	/* after reaper terminates, we complete any remaining bios */
	do {
		if (diocb->reaping ||
		    (diocb->pending_extcbs && !diocb->done_extcbs)) {
			diocb->sleeping = 1;
			__set_current_state(TASK_UNINTERRUPTIBLE);
			spin_unlock_irq(&diocb->diolock);
			io_schedule();
			spin_lock_irq(&diocb->diolock);
		}
		/* drop the lock to run completion processing */
		spin_unlock_irq(&diocb->diolock);
		btrfs_dio_complete_bios(diocb);
		spin_lock_irq(&diocb->diolock);
	} while (diocb->pending_extcbs || diocb->done_extcbs);

	spin_unlock_irq(&diocb->diolock);

	/* terminate is pulled back on failure, so the smaller of
	 * start/terminate bounds the bytes successfully done
	 */
	done = min(diocb->start, diocb->terminate) - diocb->begin;
	return done ? done : diocb->error;
}
+
+static void btrfs_dio_free_diocb(struct btrfs_diocb *diocb)
+{
+       if (diocb->workspace)
+               free_workspace(diocb->workspace);
+       kfree(diocb->csum_buf);
+       kfree(diocb);
+}
+
/* must be called with diocb->diolock held.
 * performs "all bios are done for extcb" processing
 * to prevent submit/reap thread race
 */
static void btrfs_dio_extcb_biodone(struct btrfs_dio_extcb *extcb)
{
	struct btrfs_diocb *diocb = extcb->diocb;

	if (--extcb->pending_bios == 0) {
		/* last bio: push extcb onto the done list */
		extcb->next = diocb->done_extcbs;
		diocb->done_extcbs = extcb;
		if (!diocb->reaping) {
			if (!diocb->waiter) {
				/* submit still in progress, defer completion
				 * to the high-prio reaper worker
				 */
				diocb->reaping = 1;
				btrfs_queue_worker(
					&BTRFS_I(diocb->inode)->root->fs_info->
					endio_workers, &diocb->reaper);
			} else if (diocb->sleeping) {
				/* submit finished; the waiting task will
				 * run remaining completion itself
				 */
				diocb->sleeping = 0;
				wake_up_process(diocb->waiter);
			}
		}
	}
}
+
+/* only thing we run in interrupt context, bio completion
+ * processing is always deferred from interrupt context so
+ * we can handle compressed extents, checksums, and retries
+ */
+static void btrfs_dio_bi_end_io(struct bio *bio, int error)
+{
+       struct btrfs_dio_extcb *extcb = bio->bi_private;
+       unsigned long flags;
+
+       if (error)
+               clear_bit(BIO_UPTODATE, &bio->bi_flags);
+
+       spin_lock_irqsave(&extcb->diocb->diolock, flags);
+       if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+               extcb->error = error ? error : -EIO;
+       btrfs_dio_extcb_biodone(extcb);
+       spin_unlock_irqrestore(&extcb->diocb->diolock, flags);
+}
+
/* directIO write path is not implemented; btrfs_direct_IO returns 0
 * for WRITE before reaching the submit paths, forcing fallback to
 * buffered writes.  kept as a stub so read/write dispatch is symmetric.
 */
static void btrfs_dio_write(struct btrfs_diocb *diocb)
{
}
+
/* top-level read submit loop.  takes i_mutex to stabilize i_size and
 * flush ordered data, locks the extent range, then walks the file
 * range extent by extent dispatching to hole/inline/compressed/normal
 * readers.  errors are recorded in diocb->error; ownership of locked
 * sub-ranges passes to each submitted extcb.
 */
static void btrfs_dio_read(struct btrfs_diocb *diocb)
{
	struct extent_io_tree *io_tree = &BTRFS_I(diocb->inode)->io_tree;
	u64 end = diocb->terminate; /* copy because reaper changes it */
	u64 data_len;
	int err = 0;
	int loop = 0;

	/* expand lock region to include what we read to validate checksum */
	diocb->lockstart = diocb->start & ~(diocb->blocksize-1);
	diocb->lockend = ALIGN(diocb->terminate, diocb->blocksize) - 1;

getlock:
	mutex_lock(&diocb->inode->i_mutex);

	/* ensure writeout and btree update on everything
	 * we might read for checksum or compressed extents
	 */
	data_len = diocb->lockend + 1 - diocb->lockstart;
	err = btrfs_wait_ordered_range(diocb->inode, diocb->lockstart, data_len);
	if (err) {
		diocb->error = err;
		mutex_unlock(&diocb->inode->i_mutex);
		return;
	}
	/* clamp the request to i_size while i_mutex protects it */
	data_len = i_size_read(diocb->inode);
	if (data_len < end)
		end = data_len;
	if (end <= diocb->start) {
		mutex_unlock(&diocb->inode->i_mutex);
		goto fail; /* 0 is returned past EOF */
	}
	/* first pass only: shrink terminate/lockend to EOF-clamped size */
	if (!loop) {
		loop++;
		diocb->terminate = end;
		diocb->lockend = ALIGN(diocb->terminate, diocb->blocksize) - 1;
	}

	lock_extent(io_tree, diocb->lockstart, diocb->lockend, GFP_NOFS);
	mutex_unlock(&diocb->inode->i_mutex);

	data_len = end - diocb->start;
	while (data_len && !diocb->error) { /* error in reaper stops submit */
		struct extent_map *em;
		u64 len = data_len;

		/* NOTE(review): btrfs_get_extent can also return an
		 * ERR_PTR; only NULL is checked here — confirm
		 */
		em = btrfs_get_extent(diocb->inode, NULL, 0, diocb->start, len, 0);
		if (!em) {
			err = -EIO;
			goto fail;
		}

		/* must be problem flushing ordered data with btree not updated */
		if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
			printk(KERN_ERR "btrfs directIO extent map incomplete ino %lu "
				"extent start %llu len %llu\n",
				diocb->inode->i_ino, diocb->start, len);
			err = -EIO;
			goto fail;
		}

		if (em->block_start == EXTENT_MAP_INLINE) {
			/* ugly stuff because inline can exist in a large file
			 * with other extents if a hole immediately follows.
			 * the inline might end short of the btrfs block with
			 * an implied hole that we need to zero here.
			 */
			u64 expected = min(diocb->start + len, em->start + em->len);
			err = btrfs_dio_inline_read(diocb, &len);
			if (!err && expected > diocb->start) {
				data_len -= len;
				len = expected - diocb->start;
				err = btrfs_dio_hole_read(diocb, len);
			}
		} else {
			len = min(len, em->len - (diocb->start - em->start));
			if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
					em->block_start == EXTENT_MAP_HOLE) {
				err = btrfs_dio_hole_read(diocb, len);
			} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
				if (diocb->lockstart > em->start || diocb->lockend <
						em->start + em->len - 1) {
					/* lock everything we must read to inflate */
					unlock_extent(io_tree, diocb->lockstart,
						diocb->lockend, GFP_NOFS);
					diocb->lockstart = em->start;
					diocb->lockend = max(diocb->lockend,
							em->start + em->len - 1);
					free_extent_map(em);
					goto getlock;
				}
				err = btrfs_dio_compressed_read(diocb, em, len);
			} else {
				err = btrfs_dio_extent_read(diocb, em, len,
							len == data_len);
			}
		}

		free_extent_map(em);
		data_len -= len;
		if (err)
			goto fail;
		cond_resched();
	}
fail:
	if (err)
		diocb->error = err;

	/* extent processing routines unlock or keep locked their
	 * range as appropriate for submitted bios, so we only
	 * need to unlock the unprocessed remainder
	 */
	if (diocb->lockstart <= diocb->lockend)
		unlock_extent(io_tree, diocb->lockstart, diocb->lockend, GFP_NOFS);
}
+
+static int btrfs_dio_new_extcb(struct btrfs_dio_extcb **alloc_extcb,
+                               struct btrfs_diocb *diocb, struct extent_map 
*em)
+{
+       int devices = btrfs_map_stripe_count(em);
+       struct btrfs_dio_extcb *extcb;
+
+       extcb = kzalloc(sizeof(*extcb) +
+                       sizeof(struct btrfs_dio_dev) * devices, GFP_NOFS);
+       if (!extcb)     
+               return -ENOMEM;
+
+       extcb->em = em;
+       extcb->diocb = diocb;
+       extcb->filestart = diocb->start;
+       extcb->stripes = devices;
+
+       /* need these for completion error/tail processing */
+       extcb->umc.work_iov = diocb->umc.work_iov;
+       extcb->umc.user_iov = diocb->umc.user_iov;
+       extcb->umc.remaining = diocb->umc.remaining;
+
+       /* can use common list because we run 1 completion thread */
+       extcb->umc.gup_max = GUP_IODONE_MAX;
+       extcb->umc.pagelist = diocb->gup_iodone_pages;
+
+       extcb->pending_bios = 1;        /* prevent reaping race */
+       *alloc_extcb = extcb;
+       return 0;
+}
+
+/* compressed data is at most 128kb uncompressed and will be in
+ * one single matching logical->physical extent map that may be
+ * multiple raid stripes. we must read the whole compressed extent
+ * to inflate it, independent of user file data_start and data_len.
+ */
+static int btrfs_dio_compressed_read(struct btrfs_diocb *diocb,
+                               struct extent_map *lem, u64 data_len)
+{
+       struct extent_map_tree *em_tree = &BTRFS_I(diocb->inode)->
+               root->fs_info->mapping_tree.map_tree;
+       u64 compressed_start = lem->block_start;
+       u64 compressed_len = lem->block_len;
+       struct extent_map *em;
+       int err;
+       struct btrfs_dio_extcb *extcb;
+
+       /* get single extent map with device raid layout for compressed data */ 
+       read_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, compressed_start, compressed_len);
+       read_unlock(&em_tree->lock);
+       BUG_ON(em->block_len < data_len);
+
+       err = btrfs_dio_new_extcb(&extcb, diocb, em);
+       if (err) {
+               free_extent_map(em);
+               return err;
+       }
+
+       /* we now own this range and will unlock it in our completion */
+       extcb->lockstart = diocb->lockstart;
+       extcb->lockend = diocb->lockstart + lem->len - 1;
+       diocb->lockstart += lem->len;
+
+       extcb->compressed = 1;
+       extcb->iostart = compressed_start;
+       extcb->icb.out_start = diocb->start - lem->start;
+       extcb->icb.out_len = data_len;
+       extcb->icb.get_next_in = btrfs_dio_inflate_next_in;
+       extcb->icb.get_next_out = btrfs_dio_get_next_out;
+       extcb->icb.done_with_out = btrfs_dio_done_with_out;
+
+       /* completion code is per-extent on user memory */
+       extcb->active_umc = &extcb->umc;
+       extcb->umc.todo = data_len;
+
+       /* read entire compressed extent into temp pages,
+        * it must all fit in one extcb for us to inflate
+        */
+       err = btrfs_dio_read_stripes(extcb, &compressed_start, &compressed_len, 
1);
+       if (compressed_len && !err)
+               err = -EIO;
+       if (!err)
+               diocb->start += data_len;
+
+       /* adjust diocb->iov and diocb->iov_left to account
+        * for uncompressed size so we start the next extent
+        * at the proper point in user memory
+        */
+       btrfs_dio_skip_user_mem(&diocb->umc, data_len);
+
+       btfrs_dio_unplug(extcb);
+
+       spin_lock_irq(&diocb->diolock);
+       diocb->pending_extcbs++;
+       /* decrement pending_bios to let reaper run on extcb,
+        * it will run immediately to clean up if we failed
+        */
+       btrfs_dio_extcb_biodone(extcb);
+       spin_unlock_irq(&diocb->diolock);
+
+       return err;
+}
+
+/* for consistent eof processing between inline/compressed/normal
+ * extents, an unaligned eof gets special treatment, read into temp
+ * and memcpy to user on completion the part that does not match
+ * the users I/O alignment (for now always 511)
+ */
+static void btrfs_dio_eof_tail(u32 *filetail, int eof, struct btrfs_diocb 
*diocb)
+{
+       if (eof)
+               *filetail &= 511;
+       else
+               *filetail = 0; /* aligned direct to user memory */ 
+}
+
+/* called with a hard-sector bounded file byte data start/len
+ * which covers areas of disk data.  it might not... be contiguous,
+ * be on the same device(s), have the same redundancy property.
+ * get the extent map per contiguous chunk and submit bios.
+ */
+
+static int btrfs_dio_extent_read(struct btrfs_diocb *diocb,
+                               struct extent_map *lem, u64 data_len, int eof)
+{
+       struct extent_map_tree *em_tree = &BTRFS_I(diocb->inode)->
+               root->fs_info->mapping_tree.map_tree;
+       u64 data_start = lem->block_start + (diocb->start - lem->start);
+       struct extent_map *em;
+       int err = -EIO;
+       int csum = !(BTRFS_I(diocb->inode)->flags & BTRFS_INODE_NODATASUM);
+       u64 csum_before = 0;
+       u64 csum_after = 0;
+       u32 filetail = (data_start + data_len) & (diocb->blocksize - 1);
+
+       if (csum) {
+               csum_before = data_start & (diocb->blocksize - 1);
+               if (filetail)
+                       csum_after = diocb->blocksize - filetail;
+       }
+
+       /* make post-eof consistent between inline/compressed/normal extents */
+       if (filetail)
+               btrfs_dio_eof_tail(&filetail, eof, diocb);
+
+       data_start -= csum_before;
+       data_len += csum_before + csum_after;
+
+       while (data_len) {
+               struct btrfs_dio_extcb *extcb;
+               u64 filelen = 0;
+
+               /* get device extent map for next contiguous chunk */ 
+               read_lock(&em_tree->lock);
+               em = lookup_extent_mapping(em_tree, data_start, data_len);
+               read_unlock(&em_tree->lock);
+
+               err = btrfs_dio_new_extcb(&extcb, diocb, em);
+               if (err) {
+                       free_extent_map(em);
+                       return err;
+               }
+
+               /* if the chunk can not fit into MAX_STRIPE_SEGMENTS,
+                * we will have to split it into multiple extcbs, but
+                * for now, do everything assuming it fits.
+                */
+               extcb->iostart = data_start;
+               /* we now own this range and will unlock it in our completion */
+               extcb->lockstart = diocb->lockstart;
+               diocb->lockstart += data_len;
+               extcb->lockend = diocb->lockstart - 1;
+
+               /* only the first extent read can start inside a
+                * btrfs block, must read part of block before
+                * user start into temp page to validate csum.
+                */
+               if (csum_before) {
+                       data_len -= csum_before;
+                       err = btrfs_dio_read_stripes(extcb,
+                               &data_start, &csum_before, 1);
+                       if (err)
+                               goto fail;
+                       BUG_ON(csum_before);
+               }
+
+               /* device transfers to user pages in sector alignment
+                * but file tail can be 1-byte aligned.  since we need
+                * to have a temp page for checksum, we put the tail in
+                * that page and copy it to user memory on completion so
+                * post-xfer-memory looks the same as compressed or inline 
+                */
+               data_len -= csum_after + filetail;
+               filelen = data_len;
+               if (data_len) {
+                       /* add_user_pages submits must be done using diocb */
+                       extcb->active_umc = &diocb->umc;
+                       err = btrfs_dio_read_stripes(extcb,
+                               &data_start, &data_len, 0);
+                       filelen -= data_len;
+                       if (err)
+                               goto fail;
+               }
+
+               if (data_len) {
+                       /* chunk must not have fit in MAX_STRIPE_SEGMENTS,
+                        * fix everything to reflect our current state
+                        * so we can process more of the chunk in a new extcb.
+                        * we save an extra bio slot to handle the case that
+                        * the user memory vectors caused a partial last block
+                        * when we need a full one for checksums. add part of
+                        * extent as "tail checksum" and recalculate what we
+                        * have remaining for next loop.
+                        */
+                       if (csum && (extcb->iolen & (diocb->blocksize - 1))) {
+                               u64 align_size = diocb->blocksize -
+                                       (extcb->iolen & (diocb->blocksize - 1));
+
+                               data_len += filetail;
+                               if (data_len <= align_size) {
+                                       extcb->filetail = data_len;
+                                       data_len = 0;
+                               } else {
+                                       extcb->filetail = align_size;
+                                       filetail = (data_start + data_len) &
+                                                       (diocb->blocksize - 1);
+                                       data_len -= align_size;
+                                       if (csum && filetail)
+                                               csum_after = diocb->blocksize - 
filetail;
+                                       else
+                                               csum_after = 0;
+                                       if (filetail)
+                                               btrfs_dio_eof_tail(&filetail, 
eof, diocb);
+                               }
+
+                               extcb->csum_pg2 = extcb->csum_pg1;
+                               err = btrfs_dio_read_stripes(extcb,
+                                       &data_start, &align_size, 1);
+                               if (!err && align_size)
+                                       err = -EIO;
+                               if (err) {
+                                       extcb->filetail = 0;
+                                       goto fail;
+                               }
+                               /* must skip area we will copy into on 
completion */
+                               btrfs_dio_skip_user_mem(&diocb->umc, 
extcb->filetail);
+                               extcb->beforetail = filelen;
+                       }
+                       data_len += csum_after + filetail;
+                       extcb->lockend -= data_len;
+                       diocb->lockstart = extcb->lockend + 1;
+               } else if (csum_after || filetail) {
+                       /* only the last extent read can end inside a
+                        * btrfs block, must read part of block after
+                        * user end into temp page to validate csum.
+                        * csum_pg2 saves csum_before page in same extent.
+                        */
+                       extcb->csum_pg2 = extcb->csum_pg1;
+                       csum_after += filetail;
+                       csum_after = ALIGN(csum_after, 512); /* for no csum */
+                       err = btrfs_dio_read_stripes(extcb,
+                               &data_start, &csum_after, 1);
+                       if (err)
+                               goto fail;
+                       BUG_ON(csum_after);
+                       extcb->filetail = filetail;
+                       extcb->beforetail = filelen;
+               }
+
+fail:
+               diocb->start += filelen + extcb->filetail;
+
+               /* completion code is on extent not on diocb */
+               extcb->active_umc = &extcb->umc;
+
+               btfrs_dio_unplug(extcb);
+
+               spin_lock_irq(&diocb->diolock);
+               diocb->pending_extcbs++;
+               /* decrement pending_bios to let reaper run on extcb */
+               btrfs_dio_extcb_biodone(extcb);
+               spin_unlock_irq(&diocb->diolock);
+
+               if (err)
+                       return err;
+       }
+
+       return err;
+}
+
/*
 * Flush any bio still being assembled for each stripe device, then kick
 * the backing device's unplug hook so queued requests start immediately.
 *
 * NOTE(review): the function name is spelled "btfrs" (not "btrfs") here
 * and at its call site -- consider renaming both together in a follow-up.
 */
static void btfrs_dio_unplug(struct btrfs_dio_extcb *extcb)
{
	int dvn;

	for (dvn = 0; dvn < extcb->stripes; dvn++) {
		/* submit a partially-built bio left over for this device */
		if (extcb->diodev[dvn].bio)
			btrfs_dio_submit_bio(extcb, dvn);
		/* only unplug devices we actually submitted bios to */
		if (extcb->diodev[dvn].unplug) {
			struct backing_dev_info *bdi = blk_get_backing_dev_info(
					btrfs_map_stripe_bdev(extcb->em, dvn));
			if (bdi && bdi->unplug_io_fn)
				bdi->unplug_io_fn(bdi, NULL);
		}
	}
}
+
/* Build and submit read bios for the device stripes covering
 * [*rd_start, *rd_start + *rd_len).  On return *rd_start/*rd_len are
 * advanced past whatever was successfully queued, so a nonzero *rd_len
 * tells the caller how much work remains (e.g. when the per-extcb bio
 * limit was hit).  temp_pages selects kernel temp pages instead of
 * pinned user memory as the bio payload.
 */
static int btrfs_dio_read_stripes(struct btrfs_dio_extcb *extcb,
				u64 *rd_start, u64 *rd_len, int temp_pages)
{
	int err = -EIO;

	while (*rd_len) {
		u64 dev_left = *rd_len;
		struct btrfs_stripe_info stripe_info;
		unsigned long iomask;
		int mirror = 0;
		int dvn;

retry:
		/* map the logical range to a device stripe; dev_left is
		 * clamped to what fits in this stripe
		 */
		btrfs_map_to_stripe(extcb->em, READ, mirror, *rd_start,
				&dev_left, &stripe_info);

		dvn = stripe_info.stripe_index;
		extcb->diodev[dvn].physical = stripe_info.phys_offset +
			btrfs_map_stripe_physical(extcb->em, stripe_info.stripe_index);

		/* device start and length may not be sector aligned or
		 * user memory address/length vectors may not be aligned
		 * on a device sector because device sector size is > 512.
		 * we might have different size devices in the filesystem,
		 * so retry all copies to see if any meet the alignment.
		 */
		iomask = bdev_logical_block_size(btrfs_map_stripe_bdev(extcb->em, dvn)) - 1;
		if ((extcb->diodev[dvn].physical & iomask) || (dev_left & iomask) ||
				(!temp_pages &&
				btrfs_dio_not_aligned(iomask, (u32)dev_left,
							&extcb->diocb->umc))) {
			if (mirror < btrfs_map_num_copies(extcb->em)) {
				mirror++;
				goto retry;
			}
			/* no copy satisfies the alignment; caller falls
			 * back to buffered I/O on -ENOTBLK
			 */
			err = -ENOTBLK;
			goto bailout;
		}

		/* optimistically consume the range; rolled back below if
		 * the bio limit cuts us short
		 */
		*rd_len -= dev_left;
		*rd_start += dev_left;

		while (dev_left) {
			err = btrfs_dio_new_bio(extcb, dvn);
			if (err)
				goto bailout;
			/* remember submission order for later sequential
			 * consumption by csum/inflate
			 */
			extcb->order[extcb->bo_used] = extcb->diodev[dvn].bio;
			extcb->bo_used++;

			if (temp_pages)
				err = btrfs_dio_add_temp_pages(&dev_left,
						extcb, dvn);
			else
				err = btrfs_dio_add_user_pages(&dev_left,
						extcb, dvn);

			btrfs_dio_submit_bio(extcb, dvn);

			/* err or limit on bios we can handle in one extcb */
			if (err || extcb->bo_used == MAX_STRIPE_SEGMENTS) {
				/* give back the part we could not queue */
				*rd_len += dev_left;
				*rd_start -= dev_left;
				goto bailout;
			}
		}
	}

bailout:
	return err;
}
+
+static void btrfs_dio_reset_next_in(struct btrfs_dio_extcb *extcb)
+{
+       extcb->bo_now = 0;
+       extcb->bo_bvn = 0;
+       extcb->bo_frag = 0;
+}
+
/* Copy out the next bio_vec in submission order from the extcb's ordered
 * bio list, resuming inside a partially-consumed vector when a previous
 * put-back left a fragment, then advance the cursor.
 */
static void btrfs_dio_get_next_in(struct bio_vec *vec,
				struct btrfs_dio_extcb *extcb)
{
	*vec = extcb->order[extcb->bo_now]->bi_io_vec[extcb->bo_bvn];

	/* bo_frag bytes at the end of this vector are still unconsumed */
	if (extcb->bo_frag) {
		vec->bv_offset += vec->bv_len - extcb->bo_frag;
		vec->bv_len = extcb->bo_frag;
		extcb->bo_frag = 0;
	}

	/* step to the next bio once this one's vectors are exhausted */
	if (++extcb->bo_bvn == extcb->order[extcb->bo_now]->bi_vcnt) {
		extcb->bo_now++;
		extcb->bo_bvn = 0;
	}
}
+
/* Push back vec->bv_len bytes onto the ordered-bio read cursor, walking
 * backwards across bio_vecs and bios as needed.  A partial vector is
 * recorded in bo_frag so the next get resumes mid-vector.
 */
static void btrfs_dio_put_next_in(struct bio_vec *vec,
				struct btrfs_dio_extcb *extcb)
{
	while (vec->bv_len) {
		unsigned int bv_len;
		if (extcb->bo_frag) {
			/* current bi_io_vec is part of this put-back */
			vec->bv_len += extcb->bo_frag;
			extcb->bo_frag = 0;
			/* else put-back begins at previous bi_io_vec or bio */
		} else if (extcb->bo_bvn) {
			extcb->bo_bvn--;
		} else {
			extcb->bo_now--;
			extcb->bo_bvn = extcb->order[extcb->bo_now]->bi_vcnt - 1;
		}

		bv_len = extcb->order[extcb->bo_now]->bi_io_vec[extcb->bo_bvn].bv_len;
		if (vec->bv_len < bv_len) {
			/* stopped inside this vector: remember how much of
			 * it is now unconsumed
			 */
			extcb->bo_frag = vec->bv_len;
			vec->bv_len = 0;
			return;
		}
		vec->bv_len -= bv_len;
	}
}
+
+static int btrfs_dio_inflate_next_in(struct bio_vec *ivec,
+                               struct btrfs_inflate *icb)
+{
+       struct btrfs_dio_extcb *extcb =
+               container_of(icb, struct btrfs_dio_extcb, icb);
+
+       btrfs_dio_get_next_in(ivec, extcb);
+       return 0;
+}
+       
/* btrfs_inflate callback for inline extents: hand the decompressor the
 * next chunk of compressed bytes straight out of the leaf's extent
 * buffer pages, advancing the iostart/iolen window.
 */
static int btrfs_dio_inline_next_in(struct bio_vec *ivec,
				struct btrfs_inflate *icb)
{
	struct btrfs_dio_extcb *extcb =
		container_of(icb, struct btrfs_dio_extcb, icb);

	access_extent_buffer_page(ivec, extcb->leaf, extcb->iostart, extcb->iolen);
	extcb->iostart += ivec->bv_len;
	extcb->iolen -= ivec->bv_len;
	return 0;
}
+
/* Produce the next bio_vec of pinned user memory, pinning a fresh batch
 * with get_user_pages_fast when the current batch is used up.  The
 * returned length is clamped by page boundary, current iovec, umc->todo
 * and umc->remaining.  Returns 0 or a negative errno from the pin.
 */
static int btrfs_dio_get_user_bvec(struct bio_vec *uv,
				struct btrfs_dio_user_mem_control *umc)
{
	/* allows 0-length user iov which is questionable but seems legal */
	while (!umc->work_iov.iov_len) {
		umc->user_iov++;
		umc->work_iov = *umc->user_iov;
	}

	if (!umc->user_pages_left) {
		unsigned long addr = (unsigned long)umc->work_iov.iov_base;
		unsigned int offset = addr & (PAGE_SIZE-1);
		/* pin up to gup_max pages covering what this iovec still
		 * needs (rounded up to whole pages)
		 */
		int pages = min_t(long, umc->gup_max,
			(min_t(long, umc->work_iov.iov_len, umc->remaining)
				+ offset + PAGE_SIZE-1) / PAGE_SIZE);

		pages = get_user_pages_fast(addr, pages, 1, umc->pagelist);
		if (pages <= 0)
			return pages ? pages : -ERANGE;
		umc->user_pages_left = pages;
		umc->next_user_page = 0;
	}

	uv->bv_page = umc->pagelist[umc->next_user_page];
	uv->bv_offset = (unsigned long)umc->work_iov.iov_base
					& (PAGE_SIZE-1);
	uv->bv_len = min_t(long, PAGE_SIZE - uv->bv_offset,
			min_t(long, min_t(long, umc->todo, umc->remaining),
				umc->work_iov.iov_len));

	/* advance position for next caller */
	umc->work_iov.iov_base += uv->bv_len;
	umc->work_iov.iov_len -= uv->bv_len;
	umc->remaining -= uv->bv_len;
	umc->todo -= uv->bv_len;
	if (!umc->work_iov.iov_len || uv->bv_offset + uv->bv_len == PAGE_SIZE) {
		/* page fully consumed (or iovec ends here): move on */
		umc->next_user_page++;
		umc->user_pages_left--;
	} else {
		/* unaligned user vectors may have multiple page releasers so
		 * we must increment ref count now to prevent premature release
		 */
		get_page(uv->bv_page);
	}

	return 0;
}
+
/* Return 1 if any user-memory segment that would cover the next testlen
 * bytes violates the device sector alignment mask (base or interior
 * length misaligned); 0 if the whole span can be submitted directly.
 * A NULL umc means kernel temp pages, which are always aligned.
 */
static int btrfs_dio_not_aligned(unsigned long iomask, u32 testlen,
				struct btrfs_dio_user_mem_control *umc)
{
	const struct iovec *nuv;

	if (!umc) /* temp pages are always good */
		return 0;

	if ((unsigned long)umc->work_iov.iov_base & iomask)
		return 1;
	if (testlen <= umc->work_iov.iov_len)
		return 0;
	/* segment ends mid-span, so its length must also be aligned */
	if (umc->work_iov.iov_len & iomask)
		return 1;

	/* walk the remaining iovecs the span will touch */
	testlen -= umc->work_iov.iov_len;
	nuv = umc->user_iov;
	while (testlen) {
		nuv++;
		while (nuv->iov_len == 0)
			nuv++;
		if ((unsigned long)nuv->iov_base & iomask)
			return 1;
		if (testlen <= nuv->iov_len)
			return 0;
		if (nuv->iov_len & iomask)
			return 1;
		testlen -= nuv->iov_len;
	}
	return 0;
}
+
/* error processing only, put back the user bvec we could not process
 * so we can get it again later or release it properly.
 * Exact inverse of the bookkeeping in btrfs_dio_get_user_bvec,
 * including undoing the extra page reference taken for unaligned vectors.
 */
static void btrfs_dio_put_user_bvec(struct bio_vec *uv,
				struct btrfs_dio_user_mem_control *umc)
{
	umc->work_iov.iov_base -= uv->bv_len;
	umc->work_iov.iov_len += uv->bv_len;
	umc->remaining += uv->bv_len;
	umc->todo += uv->bv_len;
	/* mirror of the get-side advance condition */
	if (umc->work_iov.iov_len == uv->bv_len ||
			uv->bv_offset + uv->bv_len == PAGE_SIZE) {
		umc->next_user_page--;
		umc->user_pages_left++;
	} else {
		/* remove the extra ref we took on unaligned page */
		put_page(uv->bv_page);
	}
}
+
+/* error processing only, release unused user pages */
+static void btrfs_dio_release_unused_pages(struct btrfs_dio_user_mem_control 
*umc)
+{
+       while (umc->user_pages_left) {
+               page_cache_release(umc->pagelist[umc->next_user_page]);
+               umc->next_user_page++;
+               umc->user_pages_left--;
+       }
+}
+
+static void btrfs_dio_skip_user_mem(struct btrfs_dio_user_mem_control *umc,
+                               u32 skip_len)
+{
+       while (skip_len) {
+               u32 len;
+               if (!umc->work_iov.iov_len) {
+                       umc->user_iov++;
+                       umc->work_iov = *umc->user_iov;
+               }
+
+               len = min_t(u32, umc->work_iov.iov_len, skip_len);
+               umc->work_iov.iov_base += len;
+               umc->work_iov.iov_len -= len;
+               umc->remaining -= len;
+               skip_len -= len;
+       }
+}
+
+static int btrfs_dio_get_next_out(struct bio_vec *ovec,
+                               struct btrfs_inflate *icb)
+{
+       struct btrfs_dio_extcb *extcb =
+               container_of(icb, struct btrfs_dio_extcb, icb);
+       return btrfs_dio_get_user_bvec(ovec, extcb->active_umc);
+}
+
+static void btrfs_dio_done_with_out(struct bio_vec *ovec,
+                               struct btrfs_inflate *icb)
+{
+       flush_dcache_page(ovec->bv_page);
+       if (!PageCompound(ovec->bv_page))
+               set_page_dirty_lock(ovec->bv_page);
+       page_cache_release(ovec->bv_page);
+}
+
/* Release every page of every ordered bio and the bios themselves.
 * When dirty is set the user pages are marked dirty first; the checksum
 * temp pages (csum_pg1/csum_pg2) and compound pages are never dirtied.
 */
static void btrfs_dio_release_bios(struct btrfs_dio_extcb *extcb, int dirty)
{
	int vn;

	for (vn = 0; vn < extcb->bo_used; vn++) {
		struct bio *bio = extcb->order[vn];
		struct bio_vec *bvec = bio->bi_io_vec;
		int pn;

		for (pn = 0; pn < bio->bi_vcnt; pn++) {
			struct page *page = bvec[pn].bv_page;
			if (dirty && !PageCompound(page) &&
					page != extcb->csum_pg1 &&
					page != extcb->csum_pg2)
				set_page_dirty_lock(page);
			page_cache_release(page);
		}
		bio_put(bio);
	}
	extcb->bo_used = 0;
}
+
/* finish non-compressed extent that has no errors:
 * copy the checksum-tail bytes that were read into the temp page
 * (csum_pg1) out to the user's buffer, then release all bios.
 */
static void btrfs_dio_read_done(struct btrfs_dio_extcb *extcb)
{
	if (extcb->filetail) {
		/* user memory for the tail was skipped at submit time */
		btrfs_dio_skip_user_mem(extcb->active_umc, extcb->beforetail);
		extcb->active_umc->todo = extcb->filetail;
		while (extcb->active_umc->todo) {
			struct bio_vec uv;
			char *filetail;
			char *out;

			extcb->error = btrfs_dio_get_user_bvec(&uv, extcb->active_umc);
			if (extcb->error) {
				/* report good data up to the tail only */
				extcb->filestart -= extcb->active_umc->todo;
				goto fail;
			}
			filetail = kmap_atomic(extcb->csum_pg1, KM_USER0);
			/* NOTE(review): the source pointer restarts at the
			 * beginning of csum_pg1 on every iteration; if the
			 * tail spans more than one user bvec the second copy
			 * appears to re-copy the same bytes -- looks like a
			 * missing "+ bytes already copied" offset, confirm.
			 */
			memcpy(out + uv.bv_offset, filetail, uv.bv_len);
			kunmap_atomic(out, KM_USER1);
			kunmap_atomic(filetail, KM_USER0);

			btrfs_dio_done_with_out(&uv, NULL);
		}
	}
fail:
	btrfs_dio_release_bios(extcb, 1);
}
+
/* inflate and finish compressed extent that has no errors.
 * all-or-nothing as partial result from zlib is likely garbage.
 * we don't retry if decompression fails, the assumption is
 * all mirrors are trash because we had valid checksums.
 */
static void btrfs_dio_decompress(struct btrfs_dio_extcb *extcb)
{
	u32 len = extcb->icb.out_len;

	extcb->error = btrfs_zlib_inflate(&extcb->icb);

	/* ugly again - compressed extents can end with an implied hole */
	if (!extcb->error && extcb->icb.out_len != len) {
		/* zero-fill the user buffer past the inflated data */
		while (extcb->umc.todo) {
			struct bio_vec uv;
			char *out;

			extcb->error = btrfs_dio_get_user_bvec(&uv, &extcb->umc);
			if (extcb->error)
				goto fail;
			out = kmap_atomic(uv.bv_page, KM_USER0);
			memset(out + uv.bv_offset, 0, uv.bv_len);
			kunmap_atomic(out, KM_USER0);

			btrfs_dio_done_with_out(&uv, NULL);
		}
	}
fail:
	/* temp input pages only; nothing to dirty */
	btrfs_dio_release_bios(extcb, 0);
}
+
/* Tear down one extent control block: propagate its error (and the
 * lowest failing file offset) to the diocb, release retry state, bios,
 * unused pinned pages and the extent lock, then free the extcb.
 */
static void btrfs_dio_free_extcb(struct btrfs_dio_extcb *extcb)
{
	if (!extcb->error)
		extcb->error = extcb->shortread;
	if (extcb->error) {
		/* record the earliest failing offset so the whole request
		 * can report how much of the file was read successfully
		 */
		spin_lock_irq(&extcb->diocb->diolock);
		if (extcb->diocb->terminate > extcb->filestart)
			extcb->diocb->terminate = extcb->filestart;
		if (!extcb->diocb->error)
			extcb->diocb->error = extcb->error;
		spin_unlock_irq(&extcb->diocb->diolock);
	}

	btrfs_dio_free_retry(extcb);

	btrfs_dio_release_bios(extcb, 1); /* mark dirty as we just don't know */

	btrfs_dio_release_unused_pages(extcb->active_umc);

	unlock_extent(&BTRFS_I(extcb->diocb->inode)->io_tree, extcb->lockstart,
			extcb->lockend, GFP_NOFS);
	free_extent_map(extcb->em);
	kfree(extcb);
}
+
+static int btrfs_dio_get_workbuf(struct btrfs_dio_extcb *extcb)
+{
+       if (extcb->compressed) {
+               if (!extcb->diocb->workspace) {
+                       struct workspace *workspace;
+                       workspace = find_zlib_workspace();
+                       if (IS_ERR(workspace))
+                               return -ENOMEM;
+                       extcb->diocb->workspace = workspace;
+               }
+               extcb->icb.workspace = extcb->diocb->workspace;
+               extcb->tmpbuf = extcb->icb.workspace->buf;
+       } else {
+               if (!extcb->diocb->csum_buf) {
+                       extcb->diocb->csum_buf = kmalloc(PAGE_SIZE, GFP_NOFS);
+                       if (!extcb->diocb->csum_buf)
+                               return -ENOMEM;
+               }
+               extcb->tmpbuf = extcb->diocb->csum_buf;
+       }
+       extcb->tmpbuf_size = PAGE_SIZE;
+       return 0;
+}
+
+/* on error retries, our work buffers could be released
+ * if not in use for other extcbs, so drop them to be safe
+ */
+static int btrfs_dio_drop_workbuf(struct btrfs_dio_extcb *extcb)
+{
+       extcb->icb.workspace = NULL;
+       extcb->tmpbuf = NULL;
+       extcb->tmpbuf_size = 0;
+       return 0;
+}
+
/* Reap completed extent control blocks: run I/O retries, verify
 * checksums, then decompress or finish each read and free the extcb.
 * Loops until the done list drains; drops the cached zlib workspace
 * before returning.
 */
static void btrfs_dio_complete_bios(struct btrfs_diocb *diocb)
{
	struct btrfs_dio_extcb *extcb;

	do {
		/* pop one finished extcb off the done list */
		spin_lock_irq(&diocb->diolock);
		extcb = diocb->done_extcbs;
		if (extcb) {
			diocb->done_extcbs = extcb->next;
			diocb->pending_extcbs--;
			extcb->next = NULL;
		}

		spin_unlock_irq(&diocb->diolock);

		if (extcb) {
			int err2 = extcb->error;

			/* when another I/O failed with a file offset
			 * less than our own, no reason to do anything.
			 */
			if (diocb->terminate < extcb->filestart) {
				btrfs_dio_free_retry(extcb);
				err2 = -EIO;
			} else if (err2 || extcb->retry_bio)
				err2 = btrfs_dio_read_retry(extcb);

			/* wait for io/csum retry we just started to finish */
			if (extcb->retry_bio)
				continue;

			if (!err2)
				err2 = btrfs_dio_get_workbuf(extcb);

			/* verify data checksums unless the inode opts out */
			if (!err2 && !(BTRFS_I(diocb->inode)->flags
					& BTRFS_INODE_NODATASUM)) {
				err2 = btrfs_dio_read_csum(extcb);
				if (extcb->retry_bio) {
					btrfs_dio_drop_workbuf(extcb);
					continue; /* trying another copy */
				}
			}

			if (!err2) {
				/* rewind the ordered-bio cursor before
				 * consuming the data
				 */
				btrfs_dio_reset_next_in(extcb);
				if (extcb->compressed)
					btrfs_dio_decompress(extcb);
				else
					btrfs_dio_read_done(extcb);
			}

			if (err2)
				extcb->error = err2;
			btrfs_dio_free_extcb(extcb);
			cond_resched();
		}
	} while (extcb);

	/* release large zlib memory until we run again */
	if (diocb->workspace) {
		free_workspace(diocb->workspace);
		diocb->workspace = NULL;
	}
}
+
+static int btrfs_dio_new_bio(struct btrfs_dio_extcb *extcb, int dvn)
+{
+       int vecs = bio_get_nr_vecs(btrfs_map_stripe_bdev(extcb->em, dvn));
+
+       extcb->diodev[dvn].bio = bio_alloc(GFP_NOFS, vecs);
+       if (extcb->diodev[dvn].bio == NULL)
+               return -ENOMEM;
+
+       extcb->diodev[dvn].vecs = vecs;
+       extcb->diodev[dvn].bio->bi_bdev = btrfs_map_stripe_bdev(extcb->em, dvn);
+       extcb->diodev[dvn].bio->bi_sector = extcb->diodev[dvn].physical >> 9;
+       extcb->diodev[dvn].bio->bi_private = extcb;
+       extcb->diodev[dvn].bio->bi_end_io = &btrfs_dio_bi_end_io;
+
+       return 0;
+}
+
/* Submit the pending bio for stripe device dvn (if any).  An empty bio
 * is just freed.  pending_bios is bumped under diolock so completion
 * accounting sees the submission; the temporary bio_get keeps the bio
 * valid across submit_bio before we drop our reference.
 */
static void btrfs_dio_submit_bio(struct btrfs_dio_extcb *extcb, int dvn)
{
	if (!extcb->diodev[dvn].bio)
		return;
	extcb->diodev[dvn].vecs = 0;
	if (!extcb->diodev[dvn].bio->bi_vcnt) {
		/* nothing was added; discard instead of submitting */
		bio_put(extcb->diodev[dvn].bio);
		extcb->diodev[dvn].bio = NULL;
		return;
	}
	spin_lock_irq(&extcb->diocb->diolock);
	extcb->pending_bios++;
	spin_unlock_irq(&extcb->diocb->diolock);

	bio_get(extcb->diodev[dvn].bio);
	submit_bio(extcb->diocb->rw, extcb->diodev[dvn].bio);
	bio_put(extcb->diodev[dvn].bio);
	extcb->diodev[dvn].bio = NULL;
	/* remember this device needs an unplug later */
	extcb->diodev[dvn].unplug++;
}
+
/* pin user pages and add to current bio until either
 * bio is full or device read/write length remaining is 0.
 * spans memory segments in multiple io vectors that can
 * begin and end on non-page (but sector-size aligned) boundaries.
 */
static int btrfs_dio_add_user_pages(u64 *dev_left, struct btrfs_dio_extcb *extcb,
				int dvn)
{
	extcb->active_umc->todo = *dev_left;
	while (extcb->diodev[dvn].vecs && *dev_left) {
		struct bio_vec uv;

		int err = btrfs_dio_get_user_bvec(&uv, extcb->active_umc);
		if (err)
			return err;

		if (!bio_add_page(extcb->diodev[dvn].bio, uv.bv_page,
				uv.bv_len, uv.bv_offset)) {
			/* bio refused the page: give the bvec back and
			 * signal the caller to start a new bio
			 */
			btrfs_dio_put_user_bvec(&uv, extcb->active_umc);
			extcb->diodev[dvn].vecs = 0;
			return 0;
		}
		extcb->iolen += uv.bv_len;
		extcb->diodev[dvn].physical += uv.bv_len;
		*dev_left -= uv.bv_len;
		extcb->diodev[dvn].vecs--;
	}
	return 0;
}
+
/* submit kernel temporary pages for compressed read.
 * csum_pg1 is updated each time so it ends up pointing at the LAST
 * temp page added, which is where a checksum tail would live.
 */
static int btrfs_dio_add_temp_pages(u64 *dev_left, struct btrfs_dio_extcb *extcb,
				int dvn)
{
	while (extcb->diodev[dvn].vecs && *dev_left) {
		unsigned int pglen = min_t(long, *dev_left, PAGE_SIZE);
		struct page *page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);

		if (!page)
			return -ENOMEM;
		if (!bio_add_page(extcb->diodev[dvn].bio, page, pglen, 0)) {
			/* bio full: drop the unused page, caller retries
			 * with a fresh bio
			 */
			extcb->diodev[dvn].vecs = 0;
			page_cache_release(page);
			return 0;
		}
		extcb->csum_pg1 = page;
		extcb->iolen += pglen;
		extcb->diodev[dvn].physical += pglen;
		*dev_left -= pglen;
		extcb->diodev[dvn].vecs--;
	}

	return 0;
}
+
/* Satisfy a read over a file hole by zero-filling hole_len bytes of the
 * user buffer, then unlock and advance past the hole's extent range.
 * Note the full hole range is unlocked even on a partial failure.
 */
static int btrfs_dio_hole_read(struct btrfs_diocb *diocb, u64 hole_len)
{
	int err = 0;
	diocb->umc.todo = hole_len;
	while (diocb->umc.todo) {
		struct bio_vec uv;
		char *out;

		err = btrfs_dio_get_user_bvec(&uv, &diocb->umc);
		if (err)
			goto fail;
		diocb->start += uv.bv_len;
		out = kmap_atomic(uv.bv_page, KM_USER0);
		memset(out + uv.bv_offset, 0, uv.bv_len);
		kunmap_atomic(out, KM_USER0);

		btrfs_dio_done_with_out(&uv, NULL);
	}
fail:
	unlock_extent(&BTRFS_I(diocb->inode)->io_tree, diocb->lockstart,
			diocb->lockstart + hole_len - 1, GFP_NOFS);
	diocb->lockstart += hole_len;
	return err;
}
+
+static int btrfs_dio_inline_read(struct btrfs_diocb *diocb, u64 *data_len)
+{
+       int err;
+       size_t size;
+       size_t extent_offset;
+       u64 extent_start;
+       u64 objectid = diocb->inode->i_ino;
+       struct btrfs_root *root = BTRFS_I(diocb->inode)->root;
+       struct btrfs_path *path;
+       struct btrfs_file_extent_item *item;
+       struct extent_buffer *leaf;
+       struct btrfs_key found_key;
+
+       path = btrfs_alloc_path();
+
+       err = btrfs_lookup_file_extent(NULL, root, path, objectid, 
diocb->start, 0);
+       if (err) {
+               if (err < 0)
+                       goto notfound;
+               err= -EDOM;
+               if (path->slots[0] == 0) {
+               printk(KERN_ERR "btrfs directIO inline extent leaf not found 
ino %lu\n",
+                               diocb->inode->i_ino);
+                       goto fail;
+               }
+               path->slots[0]--;
+       }
+
+       leaf = path->nodes[0];
+       item = btrfs_item_ptr(leaf, path->slots[0],
+                             struct btrfs_file_extent_item);
+       btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+       if (found_key.objectid != objectid ||
+               btrfs_key_type(&found_key) != BTRFS_EXTENT_DATA_KEY ||
+               btrfs_file_extent_type(leaf, item) != BTRFS_FILE_EXTENT_INLINE) 
{
+               printk(KERN_ERR "btrfs directIO inline extent leaf mismatch ino 
%lu\n",
+                               diocb->inode->i_ino);
+               err= -EDOM;
+               goto fail;
+       }
+
+       extent_start = found_key.offset;
+       /* uncompressed size */
+       size = btrfs_file_extent_inline_len(leaf, item);
+       if (diocb->start < extent_start) {
+               printk(KERN_ERR "btrfs directIO inline extent range mismatch 
ino %lu"
+                       " fpos %lld found start %lld size %ld\n",
+                       diocb->inode->i_ino,diocb->start,extent_start,size);
+               err= -EDOM;
+               goto fail;
+       }
+
+       /* we can end here when we start in an implied hole on a larger file */
+       if (diocb->start >= extent_start + size) {
+               *data_len = 0;
+               err = 0;
+               goto fail;
+       }
+
+       extent_offset = diocb->start - extent_start;
+       size = min_t(u64, *data_len, size - extent_offset);
+
+       size = min_t(u64, *data_len, size);
+       *data_len = size;
+
+       if (btrfs_file_extent_compression(leaf, item) ==
+                                               BTRFS_COMPRESS_ZLIB) {
+               struct btrfs_dio_extcb *extcb;
+
+               extcb = kzalloc(sizeof(*extcb), GFP_NOFS);
+               if (!extcb) {
+                       err = -ENOMEM;
+                       goto fail;
+               }
+
+               extcb->diocb = diocb;
+               extcb->compressed = 1;
+
+               extcb->active_umc = &extcb->umc;
+               extcb->umc.gup_max = GUP_IOSUBMIT_MAX;
+               extcb->umc.pagelist = diocb->gup_iosubmit_pages;
+               extcb->umc.work_iov = diocb->umc.work_iov;
+               extcb->umc.user_iov = diocb->umc.user_iov;
+               extcb->umc.remaining = diocb->umc.remaining;
+               extcb->umc.todo = size;
+
+               extcb->iostart = btrfs_file_extent_inline_start(item);
+               extcb->iolen = btrfs_file_extent_inline_item_len(leaf,
+                                       btrfs_item_nr(leaf, path->slots[0]));
+
+               extcb->icb.out_start = extent_offset;
+               extcb->icb.out_len = size;
+               extcb->icb.get_next_in = btrfs_dio_inline_next_in;
+               extcb->icb.get_next_out = btrfs_dio_get_next_out;
+               extcb->icb.done_with_out = btrfs_dio_done_with_out;
+               /* NULL icb.workspace so btrfs_zlib_inflate allocates workspace 
*/
+
+               extcb->leaf = leaf;
+
+               err = btrfs_zlib_inflate(&extcb->icb);
+               /* all or nothing as we can't trust partial inflate */
+               if (!err)
+                       diocb->start += size;
+
+               /* we allow extents after inline if a hole follows */
+               diocb->umc.work_iov = extcb->umc.work_iov;
+               diocb->umc.user_iov = extcb->umc.user_iov;
+               diocb->umc.remaining = extcb->umc.remaining;
+
+               kfree(extcb);
+       } else {
+               unsigned long inline_start;
+               inline_start = btrfs_file_extent_inline_start(item)
+                               + extent_offset;
+               diocb->umc.todo = size;
+               while (diocb->umc.todo) {
+                       struct bio_vec uv;
+                       char *out;
+
+                       err = btrfs_dio_get_user_bvec(&uv, &diocb->umc);
+                       if (err)
+                               goto fail;
+                       diocb->start += uv.bv_len;
+                       out = kmap_atomic(uv.bv_page, KM_USER1);
+                       read_extent_buffer(leaf, out + uv.bv_offset,
+                                       inline_start, uv.bv_len);
+                       inline_start += uv.bv_len;
+                       kunmap_atomic(out, KM_USER1);
+
+                       btrfs_dio_done_with_out(&uv, NULL);
+               }
+       }
+
+fail:
+       btrfs_release_path(root, path);
+notfound:
+       btrfs_free_path(path);
+       if (!err && *data_len) {
+               unlock_extent(&BTRFS_I(diocb->inode)->io_tree, diocb->lockstart,
+                               diocb->lockstart + *data_len - 1, GFP_NOFS);
+               diocb->lockstart += *data_len;
+       }
+       return err;
+}
+
/* verify disk data checksums for extent read.
 * complexity is user memory addresses may not be
 * aligned with our checksummed logical disk blocks.
 *
 * this changes extcb->filestart for uncompressed extents
 * to identify where good data ends on a partial success.
 *
 * returns 0 on success (or after a retry bio has been queued),
 * negative errno on csum lookup failure or unrecoverable mismatch.
 */
static int btrfs_dio_read_csum(struct btrfs_dio_extcb *extcb)
{
	struct bio_vec ivec;
	struct btrfs_root *root =
		BTRFS_I(extcb->diocb->inode)->root->fs_info->csum_root;
	/* bytes of file i/o covered by one tmpbuf full of checksums */
	u32 iolen_per_csum_buf = extcb->diocb->blocksize * (extcb->tmpbuf_size
		/ btrfs_super_csum_size(&root->fs_info->super_copy));

	/* checksums cover whole fs blocks; drop any unaligned tail */
	if (extcb->iolen & (extcb->diocb->blocksize - 1)) {
		printk(KERN_WARNING "btrfs directIO unaligned checksum for ino %lu\n",
				extcb->diocb->inode->i_ino);
		extcb->iolen &= ~(extcb->diocb->blocksize - 1);
	}

	/* bv_len == 0 makes the inner loop fetch the first input vector */
	ivec.bv_len = 0;
	while (extcb->iolen) {
		/* process at most one tmpbuf of on-disk csums per pass */
		u64 len = min(extcb->iolen, iolen_per_csum_buf);
		u64 end = extcb->iostart + len - 1;
		u32 *fs_csum = (u32 *)extcb->tmpbuf;
		u32 csum;
		int err;

		err = btrfs_lookup_csums_range(root, extcb->iostart, end,
				NULL, fs_csum);
		if (err) {
			printk(KERN_ERR "btrfs directIO csum lookup failed ino %lu "
				"extent start %llu end %llu\n",
				extcb->diocb->inode->i_ino, extcb->iostart, end);
			return err;
		}

		/* walk the buffered csums, one filesystem block each */
		while (len) {
			size_t csum_len = extcb->diocb->blocksize;

			/* each checksum block is a filesystem block and on the
			 * same device, but user memory can be 512 byte aligned
			 * so we have to be able to span multiple pages here
			 */
			csum = ~(u32)0;
			while (csum_len) {
				char *in;
				size_t cl;

				if (ivec.bv_len == 0)
					btrfs_dio_get_next_in(&ivec, extcb);
				cl = min_t(size_t, ivec.bv_len, csum_len);
				in = kmap_atomic(ivec.bv_page, KM_USER0);
				csum = btrfs_csum_data(root, in + ivec.bv_offset,
						csum, cl);
				kunmap_atomic(in, KM_USER0);
				ivec.bv_offset += cl;
				ivec.bv_len -= cl;
				csum_len -= cl;
			}

			btrfs_csum_final(csum, (char *)&csum);
			if (csum != *fs_csum) {
				printk(KERN_WARNING "btrfs directIO csum failed ino %lu "
					"block %llu csum %u wanted %u\n",
					extcb->diocb->inode->i_ino,
					extcb->iostart, csum, *fs_csum);
				/* give up if partial read failure or
				 * missing checksum from btree lookup
				 */
				if (extcb->shortread || *fs_csum == 0)
					return -EIO;
				/* remember the expected csum so the retry
				 * path can re-verify the mirror's data
				 */
				extcb->retry_csum = *fs_csum;
				extcb->retry_start = extcb->iostart;
				extcb->retry_mirror = 0;
				extcb->retry_len = extcb->diocb->blocksize;

				/* need to give back vector remaining
				 * length and the length of checksum block
				 * so we are at correct input spot for retry
				 */
				ivec.bv_len += extcb->diocb->blocksize;
				btrfs_dio_put_next_in(&ivec, extcb);
				return btrfs_dio_retry_block(extcb);
			}

			/* block verified: advance disk position ... */
			extcb->iostart += extcb->diocb->blocksize;
			extcb->iolen -= extcb->diocb->blocksize;
			/* ... and, for uncompressed extents, the file
			 * position marking how much user data is good
			 */
			if (!extcb->compressed) {
				if (!extcb->iolen && extcb->filetail) {
					/* last block only partially used */
					extcb->filestart += extcb->filetail;
				} else {
					extcb->filestart += extcb->diocb->blocksize;
					/* 1st extent can start inside block */
					extcb->filestart &= ~(extcb->diocb->blocksize -1);
				}
			}
			len -= extcb->diocb->blocksize;
			fs_csum++;
			cond_resched();
		}
	}
	return 0;
}
+
+static void btrfs_dio_free_retry(struct btrfs_dio_extcb *extcb)
+{
+       if (!extcb->retry_bio)
+               return;
+
+       /* we only allocate temp pages for uncompressed retries */
+       if (!extcb->compressed) {
+               struct bio_vec *bvec = extcb->retry_bio->bi_io_vec;
+               int pn;
+
+               for (pn = 0; pn < extcb->retry_bio->bi_vcnt; pn++)
+                       page_cache_release(bvec[pn].bv_page);
+       }
+       bio_put(extcb->retry_bio);
+       extcb->retry_bio = NULL;
+}
+
+/* reads exactly one filesystem block into temp page(s) for
+ * retry on bio/checksum error.  blocksize and temp pages
+ * guarentee we don't have sector size issues between mirrors
+ * and are not failing checksum from user overwriting memory.
+ * if it works, we will memcopy the new data to user memory.
+ */
+static int btrfs_dio_retry_block(struct btrfs_dio_extcb *extcb)
+{
+       struct btrfs_stripe_info stripe_info;
+       u64 len = extcb->diocb->blocksize;
+       u64 physical;
+       struct backing_dev_info *bdi;
+       int pages = ALIGN(len, PAGE_SIZE) / PAGE_SIZE;
+
+       btrfs_dio_free_retry(extcb);
+       extcb->retry_mirror++;
+       if (extcb->retry_mirror > btrfs_map_num_copies(extcb->em)) {
+               u32 good = extcb->retry_start -
+                               min(extcb->retry_start, extcb->iostart);
+               /* csum retry ends here as always !good */
+               if (extcb->compressed || !good)
+                       return -EIO;
+               /* no checksum, return partial success of i/o from device */
+               if (BTRFS_I(extcb->diocb->inode)->flags & 
BTRFS_INODE_NODATASUM) {
+                       extcb->filestart += good;
+                       return -EIO;
+               }
+               /* limit checksum test to valid read length */
+               extcb->iolen = good;
+               extcb->filetail = 0;
+               extcb->shortread = -EIO;
+               btrfs_dio_reset_next_in(extcb);
+               return 0;
+       }
+
+       extcb->retry_bio = bio_alloc(GFP_NOFS, pages);
+       if (extcb->retry_bio == NULL)
+               return -ENOMEM;
+
+       btrfs_map_to_stripe(extcb->em, READ, extcb->retry_mirror,
+                       extcb->retry_start, &len, &stripe_info);
+       physical = stripe_info.phys_offset +
+               btrfs_map_stripe_physical(extcb->em, stripe_info.stripe_index);
+       extcb->retry_bio->bi_sector = physical >> 9;
+       extcb->retry_bio->bi_bdev =
+               btrfs_map_stripe_bdev(extcb->em, stripe_info.stripe_index);
+       extcb->retry_bio->bi_private = extcb;
+       extcb->retry_bio->bi_end_io = &btrfs_dio_bi_end_io;
+       bdi = blk_get_backing_dev_info(extcb->retry_bio->bi_bdev);
+
+       while (len) {
+               unsigned int pglen = min_t(long, len, PAGE_SIZE);
+               struct page *page;
+
+               /* compressed read bios use temp pages, reuse them */
+               if (extcb->compressed)
+                       page = extcb->order[extcb->bo_now]->
+                                       bi_io_vec[extcb->bo_bvn].bv_page;
+               else
+                       page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+
+               if (!bio_add_page(extcb->retry_bio, page, pglen, 0)) {
+                       if (!extcb->compressed)
+                               page_cache_release(page);
+                       return -EIO;
+               }
+               len -= pglen;
+               if (len && extcb->compressed)
+                       extcb->bo_bvn++;
+       }
+
+       spin_lock_irq(&extcb->diocb->diolock);
+       extcb->pending_bios++;
+       extcb->diocb->pending_extcbs++;
+       spin_unlock_irq(&extcb->diocb->diolock);
+       bio_get(extcb->retry_bio);
+       submit_bio(extcb->diocb->rw, extcb->retry_bio);
+       bio_put(extcb->retry_bio);
+       if (bdi && bdi->unplug_io_fn)
+               bdi->unplug_io_fn(bdi, NULL);
+       return 0;
+}
+
+/* scan forward in file order looking for next bio that failed */
+static int btrfs_dio_bad_bio_scan(struct btrfs_dio_extcb *extcb)
+{
+       for ( ; extcb->bo_now < extcb->bo_used; extcb->bo_now++) {
+               struct bio *bio = extcb->order[extcb->bo_now];
+               int vn;
+
+               extcb->retry_len = 0;
+               for (vn = 0; vn < bio->bi_vcnt; vn++)
+                       extcb->retry_len += bio->bi_io_vec[vn].bv_len;
+
+               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+                       extcb->bo_bvn = 0;
+                       extcb->bo_frag = 0;
+                       return btrfs_dio_retry_block(extcb);
+               }
+
+               extcb->retry_start += extcb->retry_len;
+       }
+
+       /* if we get here, it must all be good */
+       btrfs_dio_reset_next_in(extcb);
+       extcb->error = 0;
+       return 0;
+}
+
/* continue retry processing for a read extent: called first to kick
 * off the scan over failed bios, then again each time a single-block
 * retry bio completes.  verifies the retried block (against the saved
 * checksum when there is one), copies it into user memory for
 * uncompressed extents, and moves on to the next bad block/bio.
 * returns 0 when recovery is done or another retry was queued,
 * negative errno when recovery fails.
 */
static int btrfs_dio_read_retry(struct btrfs_dio_extcb *extcb)
{
	/* begin with first I/O error from bios sent by initial extent submit */
	if (!extcb->retry_bio) {
		extcb->retry_start = extcb->iostart;
		extcb->retry_mirror = 0;
		return btrfs_dio_bad_bio_scan(extcb);
	}

	/* we already sent a block retry and are now checking it */
	if (!test_bit(BIO_UPTODATE, &extcb->retry_bio->bi_flags))
		return btrfs_dio_retry_block(extcb);

	extcb->error = 0;

	/* retry_csum != 0 means this retry came from a checksum mismatch,
	 * so re-verify the mirror's data before trusting it
	 */
	if (extcb->retry_csum) {
		struct btrfs_root *root = BTRFS_I(extcb->diocb->inode)->
					root->fs_info->csum_root;
		struct bio_vec *retry = extcb->retry_bio->bi_io_vec;
		char *new;
		u32 csum = ~0;
		size_t csum_len = extcb->retry_len;

		/* blocksize can exceed page size */
		while (csum_len) {
			size_t cl = min_t(size_t, retry->bv_len, csum_len);
			/* temp pages start at offset 0, so csum from page base */
			new = kmap_atomic(retry->bv_page, KM_USER0);
			csum = btrfs_csum_data(root, new, csum, cl);
			kunmap_atomic(new, KM_USER0);
			retry++;
			csum_len -= cl;
		}
		btrfs_csum_final(csum, (char *)&csum);
		/* still bad on this mirror; try the next one */
		if (csum != extcb->retry_csum)
			return btrfs_dio_retry_block(extcb);
	}

	/* compressed extents have temp pages that we read blocks into,
	 * uncompressed extents must be de-blocked into user's pages
	 */
	if (!extcb->compressed) {
		struct bio_vec *retry = extcb->retry_bio->bi_io_vec;
		struct bio_vec bad;
		size_t bad_len = min(extcb->retry_len, extcb->diocb->blocksize);
		size_t offset;

		/* user file position can start inside logical block */
		offset = extcb->retry_start & (extcb->diocb->blocksize-1);
		retry->bv_offset += offset;
		retry->bv_len -= offset;

		/* bv_len == 0 makes the loop fetch the first user vector */
		bad.bv_len = 0;
		while (bad_len) {
			size_t cl;
			char *new;
			char *out;

			if (bad.bv_len == 0)
				btrfs_dio_get_next_in(&bad, extcb);
			/* copy the largest run valid in both vectors */
			cl = min_t(size_t, bad_len, min(bad.bv_len, retry->bv_len));
			new = kmap_atomic(retry->bv_page, KM_USER0);
			out = kmap_atomic(bad.bv_page, KM_USER1);
			memcpy(out + bad.bv_offset, new + retry->bv_offset, cl);
			kunmap_atomic(out, KM_USER1);
			kunmap_atomic(new, KM_USER0);

			retry->bv_offset += cl;
			retry->bv_len -= cl;
			if (!retry->bv_len)
				retry++;
			bad.bv_offset += cl;
			bad.bv_len -= cl;
			bad_len -= cl;
		}

		/* record unfinished part of unaligned user memory for next retry */
		btrfs_dio_put_next_in(&bad, extcb);
	}

	btrfs_dio_free_retry(extcb);

	/* checksum-mismatch retry succeeded: advance positions exactly as
	 * btrfs_dio_read_csum does for a verified block, then return to it
	 */
	if (extcb->retry_csum) {
		extcb->iostart += extcb->diocb->blocksize;
		extcb->iolen -= extcb->diocb->blocksize;
		if (!extcb->compressed) {
			if (!extcb->iolen && extcb->filetail) {
				extcb->filestart += extcb->filetail;
			} else {
				extcb->filestart += extcb->diocb->blocksize;
				/* 1st extent can start inside block */
				extcb->filestart &= ~(extcb->diocb->blocksize -1);
			}
		}
		return 0;
	}

	/* we are still processing bad bios from I/O submit */
	extcb->retry_start += extcb->diocb->blocksize;
	extcb->retry_mirror = 0;

	/* do we have any more blocks to do in this bio */
	extcb->retry_len -= extcb->diocb->blocksize;
	if (extcb->retry_len)
		return btrfs_dio_retry_block(extcb);

	/* continue scan with next bio */
	if (extcb->compressed) /* uncompressed copy already incremented bo_now */
		extcb->bo_now++;
	return btrfs_dio_bad_bio_scan(extcb);
}
-- 
1.5.6.3
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to