Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux...
[~shefty/rdma-dev.git] / fs / btrfs / extent_io.c
index 2862454bcdb32fb78b435b1806d5db297a65f774..8d904dd7ea9f6175012d8abdcf810a7d8d200ae7 100644 (file)
@@ -19,6 +19,7 @@
 #include "btrfs_inode.h"
 #include "volumes.h"
 #include "check-integrity.h"
+#include "locking.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -53,6 +54,13 @@ struct extent_page_data {
        unsigned int sync_io:1;
 };
 
+static noinline void flush_write_bio(void *data);
+static inline struct btrfs_fs_info *
+tree_fs_info(struct extent_io_tree *tree)
+{
+       return btrfs_sb(tree->mapping->host->i_sb);
+}
+
 int __init extent_io_init(void)
 {
        extent_state_cache = kmem_cache_create("extent_state",
@@ -136,6 +144,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
 #endif
        atomic_set(&state->refs, 1);
        init_waitqueue_head(&state->wq);
+       trace_alloc_extent_state(state, mask, _RET_IP_);
        return state;
 }
 
@@ -153,6 +162,7 @@ void free_extent_state(struct extent_state *state)
                list_del(&state->leak_list);
                spin_unlock_irqrestore(&leak_lock, flags);
 #endif
+               trace_free_extent_state(state, _RET_IP_);
                kmem_cache_free(extent_state_cache, state);
        }
 }
@@ -439,6 +449,13 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
        return prealloc;
 }
 
+void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+       btrfs_panic(tree_fs_info(tree), err, "Locking error: "
+                   "Extent tree was modified by another "
+                   "thread while locked.");
+}
+
 /*
  * clear some bits on a range in the tree.  This may require splitting
  * or inserting elements in the tree, so the gfp mask is used to
@@ -449,8 +466,7 @@ alloc_extent_state_atomic(struct extent_state *prealloc)
  *
  * the range [start, end] is inclusive.
  *
- * This takes the tree lock, and returns < 0 on error, > 0 if any of the
- * bits were already set, or zero if none of the bits were already set.
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
  */
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                     int bits, int wake, int delete,
@@ -464,7 +480,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        struct rb_node *node;
        u64 last_end;
        int err;
-       int set = 0;
        int clear = 0;
 
        if (delete)
@@ -542,12 +557,14 @@ hit_next:
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, start);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
                prealloc = NULL;
                if (err)
                        goto out;
                if (state->end <= end) {
-                       set |= clear_state_bit(tree, state, &bits, wake);
+                       clear_state_bit(tree, state, &bits, wake);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
@@ -564,17 +581,19 @@ hit_next:
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, end + 1);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
                if (wake)
                        wake_up(&state->wq);
 
-               set |= clear_state_bit(tree, prealloc, &bits, wake);
+               clear_state_bit(tree, prealloc, &bits, wake);
 
                prealloc = NULL;
                goto out;
        }
 
-       set |= clear_state_bit(tree, state, &bits, wake);
+       clear_state_bit(tree, state, &bits, wake);
 next:
        if (last_end == (u64)-1)
                goto out;
@@ -591,7 +610,7 @@ out:
        if (prealloc)
                free_extent_state(prealloc);
 
-       return set;
+       return 0;
 
 search_again:
        if (start > end)
@@ -602,8 +621,8 @@ search_again:
        goto again;
 }
 
-static int wait_on_state(struct extent_io_tree *tree,
-                        struct extent_state *state)
+static void wait_on_state(struct extent_io_tree *tree,
+                         struct extent_state *state)
                __releases(tree->lock)
                __acquires(tree->lock)
 {
@@ -613,7 +632,6 @@ static int wait_on_state(struct extent_io_tree *tree,
        schedule();
        spin_lock(&tree->lock);
        finish_wait(&state->wq, &wait);
-       return 0;
 }
 
 /*
@@ -621,7 +639,7 @@ static int wait_on_state(struct extent_io_tree *tree,
  * The range [start, end] is inclusive.
  * The tree lock is taken by this function
  */
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
 {
        struct extent_state *state;
        struct rb_node *node;
@@ -658,7 +676,6 @@ again:
        }
 out:
        spin_unlock(&tree->lock);
-       return 0;
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
@@ -706,9 +723,10 @@ static void uncache_state(struct extent_state **cached_ptr)
  * [start, end] is inclusive This takes the tree lock.
  */
 
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                  int bits, int exclusive_bits, u64 *failed_start,
-                  struct extent_state **cached_state, gfp_t mask)
+static int __must_check
+__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                int bits, int exclusive_bits, u64 *failed_start,
+                struct extent_state **cached_state, gfp_t mask)
 {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
@@ -742,8 +760,10 @@ again:
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = insert_state(tree, prealloc, start, end, &bits);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
                prealloc = NULL;
-               BUG_ON(err == -EEXIST);
                goto out;
        }
        state = rb_entry(node, struct extent_state, rb_node);
@@ -809,7 +829,9 @@ hit_next:
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, start);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
                prealloc = NULL;
                if (err)
                        goto out;
@@ -846,12 +868,9 @@ hit_next:
                 */
                err = insert_state(tree, prealloc, start, this_end,
                                   &bits);
-               BUG_ON(err == -EEXIST);
-               if (err) {
-                       free_extent_state(prealloc);
-                       prealloc = NULL;
-                       goto out;
-               }
+               if (err)
+                       extent_io_tree_panic(tree, err);
+
                cache_state(prealloc, cached_state);
                prealloc = NULL;
                start = this_end + 1;
@@ -873,7 +892,8 @@ hit_next:
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = split_state(tree, state, prealloc, end + 1);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
 
                set_state_bits(tree, prealloc, &bits);
                cache_state(prealloc, cached_state);
@@ -900,6 +920,15 @@ search_again:
        goto again;
 }
 
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
+                  u64 *failed_start, struct extent_state **cached_state,
+                  gfp_t mask)
+{
+       return __set_extent_bit(tree, start, end, bits, 0, failed_start,
+                               cached_state, mask);
+}
+
+
 /**
  * convert_extent - convert all bits in a given range from one bit to another
  * @tree:      the io tree to search
@@ -946,7 +975,8 @@ again:
                }
                err = insert_state(tree, prealloc, start, end, &bits);
                prealloc = NULL;
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
                goto out;
        }
        state = rb_entry(node, struct extent_state, rb_node);
@@ -1002,7 +1032,8 @@ hit_next:
                        goto out;
                }
                err = split_state(tree, state, prealloc, start);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
                prealloc = NULL;
                if (err)
                        goto out;
@@ -1041,12 +1072,8 @@ hit_next:
                 */
                err = insert_state(tree, prealloc, start, this_end,
                                   &bits);
-               BUG_ON(err == -EEXIST);
-               if (err) {
-                       free_extent_state(prealloc);
-                       prealloc = NULL;
-                       goto out;
-               }
+               if (err)
+                       extent_io_tree_panic(tree, err);
                prealloc = NULL;
                start = this_end + 1;
                goto search_again;
@@ -1065,7 +1092,8 @@ hit_next:
                }
 
                err = split_state(tree, state, prealloc, end + 1);
-               BUG_ON(err == -EEXIST);
+               if (err)
+                       extent_io_tree_panic(tree, err);
 
                set_state_bits(tree, prealloc, &bits);
                clear_state_bit(tree, prealloc, &clear_bits, 0);
@@ -1095,14 +1123,14 @@ search_again:
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
 {
-       return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+       return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
                              NULL, mask);
 }
 
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    int bits, gfp_t mask)
 {
-       return set_extent_bit(tree, start, end, bits, 0, NULL,
+       return set_extent_bit(tree, start, end, bits, NULL,
                              NULL, mask);
 }
 
@@ -1117,7 +1145,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 {
        return set_extent_bit(tree, start, end,
                              EXTENT_DELALLOC | EXTENT_UPTODATE,
-                             0, NULL, cached_state, mask);
+                             NULL, cached_state, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1131,7 +1159,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask)
 {
-       return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+       return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
                              NULL, mask);
 }
 
@@ -1139,7 +1167,7 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask)
 {
        return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
-                             NULL, cached_state, mask);
+                             cached_state, mask);
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -1155,42 +1183,40 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
  * us if waiting is desired.
  */
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-                    int bits, struct extent_state **cached_state, gfp_t mask)
+                    int bits, struct extent_state **cached_state)
 {
        int err;
        u64 failed_start;
        while (1) {
-               err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
-                                    EXTENT_LOCKED, &failed_start,
-                                    cached_state, mask);
-               if (err == -EEXIST && (mask & __GFP_WAIT)) {
+               err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+                                      EXTENT_LOCKED, &failed_start,
+                                      cached_state, GFP_NOFS);
+               if (err == -EEXIST) {
                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
                        start = failed_start;
-               } else {
+               } else
                        break;
-               }
                WARN_ON(start > end);
        }
        return err;
 }
 
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
-       return lock_extent_bits(tree, start, end, 0, NULL, mask);
+       return lock_extent_bits(tree, start, end, 0, NULL);
 }
 
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
-                   gfp_t mask)
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
        int err;
        u64 failed_start;
 
-       err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
-                            &failed_start, NULL, mask);
+       err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+                              &failed_start, NULL, GFP_NOFS);
        if (err == -EEXIST) {
                if (failed_start > start)
                        clear_extent_bit(tree, start, failed_start - 1,
-                                        EXTENT_LOCKED, 1, 0, NULL, mask);
+                                        EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
                return 0;
        }
        return 1;
@@ -1203,10 +1229,10 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
                                mask);
 }
 
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
-                               mask);
+                               GFP_NOFS);
 }
 
 /*
@@ -1220,7 +1246,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 
        while (index <= end_index) {
                page = find_get_page(tree->mapping, index);
-               BUG_ON(!page);
+               BUG_ON(!page); /* Pages should be in the extent_io_tree */
                set_page_writeback(page);
                page_cache_release(page);
                index++;
@@ -1343,9 +1369,9 @@ out:
        return found;
 }
 
-static noinline int __unlock_for_delalloc(struct inode *inode,
-                                         struct page *locked_page,
-                                         u64 start, u64 end)
+static noinline void __unlock_for_delalloc(struct inode *inode,
+                                          struct page *locked_page,
+                                          u64 start, u64 end)
 {
        int ret;
        struct page *pages[16];
@@ -1355,7 +1381,7 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
        int i;
 
        if (index == locked_page->index && end_index == index)
-               return 0;
+               return;
 
        while (nr_pages > 0) {
                ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1370,7 +1396,6 @@ static noinline int __unlock_for_delalloc(struct inode *inode,
                index += ret;
                cond_resched();
        }
-       return 0;
 }
 
 static noinline int lock_delalloc_pages(struct inode *inode,
@@ -1500,11 +1525,10 @@ again:
                        goto out_failed;
                }
        }
-       BUG_ON(ret);
+       BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
 
        /* step three, lock the state bits for the whole range */
-       lock_extent_bits(tree, delalloc_start, delalloc_end,
-                        0, &cached_state, GFP_NOFS);
+       lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
 
        /* then test to make sure it is all still delalloc */
        ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1761,39 +1785,34 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
  * helper function to set a given page up to date if all the
  * extents in the tree for that page are up to date
  */
-static int check_page_uptodate(struct extent_io_tree *tree,
-                              struct page *page)
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 {
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
                SetPageUptodate(page);
-       return 0;
 }
 
 /*
  * helper function to unlock a page if all the extents in the tree
  * for that page are unlocked
  */
-static int check_page_locked(struct extent_io_tree *tree,
-                            struct page *page)
+static void check_page_locked(struct extent_io_tree *tree, struct page *page)
 {
        u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
        u64 end = start + PAGE_CACHE_SIZE - 1;
        if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
                unlock_page(page);
-       return 0;
 }
 
 /*
  * helper function to end page writeback if all the extents
  * in the tree for that page are done with writeback
  */
-static int check_page_writeback(struct extent_io_tree *tree,
-                            struct page *page)
+static void check_page_writeback(struct extent_io_tree *tree,
+                                struct page *page)
 {
        end_page_writeback(page);
-       return 0;
 }
 
 /*
@@ -1912,6 +1931,26 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
        return 0;
 }
 
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+                        int mirror_num)
+{
+       struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+       u64 start = eb->start;
+       unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+       int ret;
+
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+               ret = repair_io_failure(map_tree, start, PAGE_CACHE_SIZE,
+                                       start, p, mirror_num);
+               if (ret)
+                       break;
+               start += PAGE_CACHE_SIZE;
+       }
+
+       return ret;
+}
+
 /*
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
@@ -2258,6 +2297,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
        u64 start;
        u64 end;
        int whole_page;
+       int failed_mirror;
        int ret;
 
        if (err)
@@ -2304,9 +2344,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                        else
                                clean_io_failure(start, page);
                }
-               if (!uptodate) {
-                       int failed_mirror;
+
+               if (!uptodate)
                        failed_mirror = (int)(unsigned long)bio->bi_bdev;
+
+               if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
+                       ret = tree->ops->readpage_io_failed_hook(page, failed_mirror);
+                       if (!ret && !err &&
+                           test_bit(BIO_UPTODATE, &bio->bi_flags))
+                               uptodate = 1;
+               } else if (!uptodate) {
                        /*
                         * The generic bio_readpage_error handles errors the
                         * following way: If possible, new read requests are
@@ -2320,7 +2367,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                        ret = bio_readpage_error(bio, page, start, end,
                                                        failed_mirror, NULL);
                        if (ret == 0) {
-error_handled:
                                uptodate =
                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
                                if (err)
@@ -2328,16 +2374,9 @@ error_handled:
                                uncache_state(&cached);
                                continue;
                        }
-                       if (tree->ops && tree->ops->readpage_io_failed_hook) {
-                               ret = tree->ops->readpage_io_failed_hook(
-                                                       bio, page, start, end,
-                                                       failed_mirror, state);
-                               if (ret == 0)
-                                       goto error_handled;
-                       }
                }
 
-               if (uptodate) {
+               if (uptodate && tree->track_uptodate) {
                        set_extent_uptodate(tree, start, end, &cached,
                                            GFP_ATOMIC);
                }
@@ -2386,8 +2425,12 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
        return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
-                         unsigned long bio_flags)
+/*
+ * Since writes are async, they will only return -ENOMEM.
+ * Reads can return the full range of I/O error conditions.
+ */
+static int __must_check submit_one_bio(int rw, struct bio *bio,
+                                      int mirror_num, unsigned long bio_flags)
 {
        int ret = 0;
        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -2413,6 +2456,19 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
        return ret;
 }
 
+static int merge_bio(struct extent_io_tree *tree, struct page *page,
+                    unsigned long offset, size_t size, struct bio *bio,
+                    unsigned long bio_flags)
+{
+       int ret = 0;
+       if (tree->ops && tree->ops->merge_bio_hook)
+               ret = tree->ops->merge_bio_hook(page, offset, size, bio,
+                                               bio_flags);
+       BUG_ON(ret < 0);
+       return ret;
+
+}
+
 static int submit_extent_page(int rw, struct extent_io_tree *tree,
                              struct page *page, sector_t sector,
                              size_t size, unsigned long offset,
@@ -2441,12 +2497,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                                sector;
 
                if (prev_bio_flags != bio_flags || !contig ||
-                   (tree->ops && tree->ops->merge_bio_hook &&
-                    tree->ops->merge_bio_hook(page, offset, page_size, bio,
-                                              bio_flags)) ||
+                   merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
                    bio_add_page(bio, page, page_size, offset) < page_size) {
                        ret = submit_one_bio(rw, bio, mirror_num,
                                             prev_bio_flags);
+                       if (ret < 0)
+                               return ret;
                        bio = NULL;
                } else {
                        return 0;
@@ -2473,25 +2529,31 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
        return ret;
 }
 
-void set_page_extent_mapped(struct page *page)
+void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
 {
        if (!PagePrivate(page)) {
                SetPagePrivate(page);
                page_cache_get(page);
-               set_page_private(page, EXTENT_PAGE_PRIVATE);
+               set_page_private(page, (unsigned long)eb);
+       } else {
+               WARN_ON(page->private != (unsigned long)eb);
        }
 }
 
-static void set_page_extent_head(struct page *page, unsigned long len)
+void set_page_extent_mapped(struct page *page)
 {
-       WARN_ON(!PagePrivate(page));
-       set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
+       if (!PagePrivate(page)) {
+               SetPagePrivate(page);
+               page_cache_get(page);
+               set_page_private(page, EXTENT_PAGE_PRIVATE);
+       }
 }
 
 /*
  * basic readpage implementation.  Locked extent state structs are inserted
  * into the tree that are removed when the IO is done (by the end_io
  * handlers)
+ * XXX JDM: This needs looking at to ensure proper page locking
  */
 static int __extent_read_full_page(struct extent_io_tree *tree,
                                   struct page *page,
@@ -2531,11 +2593,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 
        end = page_end;
        while (1) {
-               lock_extent(tree, start, end, GFP_NOFS);
+               lock_extent(tree, start, end);
                ordered = btrfs_lookup_ordered_extent(inode, start);
                if (!ordered)
                        break;
-               unlock_extent(tree, start, end, GFP_NOFS);
+               unlock_extent(tree, start, end);
                btrfs_start_ordered_extent(inode, ordered, 1);
                btrfs_put_ordered_extent(ordered);
        }
@@ -2572,7 +2634,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                                end - cur + 1, 0);
                if (IS_ERR_OR_NULL(em)) {
                        SetPageError(page);
-                       unlock_extent(tree, cur, end, GFP_NOFS);
+                       unlock_extent(tree, cur, end);
                        break;
                }
                extent_offset = cur - em->start;
@@ -2624,7 +2686,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                if (test_range_bit(tree, cur, cur_end,
                                   EXTENT_UPTODATE, 1, NULL)) {
                        check_page_uptodate(tree, page);
-                       unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                       unlock_extent(tree, cur, cur + iosize - 1);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@ -2634,7 +2696,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                 */
                if (block_start == EXTENT_MAP_INLINE) {
                        SetPageError(page);
-                       unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+                       unlock_extent(tree, cur, cur + iosize - 1);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@ -2654,6 +2716,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
                                         end_bio_extent_readpage, mirror_num,
                                         *bio_flags,
                                         this_bio_flag);
+                       BUG_ON(ret == -ENOMEM);
                        nr++;
                        *bio_flags = this_bio_flag;
                }
@@ -2795,7 +2858,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
                                                       delalloc_end,
                                                       &page_started,
                                                       &nr_written);
-                       BUG_ON(ret);
+                       /* File system has been set read-only */
+                       if (ret) {
+                               SetPageError(page);
+                               goto done;
+                       }
                        /*
                         * delalloc_end is already one less than the total
                         * length, so we don't subtract one from
@@ -2968,6 +3035,275 @@ done_unlocked:
        return 0;
 }
 
+static int eb_wait(void *word)
+{
+       io_schedule();
+       return 0;
+}
+
+static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+       wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
+                   TASK_UNINTERRUPTIBLE);
+}
+
+static int lock_extent_buffer_for_io(struct extent_buffer *eb,
+                                    struct btrfs_fs_info *fs_info,
+                                    struct extent_page_data *epd)
+{
+       unsigned long i, num_pages;
+       int flush = 0;
+       int ret = 0;
+
+       if (!btrfs_try_tree_write_lock(eb)) {
+               flush = 1;
+               flush_write_bio(epd);
+               btrfs_tree_lock(eb);
+       }
+
+       if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+               btrfs_tree_unlock(eb);
+               if (!epd->sync_io)
+                       return 0;
+               if (!flush) {
+                       flush_write_bio(epd);
+                       flush = 1;
+               }
+               while (1) {
+                       wait_on_extent_buffer_writeback(eb);
+                       btrfs_tree_lock(eb);
+                       if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+                               break;
+                       btrfs_tree_unlock(eb);
+               }
+       }
+
+       if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+               set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+               btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+               spin_lock(&fs_info->delalloc_lock);
+               if (fs_info->dirty_metadata_bytes >= eb->len)
+                       fs_info->dirty_metadata_bytes -= eb->len;
+               else
+                       WARN_ON(1);
+               spin_unlock(&fs_info->delalloc_lock);
+               ret = 1;
+       }
+
+       btrfs_tree_unlock(eb);
+
+       if (!ret)
+               return ret;
+
+       num_pages = num_extent_pages(eb->start, eb->len);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+
+               if (!trylock_page(p)) {
+                       if (!flush) {
+                               flush_write_bio(epd);
+                               flush = 1;
+                       }
+                       lock_page(p);
+               }
+       }
+
+       return ret;
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+       clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+       smp_mb__after_clear_bit();
+       wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+{
+       int uptodate = err == 0;
+       struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+       struct extent_buffer *eb;
+       int done;
+
+       do {
+               struct page *page = bvec->bv_page;
+
+               bvec--;
+               eb = (struct extent_buffer *)page->private;
+               BUG_ON(!eb);
+               done = atomic_dec_and_test(&eb->io_pages);
+
+               if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+                       set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+                       ClearPageUptodate(page);
+                       SetPageError(page);
+               }
+
+               end_page_writeback(page);
+
+               if (!done)
+                       continue;
+
+               end_extent_buffer_writeback(eb);
+       } while (bvec >= bio->bi_io_vec);
+
+       bio_put(bio);
+
+}
+
+static int write_one_eb(struct extent_buffer *eb,
+                       struct btrfs_fs_info *fs_info,
+                       struct writeback_control *wbc,
+                       struct extent_page_data *epd)
+{
+       struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+       u64 offset = eb->start;
+       unsigned long i, num_pages;
+       int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
+       int ret;
+
+       clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+       num_pages = num_extent_pages(eb->start, eb->len);
+       atomic_set(&eb->io_pages, num_pages);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+
+               clear_page_dirty_for_io(p);
+               set_page_writeback(p);
+               ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
+                                        PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+                                        -1, end_bio_extent_buffer_writepage,
+                                        0, 0, 0);
+               if (ret) {
+                       set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+                       SetPageError(p);
+                       if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+                               end_extent_buffer_writeback(eb);
+                       ret = -EIO;
+                       break;
+               }
+               offset += PAGE_CACHE_SIZE;
+               update_nr_written(p, wbc, 1);
+               unlock_page(p);
+       }
+
+       if (unlikely(ret)) {
+               for (; i < num_pages; i++) {
+                       struct page *p = extent_buffer_page(eb, i);
+                       unlock_page(p);
+               }
+       }
+
+       return ret;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+                                  struct writeback_control *wbc)
+{
+       struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+       struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+       struct extent_buffer *eb, *prev_eb = NULL;
+       struct extent_page_data epd = {
+               .bio = NULL,
+               .tree = tree,
+               .extent_locked = 0,
+               .sync_io = wbc->sync_mode == WB_SYNC_ALL,
+       };
+       int ret = 0;
+       int done = 0;
+       int nr_to_write_done = 0;
+       struct pagevec pvec;
+       int nr_pages;
+       pgoff_t index;
+       pgoff_t end;            /* Inclusive */
+       int scanned = 0;
+       int tag;
+
+       pagevec_init(&pvec, 0);
+       if (wbc->range_cyclic) {
+               index = mapping->writeback_index; /* Start from prev offset */
+               end = -1;
+       } else {
+               index = wbc->range_start >> PAGE_CACHE_SHIFT;
+               end = wbc->range_end >> PAGE_CACHE_SHIFT;
+               scanned = 1;
+       }
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag = PAGECACHE_TAG_TOWRITE;
+       else
+               tag = PAGECACHE_TAG_DIRTY;
+retry:
+       if (wbc->sync_mode == WB_SYNC_ALL)
+               tag_pages_for_writeback(mapping, index, end);
+       while (!done && !nr_to_write_done && (index <= end) &&
+              (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+               unsigned i;
+
+               scanned = 1;
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pvec.pages[i];
+
+                       if (!PagePrivate(page))
+                               continue;
+
+                       if (!wbc->range_cyclic && page->index > end) {
+                               done = 1;
+                               break;
+                       }
+
+                       eb = (struct extent_buffer *)page->private;
+                       if (!eb) {
+                               WARN_ON(1);
+                               continue;
+                       }
+
+                       if (eb == prev_eb)
+                               continue;
+
+                       if (!atomic_inc_not_zero(&eb->refs)) {
+                               WARN_ON(1);
+                               continue;
+                       }
+
+                       prev_eb = eb;
+                       ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+                       if (!ret) {
+                               free_extent_buffer(eb);
+                               continue;
+                       }
+
+                       ret = write_one_eb(eb, fs_info, wbc, &epd);
+                       if (ret) {
+                               done = 1;
+                               free_extent_buffer(eb);
+                               break;
+                       }
+                       free_extent_buffer(eb);
+
+                       /*
+                        * the filesystem may choose to bump up nr_to_write.
+                        * We have to make sure to honor the new nr_to_write
+                        * at any time
+                        */
+                       nr_to_write_done = wbc->nr_to_write <= 0;
+               }
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+       if (!scanned && !done) {
+               /*
+                * We hit the last page and there is more work to be done: wrap
+                * back to the start of the file
+                */
+               scanned = 1;
+               index = 0;
+               goto retry;
+       }
+       flush_write_bio(&epd);
+       return ret;
+}
+
 /**
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
@@ -3099,10 +3435,14 @@ retry:
 static void flush_epd_write_bio(struct extent_page_data *epd)
 {
        if (epd->bio) {
+               int rw = WRITE;
+               int ret;
+
                if (epd->sync_io)
-                       submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
-               else
-                       submit_one_bio(WRITE, epd->bio, 0, 0);
+                       rw = WRITE_SYNC;
+
+               ret = submit_one_bio(rw, epd->bio, 0, 0);
+               BUG_ON(ret < 0); /* -ENOMEM */
                epd->bio = NULL;
        }
 }
@@ -3219,7 +3559,7 @@ int extent_readpages(struct extent_io_tree *tree,
        }
        BUG_ON(!list_empty(pages));
        if (bio)
-               submit_one_bio(READ, bio, 0, bio_flags);
+               return submit_one_bio(READ, bio, 0, bio_flags);
        return 0;
 }
 
@@ -3240,7 +3580,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
        if (start > end)
                return 0;
 
-       lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
+       lock_extent_bits(tree, start, end, 0, &cached_state);
        wait_on_page_writeback(page);
        clear_extent_bit(tree, start, end,
                         EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -3454,7 +3794,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        }
 
        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
-                        &cached_state, GFP_NOFS);
+                        &cached_state);
 
        em = get_extent_skip_holes(inode, start, last_for_get_extent,
                                   get_extent);
@@ -3548,26 +3888,7 @@ out:
 inline struct page *extent_buffer_page(struct extent_buffer *eb,
                                              unsigned long i)
 {
-       struct page *p;
-       struct address_space *mapping;
-
-       if (i == 0)
-               return eb->first_page;
-       i += eb->start >> PAGE_CACHE_SHIFT;
-       mapping = eb->first_page->mapping;
-       if (!mapping)
-               return NULL;
-
-       /*
-        * extent_buffer_page is only called after pinning the page
-        * by increasing the reference count.  So we know the page must
-        * be in the radix tree.
-        */
-       rcu_read_lock();
-       p = radix_tree_lookup(&mapping->page_tree, i);
-       rcu_read_unlock();
-
-       return p;
+       return eb->pages[i];
 }
 
 inline unsigned long num_extent_pages(u64 start, u64 len)
@@ -3576,6 +3897,19 @@ inline unsigned long num_extent_pages(u64 start, u64 len)
                (start >> PAGE_CACHE_SHIFT);
 }
 
+static void __free_extent_buffer(struct extent_buffer *eb)
+{
+#if LEAK_DEBUG
+       unsigned long flags;
+       spin_lock_irqsave(&leak_lock, flags);
+       list_del(&eb->leak_list);
+       spin_unlock_irqrestore(&leak_lock, flags);
+#endif
+       if (eb->pages && eb->pages != eb->inline_pages)
+               kfree(eb->pages);
+       kmem_cache_free(extent_buffer_cache, eb);
+}
+
 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
                                                   u64 start,
                                                   unsigned long len,
@@ -3591,6 +3925,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
                return NULL;
        eb->start = start;
        eb->len = len;
+       eb->tree = tree;
        rwlock_init(&eb->lock);
        atomic_set(&eb->write_locks, 0);
        atomic_set(&eb->read_locks, 0);
@@ -3607,20 +3942,32 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        list_add(&eb->leak_list, &buffers);
        spin_unlock_irqrestore(&leak_lock, flags);
 #endif
+       spin_lock_init(&eb->refs_lock);
        atomic_set(&eb->refs, 1);
+       atomic_set(&eb->io_pages, 0);
+
+       if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
+               struct page **pages;
+               int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
+                       PAGE_CACHE_SHIFT;
+               pages = kzalloc(num_pages, mask);
+               if (!pages) {
+                       __free_extent_buffer(eb);
+                       return NULL;
+               }
+               eb->pages = pages;
+       } else {
+               eb->pages = eb->inline_pages;
+       }
 
        return eb;
 }
 
-static void __free_extent_buffer(struct extent_buffer *eb)
+static int extent_buffer_under_io(struct extent_buffer *eb)
 {
-#if LEAK_DEBUG
-       unsigned long flags;
-       spin_lock_irqsave(&leak_lock, flags);
-       list_del(&eb->leak_list);
-       spin_unlock_irqrestore(&leak_lock, flags);
-#endif
-       kmem_cache_free(extent_buffer_cache, eb);
+       return (atomic_read(&eb->io_pages) ||
+               test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+               test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
 }
 
 /*
@@ -3632,8 +3979,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
        unsigned long index;
        struct page *page;
 
-       if (!eb->first_page)
-               return;
+       BUG_ON(extent_buffer_under_io(eb));
 
        index = num_extent_pages(eb->start, eb->len);
        if (start_idx >= index)
@@ -3642,8 +3988,34 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
        do {
                index--;
                page = extent_buffer_page(eb, index);
-               if (page)
+               if (page) {
+                       spin_lock(&page->mapping->private_lock);
+                       /*
+                        * We do this since we'll remove the pages after we've
+                        * removed the eb from the radix tree, so we could race
+                        * and have this page now attached to the new eb.  So
+                        * only clear page_private if it's still connected to
+                        * this eb.
+                        */
+                       if (PagePrivate(page) &&
+                           page->private == (unsigned long)eb) {
+                               BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+                               BUG_ON(PageDirty(page));
+                               BUG_ON(PageWriteback(page));
+                               /*
+                                * We need to make sure we haven't be attached
+                                * to a new eb.
+                                */
+                               ClearPagePrivate(page);
+                               set_page_private(page, 0);
+                               /* One for the page private */
+                               page_cache_release(page);
+                       }
+                       spin_unlock(&page->mapping->private_lock);
+
+                       /* One for when we alloced the page */
                        page_cache_release(page);
+               }
        } while (index != start_idx);
 }
 
@@ -3656,9 +4028,50 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
        __free_extent_buffer(eb);
 }
 
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+       /* the ref bit is tricky.  We have to make sure it is set
+        * if we have the buffer dirty.   Otherwise the
+        * code to free a buffer can end up dropping a dirty
+        * page
+        *
+        * Once the ref bit is set, it won't go away while the
+        * buffer is dirty or in writeback, and it also won't
+        * go away while we have the reference count on the
+        * eb bumped.
+        *
+        * We can't just set the ref bit without bumping the
+        * ref on the eb because free_extent_buffer might
+        * see the ref bit and try to clear it.  If this happens
+        * free_extent_buffer might end up dropping our original
+        * ref by mistake and freeing the page before we are able
+        * to add one more ref.
+        *
+        * So bump the ref count first, then set the bit.  If someone
+        * beat us to it, drop the ref we added.
+        */
+       if (!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+               atomic_inc(&eb->refs);
+               if (test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+                       atomic_dec(&eb->refs);
+       }
+}
+
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
+{
+       unsigned long num_pages, i;
+
+       check_buffer_tree_ref(eb);
+
+       num_pages = num_extent_pages(eb->start, eb->len);
+       for (i = 0; i < num_pages; i++) {
+               struct page *p = extent_buffer_page(eb, i);
+               mark_page_accessed(p);
+       }
+}
+
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
-                                         u64 start, unsigned long len,
-                                         struct page *page0)
+                                         u64 start, unsigned long len)
 {
        unsigned long num_pages = num_extent_pages(start, len);
        unsigned long i;
@@ -3674,7 +4087,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
-               mark_page_accessed(eb->first_page);
+               mark_extent_buffer_accessed(eb);
                return eb;
        }
        rcu_read_unlock();
@@ -3683,32 +4096,43 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        if (!eb)
                return NULL;
 
-       if (page0) {
-               eb->first_page = page0;
-               i = 1;
-               index++;
-               page_cache_get(page0);
-               mark_page_accessed(page0);
-               set_page_extent_mapped(page0);
-               set_page_extent_head(page0, len);
-               uptodate = PageUptodate(page0);
-       } else {
-               i = 0;
-       }
-       for (; i < num_pages; i++, index++) {
+       for (i = 0; i < num_pages; i++, index++) {
                p = find_or_create_page(mapping, index, GFP_NOFS);
                if (!p) {
                        WARN_ON(1);
                        goto free_eb;
                }
-               set_page_extent_mapped(p);
-               mark_page_accessed(p);
-               if (i == 0) {
-                       eb->first_page = p;
-                       set_page_extent_head(p, len);
-               } else {
-                       set_page_private(p, EXTENT_PAGE_PRIVATE);
+
+               spin_lock(&mapping->private_lock);
+               if (PagePrivate(p)) {
+                       /*
+                        * We could have already allocated an eb for this page
+                        * and attached one so lets see if we can get a ref on
+                        * the existing eb, and if we can we know it's good and
+                        * we can just return that one, else we know we can just
+                        * overwrite page->private.
+                        */
+                       exists = (struct extent_buffer *)p->private;
+                       if (atomic_inc_not_zero(&exists->refs)) {
+                               spin_unlock(&mapping->private_lock);
+                               unlock_page(p);
+                               mark_extent_buffer_accessed(exists);
+                               goto free_eb;
+                       }
+
+                       /*
+                        * Do this so attach doesn't complain and we need to
+                        * drop the ref the old guy had.
+                        */
+                       ClearPagePrivate(p);
+                       WARN_ON(PageDirty(p));
+                       page_cache_release(p);
                }
+               attach_extent_buffer_page(eb, p);
+               spin_unlock(&mapping->private_lock);
+               WARN_ON(PageDirty(p));
+               mark_page_accessed(p);
+               eb->pages[i] = p;
                if (!PageUptodate(p))
                        uptodate = 0;
 
@@ -3716,12 +4140,10 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                 * see below about how we avoid a nasty race with release page
                 * and why we unlock later
                 */
-               if (i != 0)
-                       unlock_page(p);
        }
        if (uptodate)
                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
+again:
        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
        if (ret)
                goto free_eb;
@@ -3731,14 +4153,21 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
        if (ret == -EEXIST) {
                exists = radix_tree_lookup(&tree->buffer,
                                                start >> PAGE_CACHE_SHIFT);
-               /* add one reference for the caller */
-               atomic_inc(&exists->refs);
+               if (!atomic_inc_not_zero(&exists->refs)) {
+                       spin_unlock(&tree->buffer_lock);
+                       radix_tree_preload_end();
+                       exists = NULL;
+                       goto again;
+               }
                spin_unlock(&tree->buffer_lock);
                radix_tree_preload_end();
+               mark_extent_buffer_accessed(exists);
                goto free_eb;
        }
        /* add one reference for the tree */
-       atomic_inc(&eb->refs);
+       spin_lock(&eb->refs_lock);
+       check_buffer_tree_ref(eb);
+       spin_unlock(&eb->refs_lock);
        spin_unlock(&tree->buffer_lock);
        radix_tree_preload_end();
 
@@ -3751,15 +4180,20 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
         * after the extent buffer is in the radix tree so
         * it doesn't get lost
         */
-       set_page_extent_mapped(eb->first_page);
-       set_page_extent_head(eb->first_page, eb->len);
-       if (!page0)
-               unlock_page(eb->first_page);
+       SetPageChecked(eb->pages[0]);
+       for (i = 1; i < num_pages; i++) {
+               p = extent_buffer_page(eb, i);
+               ClearPageChecked(p);
+               unlock_page(p);
+       }
+       unlock_page(eb->pages[0]);
        return eb;
 
 free_eb:
-       if (eb->first_page && !page0)
-               unlock_page(eb->first_page);
+       for (i = 0; i < num_pages; i++) {
+               if (eb->pages[i])
+                       unlock_page(eb->pages[i]);
+       }
 
        if (!atomic_dec_and_test(&eb->refs))
                return exists;
@@ -3776,7 +4210,7 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
        if (eb && atomic_inc_not_zero(&eb->refs)) {
                rcu_read_unlock();
-               mark_page_accessed(eb->first_page);
+               mark_extent_buffer_accessed(eb);
                return eb;
        }
        rcu_read_unlock();
@@ -3784,19 +4218,71 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
        return NULL;
 }
 
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+       struct extent_buffer *eb =
+                       container_of(head, struct extent_buffer, rcu_head);
+
+       __free_extent_buffer(eb);
+}
+
+/* Expects to have eb->eb_lock already held */
+static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
+{
+       WARN_ON(atomic_read(&eb->refs) == 0);
+       if (atomic_dec_and_test(&eb->refs)) {
+               struct extent_io_tree *tree = eb->tree;
+
+               spin_unlock(&eb->refs_lock);
+
+               spin_lock(&tree->buffer_lock);
+               radix_tree_delete(&tree->buffer,
+                                 eb->start >> PAGE_CACHE_SHIFT);
+               spin_unlock(&tree->buffer_lock);
+
+               /* Should be safe to release our pages at this point */
+               btrfs_release_extent_buffer_page(eb, 0);
+
+               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+               return;
+       }
+       spin_unlock(&eb->refs_lock);
+}
+
 void free_extent_buffer(struct extent_buffer *eb)
 {
        if (!eb)
                return;
 
-       if (!atomic_dec_and_test(&eb->refs))
+       spin_lock(&eb->refs_lock);
+       if (atomic_read(&eb->refs) == 2 &&
+           test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+           !extent_buffer_under_io(eb) &&
+           test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+               atomic_dec(&eb->refs);
+
+       /*
+        * I know this is terrible, but it's temporary until we stop tracking
+        * the uptodate bits and such for the extent buffers.
+        */
+       release_extent_buffer(eb, GFP_ATOMIC);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+       if (!eb)
                return;
 
-       WARN_ON(1);
+       spin_lock(&eb->refs_lock);
+       set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+       if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+           test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+               atomic_dec(&eb->refs);
+       release_extent_buffer(eb, GFP_NOFS);
 }
 
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
-                             struct extent_buffer *eb)
+void clear_extent_buffer_dirty(struct extent_buffer *eb)
 {
        unsigned long i;
        unsigned long num_pages;
@@ -3812,10 +4298,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                lock_page(page);
                WARN_ON(!PagePrivate(page));
 
-               set_page_extent_mapped(page);
-               if (i == 0)
-                       set_page_extent_head(page, eb->len);
-
                clear_page_dirty_for_io(page);
                spin_lock_irq(&page->mapping->tree_lock);
                if (!PageDirty(page)) {
@@ -3827,24 +4309,29 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                ClearPageError(page);
                unlock_page(page);
        }
-       return 0;
+       WARN_ON(atomic_read(&eb->refs) == 0);
 }
 
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
-                            struct extent_buffer *eb)
+int set_extent_buffer_dirty(struct extent_buffer *eb)
 {
        unsigned long i;
        unsigned long num_pages;
        int was_dirty = 0;
 
+       check_buffer_tree_ref(eb);
+
        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
        num_pages = num_extent_pages(eb->start, eb->len);
+       WARN_ON(atomic_read(&eb->refs) == 0);
+       WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
        for (i = 0; i < num_pages; i++)
-               __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+               set_page_dirty(extent_buffer_page(eb, i));
        return was_dirty;
 }
 
-static int __eb_straddles_pages(u64 start, u64 len)
+static int range_straddles_pages(u64 start, u64 len)
 {
        if (len < PAGE_CACHE_SIZE)
                return 1;
@@ -3855,25 +4342,14 @@ static int __eb_straddles_pages(u64 start, u64 len)
        return 0;
 }
 
-static int eb_straddles_pages(struct extent_buffer *eb)
-{
-       return __eb_straddles_pages(eb->start, eb->len);
-}
-
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-                               struct extent_buffer *eb,
-                               struct extent_state **cached_state)
+int clear_extent_buffer_uptodate(struct extent_buffer *eb)
 {
        unsigned long i;
        struct page *page;
        unsigned long num_pages;
 
-       num_pages = num_extent_pages(eb->start, eb->len);
        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
-       clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                             cached_state, GFP_NOFS);
-
+       num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
                if (page)
@@ -3882,27 +4358,16 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
        return 0;
 }
 
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
-                               struct extent_buffer *eb)
+int set_extent_buffer_uptodate(struct extent_buffer *eb)
 {
        unsigned long i;
        struct page *page;
        unsigned long num_pages;
 
+       set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
-
-       if (eb_straddles_pages(eb)) {
-               set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-                                   NULL, GFP_NOFS);
-       }
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-               if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-                   ((i == num_pages - 1) &&
-                    ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-                       check_page_uptodate(tree, page);
-                       continue;
-               }
                SetPageUptodate(page);
        }
        return 0;
@@ -3917,7 +4382,7 @@ int extent_range_uptodate(struct extent_io_tree *tree,
        int uptodate;
        unsigned long index;
 
-       if (__eb_straddles_pages(start, end - start + 1)) {
+       if (range_straddles_pages(start, end - start + 1)) {
                ret = test_range_bit(tree, start, end,
                                     EXTENT_UPTODATE, 1, NULL);
                if (ret)
@@ -3939,35 +4404,9 @@ int extent_range_uptodate(struct extent_io_tree *tree,
        return pg_uptodate;
 }
 
-int extent_buffer_uptodate(struct extent_io_tree *tree,
-                          struct extent_buffer *eb,
-                          struct extent_state *cached_state)
+int extent_buffer_uptodate(struct extent_buffer *eb)
 {
-       int ret = 0;
-       unsigned long num_pages;
-       unsigned long i;
-       struct page *page;
-       int pg_uptodate = 1;
-
-       if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
-               return 1;
-
-       if (eb_straddles_pages(eb)) {
-               ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                                  EXTENT_UPTODATE, 1, cached_state);
-               if (ret)
-                       return ret;
-       }
-
-       num_pages = num_extent_pages(eb->start, eb->len);
-       for (i = 0; i < num_pages; i++) {
-               page = extent_buffer_page(eb, i);
-               if (!PageUptodate(page)) {
-                       pg_uptodate = 0;
-                       break;
-               }
-       }
-       return pg_uptodate;
+       return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 }
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
@@ -3981,21 +4420,14 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
        int ret = 0;
        int locked_pages = 0;
        int all_uptodate = 1;
-       int inc_all_pages = 0;
        unsigned long num_pages;
+       unsigned long num_reads = 0;
        struct bio *bio = NULL;
        unsigned long bio_flags = 0;
 
        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
                return 0;
 
-       if (eb_straddles_pages(eb)) {
-               if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-                                  EXTENT_UPTODATE, 1, NULL)) {
-                       return 0;
-               }
-       }
-
        if (start) {
                WARN_ON(start < eb->start);
                start_i = (start >> PAGE_CACHE_SHIFT) -
@@ -4014,8 +4446,10 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                        lock_page(page);
                }
                locked_pages++;
-               if (!PageUptodate(page))
+               if (!PageUptodate(page)) {
+                       num_reads++;
                        all_uptodate = 0;
+               }
        }
        if (all_uptodate) {
                if (start_i == 0)
@@ -4023,20 +4457,12 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                goto unlock_exit;
        }
 
+       clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+       eb->failed_mirror = 0;
+       atomic_set(&eb->io_pages, num_reads);
        for (i = start_i; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-
-               WARN_ON(!PagePrivate(page));
-
-               set_page_extent_mapped(page);
-               if (i == 0)
-                       set_page_extent_head(page, eb->len);
-
-               if (inc_all_pages)
-                       page_cache_get(page);
                if (!PageUptodate(page)) {
-                       if (start_i == 0)
-                               inc_all_pages = 1;
                        ClearPageError(page);
                        err = __extent_read_full_page(tree, page,
                                                      get_extent, &bio,
@@ -4048,8 +4474,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                }
        }
 
-       if (bio)
-               submit_one_bio(READ, bio, mirror_num, bio_flags);
+       if (bio) {
+               err = submit_one_bio(READ, bio, mirror_num, bio_flags);
+               if (err)
+                       return err;
+       }
 
        if (ret || wait != WAIT_COMPLETE)
                return ret;
@@ -4061,8 +4490,6 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
                        ret = -EIO;
        }
 
-       if (!ret)
-               set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
        return ret;
 
 unlock_exit:
@@ -4304,15 +4731,20 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
 {
        char *dst_kaddr = page_address(dst_page);
        char *src_kaddr;
+       int must_memmove = 0;
 
        if (dst_page != src_page) {
                src_kaddr = page_address(src_page);
        } else {
                src_kaddr = dst_kaddr;
-               BUG_ON(areas_overlap(src_off, dst_off, len));
+               if (areas_overlap(src_off, dst_off, len))
+                       must_memmove = 1;
        }
 
-       memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+       if (must_memmove)
+               memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+       else
+               memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
 }
 
 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
@@ -4382,7 +4814,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
                       "len %lu len %lu\n", dst_offset, len, dst->len);
                BUG_ON(1);
        }
-       if (!areas_overlap(src_offset, dst_offset, len)) {
+       if (dst_offset < src_offset) {
                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
                return;
        }
@@ -4408,47 +4840,48 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
        }
 }
 
-static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+int try_release_extent_buffer(struct page *page, gfp_t mask)
 {
-       struct extent_buffer *eb =
-                       container_of(head, struct extent_buffer, rcu_head);
-
-       btrfs_release_extent_buffer(eb);
-}
-
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
-{
-       u64 start = page_offset(page);
        struct extent_buffer *eb;
-       int ret = 1;
 
-       spin_lock(&tree->buffer_lock);
-       eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-       if (!eb) {
-               spin_unlock(&tree->buffer_lock);
-               return ret;
+       /*
+        * We need to make sure noboody is attaching this page to an eb right
+        * now.
+        */
+       spin_lock(&page->mapping->private_lock);
+       if (!PagePrivate(page)) {
+               spin_unlock(&page->mapping->private_lock);
+               return 1;
        }
 
-       if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-               ret = 0;
-               goto out;
-       }
+       eb = (struct extent_buffer *)page->private;
+       BUG_ON(!eb);
 
        /*
-        * set @eb->refs to 0 if it is already 1, and then release the @eb.
-        * Or go back.
+        * This is a little awful but should be ok, we need to make sure that
+        * the eb doesn't disappear out from under us while we're looking at
+        * this page.
         */
-       if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
-               ret = 0;
-               goto out;
+       spin_lock(&eb->refs_lock);
+       if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+               spin_unlock(&eb->refs_lock);
+               spin_unlock(&page->mapping->private_lock);
+               return 0;
        }
+       spin_unlock(&page->mapping->private_lock);
 
-       radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-out:
-       spin_unlock(&tree->buffer_lock);
+       if ((mask & GFP_NOFS) == GFP_NOFS)
+               mask = GFP_NOFS;
 
-       /* at this point we can safely release the extent buffer */
-       if (atomic_read(&eb->refs) == 0)
-               call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
-       return ret;
+       /*
+        * If tree ref isn't set then we know the ref on this eb is a real ref,
+        * so just return, this page will likely be freed soon anyway.
+        */
+       if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+               spin_unlock(&eb->refs_lock);
+               return 0;
+       }
+       release_extent_buffer(eb, mask);
+
+       return 1;
 }