These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / fs / btrfs / extent_io.c
index c32d226..9abe187 100644 (file)
@@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
        inode = tree->mapping->host;
        isize = i_size_read(inode);
        if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
-               printk_ratelimited(KERN_DEBUG
-                   "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+               btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+                   "%s: ino %llu isize %llu odd range [%llu,%llu]",
                                caller, btrfs_ino(inode), isize, start, end);
        }
 }
@@ -131,6 +131,25 @@ struct extent_page_data {
        unsigned int sync_io:1;
 };
 
+static void add_extent_changeset(struct extent_state *state, unsigned bits,
+                                struct extent_changeset *changeset,
+                                int set)
+{
+       int ret;
+
+       if (!changeset)
+               return;
+       if (set && (state->state & bits) == bits)
+               return;
+       if (!set && (state->state & bits) == 0)
+               return;
+       changeset->bytes_changed += state->end - state->start + 1;
+       ret = ulist_add(changeset->range_changed, state->start, state->end,
+                       GFP_ATOMIC);
+       /* ENOMEM */
+       BUG_ON(ret < 0);
+}
+
 static noinline void flush_write_bio(void *data);
 static inline struct btrfs_fs_info *
 tree_fs_info(struct extent_io_tree *tree)
@@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree,
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
-                          struct extent_state *state, unsigned *bits);
+                          struct extent_state *state, unsigned *bits,
+                          struct extent_changeset *changeset);
 
 /*
  * insert an extent_state struct into the tree.  'bits' are set on the
@@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree,
                        struct extent_state *state, u64 start, u64 end,
                        struct rb_node ***p,
                        struct rb_node **parent,
-                       unsigned *bits)
+                       unsigned *bits, struct extent_changeset *changeset)
 {
        struct rb_node *node;
 
@@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree,
        state->start = start;
        state->end = end;
 
-       set_state_bits(tree, state, bits);
+       set_state_bits(tree, state, bits, changeset);
 
        node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
        if (node) {
@@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state)
  */
 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                                            struct extent_state *state,
-                                           unsigned *bits, int wake)
+                                           unsigned *bits, int wake,
+                                           struct extent_changeset *changeset)
 {
        struct extent_state *next;
        unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
@@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
                tree->dirty_bytes -= range;
        }
        clear_state_cb(tree, state, bits);
+       add_extent_changeset(state, bits_to_clear, changeset, 0);
        state->state &= ~bits_to_clear;
        if (wake)
                wake_up(&state->wq);
@@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
  *
  * This takes the tree lock, and returns 0 on success and < 0 on error.
  */
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                    unsigned bits, int wake, int delete,
-                    struct extent_state **cached_state,
-                    gfp_t mask)
+static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                             unsigned bits, int wake, int delete,
+                             struct extent_state **cached_state,
+                             gfp_t mask, struct extent_changeset *changeset)
 {
        struct extent_state *state;
        struct extent_state *cached;
@@ -594,7 +616,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
                clear = 1;
 again:
-       if (!prealloc && (mask & __GFP_WAIT)) {
+       if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Don't care for allocation failure here because we might end
                 * up not needing the pre-allocated extent state at all, which
@@ -671,7 +693,8 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       state = clear_state_bit(tree, state, &bits, wake);
+                       state = clear_state_bit(tree, state, &bits, wake,
+                                               changeset);
                        goto next;
                }
                goto search_again;
@@ -692,13 +715,13 @@ hit_next:
                if (wake)
                        wake_up(&state->wq);
 
-               clear_state_bit(tree, prealloc, &bits, wake);
+               clear_state_bit(tree, prealloc, &bits, wake, changeset);
 
                prealloc = NULL;
                goto out;
        }
 
-       state = clear_state_bit(tree, state, &bits, wake);
+       state = clear_state_bit(tree, state, &bits, wake, changeset);
 next:
        if (last_end == (u64)-1)
                goto out;
@@ -718,7 +741,7 @@ search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
-       if (mask & __GFP_WAIT)
+       if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;
 }
@@ -789,7 +812,7 @@ out:
 
 static void set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
-                          unsigned *bits)
+                          unsigned *bits, struct extent_changeset *changeset)
 {
        unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
 
@@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree,
                u64 range = state->end - state->start + 1;
                tree->dirty_bytes += range;
        }
+       add_extent_changeset(state, bits_to_set, changeset, 1);
        state->state |= bits_to_set;
 }
 
@@ -835,7 +859,7 @@ static int __must_check
 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                 unsigned bits, unsigned exclusive_bits,
                 u64 *failed_start, struct extent_state **cached_state,
-                gfp_t mask)
+                gfp_t mask, struct extent_changeset *changeset)
 {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
@@ -850,7 +874,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 
        bits |= EXTENT_FIRST_DELALLOC;
 again:
-       if (!prealloc && (mask & __GFP_WAIT)) {
+       if (!prealloc && gfpflags_allow_blocking(mask)) {
                prealloc = alloc_extent_state(mask);
                BUG_ON(!prealloc);
        }
@@ -873,7 +897,7 @@ again:
                prealloc = alloc_extent_state_atomic(prealloc);
                BUG_ON(!prealloc);
                err = insert_state(tree, prealloc, start, end,
-                                  &p, &parent, &bits);
+                                  &p, &parent, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);
 
@@ -899,7 +923,7 @@ hit_next:
                        goto out;
                }
 
-               set_state_bits(tree, state, &bits);
+               set_state_bits(tree, state, &bits, changeset);
                cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
@@ -945,7 +969,7 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       set_state_bits(tree, state, &bits);
+                       set_state_bits(tree, state, &bits, changeset);
                        cache_state(state, cached_state);
                        merge_state(tree, state);
                        if (last_end == (u64)-1)
@@ -980,7 +1004,7 @@ hit_next:
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
-                                  NULL, NULL, &bits);
+                                  NULL, NULL, &bits, changeset);
                if (err)
                        extent_io_tree_panic(tree, err);
 
@@ -1008,7 +1032,7 @@ hit_next:
                if (err)
                        extent_io_tree_panic(tree, err);
 
-               set_state_bits(tree, prealloc, &bits);
+               set_state_bits(tree, prealloc, &bits, changeset);
                cache_state(prealloc, cached_state);
                merge_state(tree, prealloc);
                prealloc = NULL;
@@ -1028,7 +1052,7 @@ search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
-       if (mask & __GFP_WAIT)
+       if (gfpflags_allow_blocking(mask))
                cond_resched();
        goto again;
 }
@@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   struct extent_state **cached_state, gfp_t mask)
 {
        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
-                               cached_state, mask);
+                               cached_state, mask, NULL);
 }
 
 
@@ -1076,7 +1100,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        btrfs_debug_check_extent_io_range(tree, start, end);
 
 again:
-       if (!prealloc && (mask & __GFP_WAIT)) {
+       if (!prealloc && gfpflags_allow_blocking(mask)) {
                /*
                 * Best effort, don't worry if extent state allocation fails
                 * here for the first iteration. We might have a cached state
@@ -1111,7 +1135,7 @@ again:
                        goto out;
                }
                err = insert_state(tree, prealloc, start, end,
-                                  &p, &parent, &bits);
+                                  &p, &parent, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
@@ -1130,9 +1154,9 @@ hit_next:
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
-               set_state_bits(tree, state, &bits);
+               set_state_bits(tree, state, &bits, NULL);
                cache_state(state, cached_state);
-               state = clear_state_bit(tree, state, &clear_bits, 0);
+               state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
                if (last_end == (u64)-1)
                        goto out;
                start = last_end + 1;
@@ -1171,9 +1195,10 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       set_state_bits(tree, state, &bits);
+                       set_state_bits(tree, state, &bits, NULL);
                        cache_state(state, cached_state);
-                       state = clear_state_bit(tree, state, &clear_bits, 0);
+                       state = clear_state_bit(tree, state, &clear_bits, 0,
+                                               NULL);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
@@ -1208,7 +1233,7 @@ hit_next:
                 * the later extent.
                 */
                err = insert_state(tree, prealloc, start, this_end,
-                                  NULL, NULL, &bits);
+                                  NULL, NULL, &bits, NULL);
                if (err)
                        extent_io_tree_panic(tree, err);
                cache_state(prealloc, cached_state);
@@ -1233,9 +1258,9 @@ hit_next:
                if (err)
                        extent_io_tree_panic(tree, err);
 
-               set_state_bits(tree, prealloc, &bits);
+               set_state_bits(tree, prealloc, &bits, NULL);
                cache_state(prealloc, cached_state);
-               clear_state_bit(tree, prealloc, &clear_bits, 0);
+               clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
                prealloc = NULL;
                goto out;
        }
@@ -1253,7 +1278,7 @@ search_again:
        if (start > end)
                goto out;
        spin_unlock(&tree->lock);
-       if (mask & __GFP_WAIT)
+       if (gfpflags_allow_blocking(mask))
                cond_resched();
        first_iteration = false;
        goto again;
@@ -1274,10 +1299,53 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                              NULL, mask);
 }
 
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                          unsigned bits, gfp_t mask,
+                          struct extent_changeset *changeset)
+{
+       /*
+        * EXTENT_LOCKED is not supported yet: the changeset records every
+        * bit that changes, so with EXTENT_LOCKED the call would either
+        * fail with -EEXIST or the changeset would cover the whole
+        * range.
+        */
+       BUG_ON(bits & EXTENT_LOCKED);
+
+       return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+                               changeset);
+}
+
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                    unsigned bits, int wake, int delete,
+                    struct extent_state **cached, gfp_t mask)
+{
+       return __clear_extent_bit(tree, start, end, bits, wake, delete,
+                                 cached, mask, NULL);
+}
+
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                      unsigned bits, gfp_t mask)
 {
-       return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
+       int wake = 0;
+
+       if (bits & EXTENT_LOCKED)
+               wake = 1;
+
+       return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
+}
+
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+                            unsigned bits, gfp_t mask,
+                            struct extent_changeset *changeset)
+{
+       /*
+        * Don't support EXTENT_LOCKED case, same reason as
+        * set_record_extent_bits().
+        */
+       BUG_ON(bits & EXTENT_LOCKED);
+
+       return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+                                 changeset);
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1338,7 +1406,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
        while (1) {
                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
                                       EXTENT_LOCKED, &failed_start,
-                                      cached_state, GFP_NOFS);
+                                      cached_state, GFP_NOFS, NULL);
                if (err == -EEXIST) {
                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
                        start = failed_start;
@@ -1360,7 +1428,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
        u64 failed_start;
 
        err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
-                              &failed_start, NULL, GFP_NOFS);
+                              &failed_start, NULL, GFP_NOFS, NULL);
        if (err == -EEXIST) {
                if (failed_start > start)
                        clear_extent_bit(tree, start, failed_start - 1,
@@ -2073,8 +2141,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
                return -EIO;
        }
 
-       printk_ratelimited_in_rcu(KERN_INFO
-                                 "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+       btrfs_info_rl_in_rcu(fs_info,
+               "read error corrected: ino %llu off %llu (dev %s sector %llu)",
                                  btrfs_ino(inode), start,
                                  rcu_str_deref(dev->name), sector);
        bio_put(bio);
@@ -2481,7 +2549,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-static void end_bio_extent_writepage(struct bio *bio, int err)
+static void end_bio_extent_writepage(struct bio *bio)
 {
        struct bio_vec *bvec;
        u64 start;
@@ -2511,7 +2579,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
                start = page_offset(page);
                end = start + bvec->bv_offset + bvec->bv_len - 1;
 
-               if (end_extent_writepage(page, err, start, end))
+               if (end_extent_writepage(page, bio->bi_error, start, end))
                        continue;
 
                end_page_writeback(page);
@@ -2543,10 +2611,10 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
  * Scheduling is not allowed, so the extent state tree is expected
  * to have one and only one object corresponding to this IO.
  */
-static void end_bio_extent_readpage(struct bio *bio, int err)
+static void end_bio_extent_readpage(struct bio *bio)
 {
        struct bio_vec *bvec;
-       int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+       int uptodate = !bio->bi_error;
        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
        struct extent_io_tree *tree;
        u64 offset = 0;
@@ -2559,16 +2627,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
        int ret;
        int i;
 
-       if (err)
-               uptodate = 0;
-
        bio_for_each_segment_all(bvec, bio, i) {
                struct page *page = bvec->bv_page;
                struct inode *inode = page->mapping->host;
 
                pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
-                        "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err,
-                        io_bio->mirror_num);
+                        "mirror=%u\n", (u64)bio->bi_iter.bi_sector,
+                        bio->bi_error, io_bio->mirror_num);
                tree = &BTRFS_I(inode)->io_tree;
 
                /* We always issue full-page reads, but if some block
@@ -2609,8 +2674,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 
                if (tree->ops && tree->ops->readpage_io_failed_hook) {
                        ret = tree->ops->readpage_io_failed_hook(page, mirror);
-                       if (!ret && !err &&
-                           test_bit(BIO_UPTODATE, &bio->bi_flags))
+                       if (!ret && !bio->bi_error)
                                uptodate = 1;
                } else {
                        /*
@@ -2626,10 +2690,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                        ret = bio_readpage_error(bio, offset, page, start, end,
                                                 mirror);
                        if (ret == 0) {
-                               uptodate =
-                                       test_bit(BIO_UPTODATE, &bio->bi_flags);
-                               if (err)
-                                       uptodate = 0;
+                               uptodate = !bio->bi_error;
                                offset += len;
                                continue;
                        }
@@ -2679,7 +2740,7 @@ readpage_ok:
                endio_readpage_release_extent(tree, extent_start, extent_len,
                                              uptodate);
        if (io_bio->end_io)
-               io_bio->end_io(io_bio, err);
+               io_bio->end_io(io_bio, bio->bi_error);
        bio_put(bio);
 }
 
@@ -2725,6 +2786,12 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
                btrfs_bio->csum = NULL;
                btrfs_bio->csum_allocated = NULL;
                btrfs_bio->end_io = NULL;
+
+#ifdef CONFIG_BLK_CGROUP
+               /* FIXME, put this into bio_clone_bioset */
+               if (bio->bi_css)
+                       bio_associate_blkcg(new, bio->bi_css);
+#endif
        }
        return new;
 }
@@ -2767,8 +2834,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio,
        else
                btrfsic_submit_bio(rw, bio);
 
-       if (bio_flagged(bio, BIO_EOPNOTSUPP))
-               ret = -EOPNOTSUPP;
        bio_put(bio);
        return ret;
 }
@@ -2787,6 +2852,7 @@ static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
 }
 
 static int submit_extent_page(int rw, struct extent_io_tree *tree,
+                             struct writeback_control *wbc,
                              struct page *page, sector_t sector,
                              size_t size, unsigned long offset,
                              struct block_device *bdev,
@@ -2795,13 +2861,12 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                              bio_end_io_t end_io_func,
                              int mirror_num,
                              unsigned long prev_bio_flags,
-                             unsigned long bio_flags)
+                             unsigned long bio_flags,
+                             bool force_bio_submit)
 {
        int ret = 0;
        struct bio *bio;
-       int nr;
        int contig = 0;
-       int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
        int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
        size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
 
@@ -2813,6 +2878,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                        contig = bio_end_sector(bio) == sector;
 
                if (prev_bio_flags != bio_flags || !contig ||
+                   force_bio_submit ||
                    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
                    bio_add_page(bio, page, page_size, offset) < page_size) {
                        ret = submit_one_bio(rw, bio, mirror_num,
@@ -2823,21 +2889,24 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                        }
                        bio = NULL;
                } else {
+                       if (wbc)
+                               wbc_account_io(wbc, page, page_size);
                        return 0;
                }
        }
-       if (this_compressed)
-               nr = BIO_MAX_PAGES;
-       else
-               nr = bio_get_nr_vecs(bdev);
 
-       bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
+       bio = btrfs_bio_alloc(bdev, sector, BIO_MAX_PAGES,
+                       GFP_NOFS | __GFP_HIGH);
        if (!bio)
                return -ENOMEM;
 
        bio_add_page(bio, page, page_size, offset);
        bio->bi_end_io = end_io_func;
        bio->bi_private = tree;
+       if (wbc) {
+               wbc_init_bio(wbc, bio);
+               wbc_account_io(wbc, page, page_size);
+       }
 
        if (bio_ret)
                *bio_ret = bio;
@@ -2906,7 +2975,8 @@ static int __do_readpage(struct extent_io_tree *tree,
                         get_extent_t *get_extent,
                         struct extent_map **em_cached,
                         struct bio **bio, int mirror_num,
-                        unsigned long *bio_flags, int rw)
+                        unsigned long *bio_flags, int rw,
+                        u64 *prev_em_start)
 {
        struct inode *inode = page->mapping->host;
        u64 start = page_offset(page);
@@ -2954,6 +3024,7 @@ static int __do_readpage(struct extent_io_tree *tree,
        }
        while (cur <= end) {
                unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+               bool force_bio_submit = false;
 
                if (cur >= last_byte) {
                        char *userpage;
@@ -3004,6 +3075,49 @@ static int __do_readpage(struct extent_io_tree *tree,
                block_start = em->block_start;
                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                        block_start = EXTENT_MAP_HOLE;
+
+               /*
+                * If we have a file range that points to a compressed extent
+                * and it's followed by a consecutive file range that points to
+                * the same compressed extent (possibly with a different
+                * offset and/or length, so it either points to the whole extent
+                * or only part of it), we must make sure we do not submit a
+                * single bio to populate the pages for the 2 ranges because
+                * this makes the compressed extent read zero out the pages
+                * belonging to the 2nd range. Imagine the following scenario:
+                *
+                *  File layout
+                *  [0 - 8K]                     [8K - 24K]
+                *    |                               |
+                *    |                               |
+                * points to extent X,         points to extent X,
+                * offset 4K, length of 8K     offset 0, length 16K
+                *
+                * [extent X, compressed length = 4K uncompressed length = 16K]
+                *
+                * If the bio to read the compressed extent covers both ranges,
+                * it will decompress extent X into the pages belonging to the
+                * first range and then it will stop, zeroing out the remaining
+                * pages that belong to the other range that points to extent X.
+                * So here we make sure we submit 2 bios, one for the first
+                * range and another one for the second range. Both will target
+                * the same physical extent from disk, but we can't currently
+                * make the compressed bio endio callback populate the pages
+                * for both ranges because each compressed bio is tightly
+                * coupled with a single extent map, and each range can have
+                * an extent map with a different offset value relative to the
+                * uncompressed data of our extent and different lengths. This
+                * is a corner case so we prioritize correctness over
+                * non-optimal behavior (submitting 2 bios for the same extent).
+                */
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+                   prev_em_start && *prev_em_start != (u64)-1 &&
+                   *prev_em_start != em->orig_start)
+                       force_bio_submit = true;
+
+               if (prev_em_start)
+                       *prev_em_start = em->orig_start;
+
                free_extent_map(em);
                em = NULL;
 
@@ -3019,8 +3133,12 @@ static int __do_readpage(struct extent_io_tree *tree,
 
                        set_extent_uptodate(tree, cur, cur + iosize - 1,
                                            &cached, GFP_NOFS);
-                       unlock_extent_cached(tree, cur, cur + iosize - 1,
-                                            &cached, GFP_NOFS);
+                       if (parent_locked)
+                               free_extent_state(cached);
+                       else
+                               unlock_extent_cached(tree, cur,
+                                                    cur + iosize - 1,
+                                                    &cached, GFP_NOFS);
                        cur = cur + iosize;
                        pg_offset += iosize;
                        continue;
@@ -3048,12 +3166,13 @@ static int __do_readpage(struct extent_io_tree *tree,
                }
 
                pnr -= page->index;
-               ret = submit_extent_page(rw, tree, page,
+               ret = submit_extent_page(rw, tree, NULL, page,
                                         sector, disk_io_size, pg_offset,
                                         bdev, bio, pnr,
                                         end_bio_extent_readpage, mirror_num,
                                         *bio_flags,
-                                        this_bio_flag);
+                                        this_bio_flag,
+                                        force_bio_submit);
                if (!ret) {
                        nr++;
                        *bio_flags = this_bio_flag;
@@ -3080,7 +3199,8 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
                                             get_extent_t *get_extent,
                                             struct extent_map **em_cached,
                                             struct bio **bio, int mirror_num,
-                                            unsigned long *bio_flags, int rw)
+                                            unsigned long *bio_flags, int rw,
+                                            u64 *prev_em_start)
 {
        struct inode *inode;
        struct btrfs_ordered_extent *ordered;
@@ -3100,7 +3220,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
 
        for (index = 0; index < nr_pages; index++) {
                __do_readpage(tree, pages[index], get_extent, em_cached, bio,
-                             mirror_num, bio_flags, rw);
+                             mirror_num, bio_flags, rw, prev_em_start);
                page_cache_release(pages[index]);
        }
 }
@@ -3110,7 +3230,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
                               int nr_pages, get_extent_t *get_extent,
                               struct extent_map **em_cached,
                               struct bio **bio, int mirror_num,
-                              unsigned long *bio_flags, int rw)
+                              unsigned long *bio_flags, int rw,
+                              u64 *prev_em_start)
 {
        u64 start = 0;
        u64 end = 0;
@@ -3131,7 +3252,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
                                                  index - first_index, start,
                                                  end, get_extent, em_cached,
                                                  bio, mirror_num, bio_flags,
-                                                 rw);
+                                                 rw, prev_em_start);
                        start = page_start;
                        end = start + PAGE_CACHE_SIZE - 1;
                        first_index = index;
@@ -3142,7 +3263,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
                __do_contiguous_readpages(tree, &pages[first_index],
                                          index - first_index, start,
                                          end, get_extent, em_cached, bio,
-                                         mirror_num, bio_flags, rw);
+                                         mirror_num, bio_flags, rw,
+                                         prev_em_start);
 }
 
 static int __extent_read_full_page(struct extent_io_tree *tree,
@@ -3168,7 +3290,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
        }
 
        ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
-                           bio_flags, rw);
+                           bio_flags, rw, NULL);
        return ret;
 }
 
@@ -3194,7 +3316,7 @@ int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
        int ret;
 
        ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
-                                     &bio_flags, READ);
+                           &bio_flags, READ, NULL);
        if (bio)
                ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
        return ret;
@@ -3443,11 +3565,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
                                       page->index, cur, end);
                        }
 
-                       ret = submit_extent_page(write_flags, tree, page,
+                       ret = submit_extent_page(write_flags, tree, wbc, page,
                                                 sector, iosize, pg_offset,
                                                 bdev, &epd->bio, max_nr,
                                                 end_bio_extent_writepage,
-                                                0, 0, 0);
+                                                0, 0, 0, false);
                        if (ret)
                                SetPageError(page);
                }
@@ -3693,7 +3815,7 @@ static void set_btree_ioerr(struct page *page)
        }
 }
 
-static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+static void end_bio_extent_buffer_writepage(struct bio *bio)
 {
        struct bio_vec *bvec;
        struct extent_buffer *eb;
@@ -3706,7 +3828,8 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
                BUG_ON(!eb);
                done = atomic_dec_and_test(&eb->io_pages);
 
-               if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+               if (bio->bi_error ||
+                   test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
                        ClearPageUptodate(page);
                        set_btree_ioerr(page);
                }
@@ -3746,10 +3869,10 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
 
                clear_page_dirty_for_io(p);
                set_page_writeback(p);
-               ret = submit_extent_page(rw, tree, p, offset >> 9,
+               ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
                                         PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
                                         -1, end_bio_extent_buffer_writepage,
-                                        0, epd->bio_flags, bio_flags);
+                                        0, epd->bio_flags, bio_flags, false);
                epd->bio_flags = bio_flags;
                if (ret) {
                        set_btree_ioerr(p);
@@ -4153,6 +4276,7 @@ int extent_readpages(struct extent_io_tree *tree,
        struct page *page;
        struct extent_map *em_cached = NULL;
        int nr = 0;
+       u64 prev_em_start = (u64)-1;
 
        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
                page = list_entry(pages->prev, struct page, lru);
@@ -4169,12 +4293,12 @@ int extent_readpages(struct extent_io_tree *tree,
                if (nr < ARRAY_SIZE(pagepool))
                        continue;
                __extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
-                                  &bio, 0, &bio_flags, READ);
+                                  &bio, 0, &bio_flags, READ, &prev_em_start);
                nr = 0;
        }
        if (nr)
                __extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
-                                  &bio, 0, &bio_flags, READ);
+                                  &bio, 0, &bio_flags, READ, &prev_em_start);
 
        if (em_cached)
                free_extent_map(em_cached);
@@ -4262,7 +4386,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
        u64 start = page_offset(page);
        u64 end = start + PAGE_CACHE_SIZE - 1;
 
-       if ((mask & __GFP_WAIT) &&
+       if (gfpflags_allow_blocking(mask) &&
            page->mapping->host->i_size > 16 * 1024 * 1024) {
                u64 len;
                while (start <= end) {
@@ -4492,6 +4616,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                }
                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
                        flags |= FIEMAP_EXTENT_ENCODED;
+               if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+                       flags |= FIEMAP_EXTENT_UNWRITTEN;
 
                free_extent_map(em);
                em = NULL;
@@ -4609,9 +4735,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 {
        struct extent_buffer *eb = NULL;
 
-       eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
-       if (eb == NULL)
-               return NULL;
+       eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
        eb->start = start;
        eb->len = len;
        eb->fs_info = fs_info;
@@ -4869,7 +4993,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
                return NULL;
 
        for (i = 0; i < num_pages; i++, index++) {
-               p = find_or_create_page(mapping, index, GFP_NOFS);
+               p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
                if (!p)
                        goto free_eb;
 
@@ -5509,13 +5633,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
        unsigned long src_i;
 
        if (src_offset + len > dst->len) {
-               printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
-                      "len %lu dst len %lu\n", src_offset, len, dst->len);
+               btrfs_err(dst->fs_info,
+                       "memmove bogus src_offset %lu move "
+                      "len %lu dst len %lu", src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
-               printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
-                      "len %lu dst len %lu\n", dst_offset, len, dst->len);
+               btrfs_err(dst->fs_info,
+                       "memmove bogus dst_offset %lu move "
+                      "len %lu dst len %lu", dst_offset, len, dst->len);
                BUG_ON(1);
        }
 
@@ -5555,13 +5681,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
        unsigned long src_i;
 
        if (src_offset + len > dst->len) {
-               printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
-                      "len %lu len %lu\n", src_offset, len, dst->len);
+               btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
+                      "len %lu len %lu", src_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset + len > dst->len) {
-               printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
-                      "len %lu len %lu\n", dst_offset, len, dst->len);
+               btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
+                      "len %lu len %lu", dst_offset, len, dst->len);
                BUG_ON(1);
        }
        if (dst_offset < src_offset) {