These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
diff --git a/kernel/fs/btrfs/inode.c b/kernel/fs/btrfs/inode.c
index 8bb0136..4bc9dbf 100644
@@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 out:
+       /*
+        * Don't forget to free the reserved space: an inlined extent
+        * won't be counted as a data extent, so free the space directly
+        * here. At reserve time it is always aligned to page size, so
+        * just free one page here.
+        */
+       btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
        btrfs_free_path(path);
        btrfs_end_transaction(trans, root);
        return ret;
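
Note on the qgroup hunks in this update: they pair a data-space reservation with either a release (the accounting moved into a delayed ref) or a free (no delayed ref will ever account the range). A minimal sketch of that lifecycle, assuming the 4.4-era btrfs_qgroup_* API; the outcome helper is hypothetical and error handling is trimmed:

int example_write_path(struct inode *inode, u64 start, u64 len)
{
        int ret;

        /* Reserve qgroup space before dirtying the range. */
        ret = btrfs_qgroup_reserve_data(inode, start, len);
        if (ret)
                return ret;

        if (became_a_regular_data_extent(inode, start, len)) {
                /*
                 * Accounting moved into the delayed_ref_head: drop the
                 * range from the dirty map without freeing the space.
                 */
                btrfs_qgroup_release_data(inode, start, len);
        } else {
                /*
                 * Inline extent, NOCOW write or error: no delayed ref
                 * will ever account this range, so free it directly.
                 */
                btrfs_qgroup_free_data(inode, start, len);
        }
        return 0;
}
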
@@ -1096,6 +1103,9 @@ static noinline void async_cow_submit(struct btrfs_work *work)
        nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
                PAGE_CACHE_SHIFT;
 
+       /*
+        * atomic_sub_return implies a barrier for waitqueue_active
+        */
        if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
            5 * 1024 * 1024 &&
            waitqueue_active(&root->fs_info->async_submit_wait))
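
A note on the barrier comment above: waitqueue_active() is an unlocked check, so the waker needs a full barrier between updating the wait condition and testing the queue; here atomic_sub_return() supplies it, since value-returning atomics imply smp_mb(). A hypothetical waiter/waker pair showing the pairing, with LIMIT standing in for the 5 * 1024 * 1024 threshold:

#define LIMIT (5 * 1024 * 1024)

static atomic_t pending = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(wq);

static void waker(int nr)
{
        /* Full barrier: the condition update is ordered before the check. */
        if (atomic_sub_return(nr, &pending) < LIMIT && waitqueue_active(&wq))
                wake_up(&wq);
}

static void waiter(void)
{
        /* wait_event() re-checks the condition after queueing itself. */
        wait_event(wq, atomic_read(&pending) < LIMIT);
}
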
@@ -1294,8 +1304,14 @@ next_slot:
                num_bytes = 0;
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 
-               if (found_key.objectid > ino ||
-                   found_key.type > BTRFS_EXTENT_DATA_KEY ||
+               if (found_key.objectid > ino)
+                       break;
+               if (WARN_ON_ONCE(found_key.objectid < ino) ||
+                   found_key.type < BTRFS_EXTENT_DATA_KEY) {
+                       path->slots[0]++;
+                       goto next_slot;
+               }
+               if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
                    found_key.offset > end)
                        break;
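
The split checks above lean on btrfs key ordering: items in a leaf are sorted by (objectid, type, offset), compared lexicographically. Once objectid > ino the loop can stop for good, while objectid < ino (unexpected, hence the WARN_ON_ONCE) or type < BTRFS_EXTENT_DATA_KEY just means a stray item to skip. A sketch of the comparison, mirroring the in-tree btrfs_comp_cpu_keys():

static int key_cmp(const struct btrfs_key *a, const struct btrfs_key *b)
{
        if (a->objectid != b->objectid)
                return a->objectid < b->objectid ? -1 : 1;
        if (a->type != b->type)
                return a->type < b->type ? -1 : 1;
        if (a->offset != b->offset)
                return a->offset < b->offset ? -1 : 1;
        return 0;
}
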
 
@@ -1766,7 +1782,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 
                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
                    && do_list && !(state->state & EXTENT_NORESERVE))
-                       btrfs_free_reserved_data_space(inode, len);
+                       btrfs_free_reserved_data_space_noquota(inode,
+                                       state->start, len);
 
                __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
                                     root->fs_info->delalloc_batch);
@@ -1845,8 +1862,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
        int ret;
 
        ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
-       if (ret)
-               bio_endio(bio, ret);
+       if (ret) {
+               bio->bi_error = ret;
+               bio_endio(bio);
+       }
        return ret;
 }
 
@@ -1859,15 +1878,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                          u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
        int ret = 0;
        int skip_sum;
-       int metadata = 0;
        int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
        if (btrfs_is_free_space_inode(inode))
-               metadata = 2;
+               metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
 
        if (!(rw & REQ_WRITE)) {
                ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
@@ -1906,8 +1925,10 @@ mapit:
        ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
 
 out:
-       if (ret < 0)
-               bio_endio(bio, ret);
+       if (ret < 0) {
+               bio->bi_error = ret;
+               bio_endio(bio);
+       }
        return ret;
 }
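
The bio_endio() changes in the hunks above follow the 4.3+ block layer convention: completion status lives in bio->bi_error rather than being passed as an argument. The resulting pattern, as used throughout this update:

static void complete_with_error(struct bio *bio, int ret)
{
        if (ret) {
                bio->bi_error = ret;    /* was: bio_endio(bio, ret); */
                bio_endio(bio);
        }
}
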
 
@@ -1985,7 +2006,8 @@ again:
                goto again;
        }
 
-       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       ret = btrfs_delalloc_reserve_space(inode, page_start,
+                                          PAGE_CACHE_SIZE);
        if (ret) {
                mapping_set_error(page->mapping, ret);
                end_extent_writepage(page, ret, page_start, page_end);
@@ -2111,7 +2133,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        ins.type = BTRFS_EXTENT_ITEM_KEY;
        ret = btrfs_alloc_reserved_file_extent(trans, root,
                                        root->root_key.objectid,
-                                       btrfs_ino(inode), file_pos, &ins);
+                                       btrfs_ino(inode), file_pos,
+                                       ram_bytes, &ins);
+       /*
+        * Release the reserved range from the inode's dirty range map, as
+        * it has already been moved into the delayed_ref_head
+        */
+       btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
 out:
        btrfs_free_path(path);
 
@@ -2569,7 +2597,7 @@ again:
        ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
                        new->disk_len, 0,
                        backref->root_id, backref->inum,
-                       new->file_pos, 0);      /* start - extent_offset */
+                       new->file_pos); /* start - extent_offset */
        if (ret) {
                btrfs_abort_transaction(trans, root, ret);
                goto out_free_path;
@@ -2595,7 +2623,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
                return;
 
        list_for_each_entry_safe(old, tmp, &new->head, list) {
-               list_del(&old->list);
                kfree(old);
        }
        kfree(new);
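
The removed list_del() above was redundant: list_for_each_entry_safe() caches the next node, and the entries and the list head (inside 'new') are freed together, so unlinking each node buys nothing. The surviving pattern:

list_for_each_entry_safe(old, tmp, &new->head, list)
        kfree(old);     /* no list_del() needed */
kfree(new);             /* the list head goes away with it */
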
@@ -2820,6 +2847,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+               /*
+        * For the mwrite (mmap + memset to write) case, we still reserve
+        * space for the NOCOW range.
+        * As NOCOW won't cause a new delayed ref, just free the space here.
+                */
+               btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+                                      ordered_extent->len);
                btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (nolock)
                        trans = btrfs_join_transaction_nolock(root);
@@ -3014,8 +3049,6 @@ static int __readpage_endio_check(struct inode *inode,
        char *kaddr;
        u32 csum_expected;
        u32 csum = ~(u32)0;
-       static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-                                     DEFAULT_RATELIMIT_BURST);
 
        csum_expected = *(((u32 *)io_bio->csum) + icsum);
 
@@ -3028,9 +3061,8 @@ static int __readpage_endio_check(struct inode *inode,
        kunmap_atomic(kaddr);
        return 0;
 zeroit:
-       if (__ratelimit(&_rs))
-               btrfs_warn(BTRFS_I(inode)->root->fs_info,
-                          "csum failed ino %llu off %llu csum %u expected csum %u",
+       btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
+               "csum failed ino %llu off %llu csum %u expected csum %u",
                           btrfs_ino(inode), start, csum, csum_expected);
        memset(kaddr + pgoff, 1, len);
        flush_dcache_page(page);
@@ -3110,8 +3142,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
        if (empty)
                return;
 
-       down_read(&fs_info->delayed_iput_sem);
-
        spin_lock(&fs_info->delayed_iput_lock);
        list_splice_init(&fs_info->delayed_iputs, &list);
        spin_unlock(&fs_info->delayed_iput_lock);
@@ -3122,8 +3152,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
                iput(delayed->inode);
                kfree(delayed);
        }
-
-       up_read(&root->fs_info->delayed_iput_sem);
 }
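
With the semaphore gone, the function relies purely on the splice-under-spinlock pattern: detach the whole list while holding the lock, then process the entries with the lock dropped so iput() may sleep. A minimal sketch of the pattern, with a hypothetical item type:

struct delayed_item {
        struct list_head list;
        struct inode *inode;
};

static void run_delayed(spinlock_t *lock, struct list_head *pending)
{
        LIST_HEAD(list);
        struct delayed_item *it, *tmp;

        spin_lock(lock);
        list_splice_init(pending, &list);       /* 'pending' is now empty */
        spin_unlock(lock);

        /* The on-stack head is discarded, so no per-entry list_del(). */
        list_for_each_entry_safe(it, tmp, &list, list) {
                iput(it->inode);                /* may sleep; lock not held */
                kfree(it);
        }
}
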
 
 /*
@@ -3654,6 +3682,35 @@ cache_index:
                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
                        &BTRFS_I(inode)->runtime_flags);
 
+       /*
+        * We don't persist the id of the transaction where an unlink operation
+        * against the inode was last made. So here we assume the inode might
+        * have been evicted, and therefore the exact value of last_unlink_trans
+        * lost, and set it to last_trans to avoid metadata inconsistencies
+        * between the inode and its parent if the inode is fsync'ed and the log
+        * replayed. For example, in the scenario:
+        *
+        * touch mydir/foo
+        * ln mydir/foo mydir/bar
+        * sync
+        * unlink mydir/bar
+        * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
+        * xfs_io -c fsync mydir/foo
+        * <power failure>
+        * mount fs, triggers fsync log replay
+        *
+        * We must make sure that when we fsync our inode foo we also log its
+        * parent inode, otherwise after log replay the parent still has the
+        * dentry with the "bar" name but our inode foo has a link count of 1
+        * and doesn't have an inode ref with the name "bar" anymore.
+        *
+        * Setting last_unlink_trans to last_trans is a pessimistic approach,
+        * but it guarantees correctness at the expense of occasional full
+        * transaction commits on fsync if our inode is a directory, or if our
+        * inode is not a directory, logging its parent unnecessarily.
+        */
+       BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
+
        path->slots[0]++;
        if (inode->i_nlink != 1 ||
            path->slots[0] >= btrfs_header_nritems(leaf))
@@ -3985,9 +4042,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
  */
 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
 {
-       struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
-       int ret;
 
        /*
         * 1 for the possible orphan item
@@ -3996,27 +4051,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
         * 1 for the inode ref
         * 1 for the inode
         */
-       trans = btrfs_start_transaction(root, 5);
-       if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
-               return trans;
-
-       if (PTR_ERR(trans) == -ENOSPC) {
-               u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
-
-               trans = btrfs_start_transaction(root, 0);
-               if (IS_ERR(trans))
-                       return trans;
-               ret = btrfs_cond_migrate_bytes(root->fs_info,
-                                              &root->fs_info->trans_block_rsv,
-                                              num_bytes, 5);
-               if (ret) {
-                       btrfs_end_transaction(trans, root);
-                       return ERR_PTR(ret);
-               }
-               trans->block_rsv = &root->fs_info->trans_block_rsv;
-               trans->bytes_reserved = num_bytes;
-       }
-       return trans;
+       return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
 }
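
For reference, a sketch of the fallback the new helper factors out, reconstructed from the removed lines (the real btrfs_start_transaction_fallback_global_rsv() lives elsewhere in the tree): try a normal reservation first, and on -ENOSPC retry with bytes migrated from the global reserve.

static struct btrfs_trans_handle *
start_trans_fallback(struct btrfs_root *root, unsigned int num_items)
{
        struct btrfs_trans_handle *trans;
        u64 num_bytes;
        int ret;

        trans = btrfs_start_transaction(root, num_items);
        if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
                return trans;

        num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans))
                return trans;

        ret = btrfs_cond_migrate_bytes(root->fs_info,
                                       &root->fs_info->trans_block_rsv,
                                       num_bytes, num_items);
        if (ret) {
                btrfs_end_transaction(trans, root);
                return ERR_PTR(ret);
        }
        trans->block_rsv = &root->fs_info->trans_block_rsv;
        trans->bytes_reserved = num_bytes;
        return trans;
}
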
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4184,6 +4219,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
 
 }
 
+static int truncate_inline_extent(struct inode *inode,
+                                 struct btrfs_path *path,
+                                 struct btrfs_key *found_key,
+                                 const u64 item_end,
+                                 const u64 new_size)
+{
+       struct extent_buffer *leaf = path->nodes[0];
+       int slot = path->slots[0];
+       struct btrfs_file_extent_item *fi;
+       u32 size = (u32)(new_size - found_key->offset);
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+
+       fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+       if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
+               loff_t offset = new_size;
+               loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+
+               /*
+                * Zero out the remainder of the last page of our inline extent,
+                * instead of directly truncating our inline extent here - that
+                * would be much more complex (decompressing all the data, then
+                * compressing the truncated data, which might be bigger than
+                * the size of the inline extent, resize the extent, etc).
+                * We release the path because to get the page we might need to
+                * read the extent item from disk (data not in the page cache).
+                */
+               btrfs_release_path(path);
+               return btrfs_truncate_page(inode, offset, page_end - offset, 0);
+       }
+
+       btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+       size = btrfs_file_extent_calc_inline_size(size);
+       btrfs_truncate_item(root, path, size, 1);
+
+       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+               inode_sub_bytes(inode, item_end + 1 - new_size);
+
+       return 0;
+}
+
 /*
  * this can truncate away extent items, csum items and directory items.
  * It starts at a high offset and removes keys until it can't find
@@ -4209,7 +4285,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        u64 extent_num_bytes = 0;
        u64 extent_offset = 0;
        u64 item_end = 0;
-       u64 last_size = (u64)-1;
+       u64 last_size = new_size;
        u32 found_type = (u8)-1;
        int found_extent;
        int del_item;
@@ -4378,27 +4454,40 @@ search_again:
                         * special encodings
                         */
                        if (!del_item &&
-                           btrfs_file_extent_compression(leaf, fi) == 0 &&
                            btrfs_file_extent_encryption(leaf, fi) == 0 &&
                            btrfs_file_extent_other_encoding(leaf, fi) == 0) {
-                               u32 size = new_size - found_key.offset;
-
-                               if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-                                       inode_sub_bytes(inode, item_end + 1 -
-                                                       new_size);
 
                                /*
-                                * update the ram bytes to properly reflect
-                                * the new size of our item
+                                * Need to release path in order to truncate a
+                                * compressed extent. So delete any accumulated
+                                * extent items so far.
                                 */
-                               btrfs_set_file_extent_ram_bytes(leaf, fi, size);
-                               size =
-                                   btrfs_file_extent_calc_inline_size(size);
-                               btrfs_truncate_item(root, path, size, 1);
+                               if (btrfs_file_extent_compression(leaf, fi) !=
+                                   BTRFS_COMPRESS_NONE && pending_del_nr) {
+                                       err = btrfs_del_items(trans, root, path,
+                                                             pending_del_slot,
+                                                             pending_del_nr);
+                                       if (err) {
+                                               btrfs_abort_transaction(trans,
+                                                                       root,
+                                                                       err);
+                                               goto error;
+                                       }
+                                       pending_del_nr = 0;
+                               }
+
+                               err = truncate_inline_extent(inode, path,
+                                                            &found_key,
+                                                            item_end,
+                                                            new_size);
+                               if (err) {
+                                       btrfs_abort_transaction(trans,
+                                                               root, err);
+                                       goto error;
+                               }
                        } else if (test_bit(BTRFS_ROOT_REF_COWS,
                                            &root->state)) {
-                               inode_sub_bytes(inode, item_end + 1 -
-                                               found_key.offset);
+                               inode_sub_bytes(inode, item_end + 1 - new_size);
                        }
                }
 delete:
@@ -4428,7 +4517,7 @@ delete:
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes, 0,
                                                btrfs_header_owner(leaf),
-                                               ino, extent_offset, 0);
+                                               ino, extent_offset);
                        BUG_ON(ret);
                        if (btrfs_should_throttle_delayed_refs(trans, root))
                                btrfs_async_run_delayed_refs(root,
@@ -4493,8 +4582,7 @@ out:
                        btrfs_abort_transaction(trans, root, ret);
        }
 error:
-       if (last_size != (u64)-1 &&
-           root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+       if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
                btrfs_ordered_update_i_size(inode, last_size, NULL);
 
        btrfs_free_path(path);
@@ -4543,14 +4631,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
        if ((offset & (blocksize - 1)) == 0 &&
            (!len || ((len & (blocksize - 1)) == 0)))
                goto out;
-       ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       ret = btrfs_delalloc_reserve_space(inode,
+                       round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
        if (ret)
                goto out;
 
 again:
        page = find_or_create_page(mapping, index, mask);
        if (!page) {
-               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+               btrfs_delalloc_release_space(inode,
+                               round_down(from, PAGE_CACHE_SIZE),
+                               PAGE_CACHE_SIZE);
                ret = -ENOMEM;
                goto out;
        }
@@ -4618,7 +4709,8 @@ again:
 
 out_unlock:
        if (ret)
-               btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+               btrfs_delalloc_release_space(inode, page_start,
+                                            PAGE_CACHE_SIZE);
        unlock_page(page);
        page_cache_release(page);
 out:
@@ -4986,24 +5078,53 @@ static void evict_inode_truncate_pages(struct inode *inode)
        }
        write_unlock(&map_tree->lock);
 
+       /*
+        * Keep looping until we have no more ranges in the io tree.
+        * We can have ongoing bios started by readpages (called from readahead)
+        * that have their endio callback (extent_io.c:end_bio_extent_readpage)
+        * still in progress (unlocked the pages in the bio but did not yet
+        * unlocked the ranges in the io tree). Therefore this means some
+        * ranges can still be locked and eviction started because before
+        * submitting those bios, which are executed by a separate task (work
+        * queue kthread), inode references (inode->i_count) were not taken
+        * (which would be dropped in the end io callback of each bio).
+        * Therefore here we effectively end up waiting for those bios and
+        * anyone else holding locked ranges without having bumped the inode's
+        * reference count - if we don't do it, when they access the inode's
+        * io_tree to unlock a range it may be too late, leading to a
+        * use-after-free issue.
+        */
        spin_lock(&io_tree->lock);
        while (!RB_EMPTY_ROOT(&io_tree->state)) {
                struct extent_state *state;
                struct extent_state *cached_state = NULL;
+               u64 start;
+               u64 end;
 
                node = rb_first(&io_tree->state);
                state = rb_entry(node, struct extent_state, rb_node);
-               atomic_inc(&state->refs);
+               start = state->start;
+               end = state->end;
                spin_unlock(&io_tree->lock);
 
-               lock_extent_bits(io_tree, state->start, state->end,
-                                0, &cached_state);
-               clear_extent_bit(io_tree, state->start, state->end,
+               lock_extent_bits(io_tree, start, end, 0, &cached_state);
+
+               /*
+                * If the range still has the DELALLOC flag, the extent didn't
+                * reach disk, and its reserved space won't be freed by a
+                * delayed ref.
+                * So we need to free its reserved space here.
+                * (Refer to comment in btrfs_invalidatepage, case 2)
+                *
+                * Note: end is the bytenr of the last byte, so we need + 1 here.
+                */
+               if (state->state & EXTENT_DELALLOC)
+                       btrfs_qgroup_free_data(inode, start, end - start + 1);
+
+               clear_extent_bit(io_tree, start, end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
                                 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
                                 EXTENT_DEFRAG, 1, 1,
                                 &cached_state, GFP_NOFS);
-               free_extent_state(state);
 
                cond_resched();
                spin_lock(&io_tree->lock);
@@ -5035,7 +5156,8 @@ void btrfs_evict_inode(struct inode *inode)
                goto no_delete;
        }
        /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
-       btrfs_wait_ordered_range(inode, 0, (u64)-1);
+       if (!special_file(inode->i_mode))
+               btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
        btrfs_free_io_failure_record(inode, 0, (u64)-1);
 
@@ -5615,6 +5737,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
        char *name_ptr;
        int name_len;
        int is_curr = 0;        /* ctx->pos points to the current index? */
+       bool emitted;
 
        /* FIXME, use a real flag for deciding about the key type */
        if (root->fs_info->tree_root == root)
@@ -5643,6 +5766,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
        if (ret < 0)
                goto err;
 
+       emitted = false;
        while (1) {
                leaf = path->nodes[0];
                slot = path->slots[0];
@@ -5722,6 +5846,7 @@ skip:
 
                        if (over)
                                goto nopos;
+                       emitted = true;
                        di_len = btrfs_dir_name_len(leaf, di) +
                                 btrfs_dir_data_len(leaf, di) + sizeof(*di);
                        di_cur += di_len;
@@ -5734,11 +5859,20 @@ next:
        if (key_type == BTRFS_DIR_INDEX_KEY) {
                if (is_curr)
                        ctx->pos++;
-               ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+               ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
                if (ret)
                        goto nopos;
        }
 
+       /*
+        * If we haven't emitted any dir entry, we must not touch ctx->pos as
+        * it was set to the termination value in a previous call. We assume
+        * that "." and ".." were emitted if we reach this point and set the
+        * termination value as well for an empty directory.
+        */
+       if (ctx->pos > 2 && !emitted)
+               goto nopos;
+
        /* Reached end of directory/root. Bump pos past the last item. */
        ctx->pos++;
 
@@ -6218,9 +6352,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        u64 objectid;
        u64 index = 0;
 
-       if (!new_valid_dev(rdev))
-               return -EINVAL;
-
        /*
         * 2 for inode item and ref
         * 2 for dir items
@@ -6358,7 +6489,7 @@ out_unlock_inode:
 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                      struct dentry *dentry)
 {
-       struct btrfs_trans_handle *trans;
+       struct btrfs_trans_handle *trans = NULL;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct inode *inode = d_inode(old_dentry);
        u64 index;
@@ -6384,6 +6515,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        trans = btrfs_start_transaction(root, 5);
        if (IS_ERR(trans)) {
                err = PTR_ERR(trans);
+               trans = NULL;
                goto fail;
        }
 
@@ -6417,9 +6549,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                btrfs_log_new_name(trans, inode, NULL, parent);
        }
 
-       btrfs_end_transaction(trans, root);
        btrfs_balance_delayed_items(root);
 fail:
+       if (trans)
+               btrfs_end_transaction(trans, root);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@ -6860,8 +6993,7 @@ out:
 
        trace_btrfs_get_extent(root, em);
 
-       if (path)
-               btrfs_free_path(path);
+       btrfs_free_path(path);
        if (trans) {
                ret = btrfs_end_transaction(trans, root);
                if (!err)
@@ -7360,6 +7492,32 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
        return em;
 }
 
+struct btrfs_dio_data {
+       u64 outstanding_extents;
+       u64 reserve;
+};
+
+static void adjust_dio_outstanding_extents(struct inode *inode,
+                                          struct btrfs_dio_data *dio_data,
+                                          const u64 len)
+{
+       unsigned num_extents;
+
+       num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
+                                          BTRFS_MAX_EXTENT_SIZE);
+       /*
+        * If we have an outstanding_extents count still set then we're
+        * within our reservation, otherwise we need to adjust our inode
+        * counter appropriately.
+        */
+       if (dio_data->outstanding_extents) {
+               dio_data->outstanding_extents -= num_extents;
+       } else {
+               spin_lock(&BTRFS_I(inode)->lock);
+               BTRFS_I(inode)->outstanding_extents += num_extents;
+               spin_unlock(&BTRFS_I(inode)->lock);
+       }
+}
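
The extent count above is a ceiling division: div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE). A standalone worked example, assuming the 4.4 value BTRFS_MAX_EXTENT_SIZE = 128MiB:

#include <stdio.h>
#include <stdint.h>

#define MAX_EXTENT (128ULL * 1024 * 1024)       /* BTRFS_MAX_EXTENT_SIZE */

static uint64_t nr_extents(uint64_t len)
{
        /* Ceiling division: round any partial extent up to a whole one. */
        return (len + MAX_EXTENT - 1) / MAX_EXTENT;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)nr_extents(MAX_EXTENT));     /* 1 */
        printf("%llu\n", (unsigned long long)nr_extents(MAX_EXTENT + 1)); /* 2 */
        printf("%llu\n", (unsigned long long)nr_extents(1));              /* 1 */
        return 0;
}
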
 
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
@@ -7367,10 +7525,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
        struct extent_map *em;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_state *cached_state = NULL;
+       struct btrfs_dio_data *dio_data = NULL;
        u64 start = iblock << inode->i_blkbits;
        u64 lockstart, lockend;
        u64 len = bh_result->b_size;
-       u64 *outstanding_extents = NULL;
        int unlock_bits = EXTENT_LOCKED;
        int ret = 0;
 
@@ -7388,7 +7546,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
                 * that anything that needs to check if there's a transaction doesn't get
                 * confused.
                 */
-               outstanding_extents = current->journal_info;
+               dio_data = current->journal_info;
                current->journal_info = NULL;
        }
 
@@ -7396,8 +7554,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
         * If this errors out it's because we couldn't invalidate pagecache for
         * this range and we need to fallback to buffered.
         */
-       if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
-               return -ENOTBLK;
+       if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+                              create)) {
+               ret = -ENOTBLK;
+               goto err;
+       }
 
        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
        if (IS_ERR(em)) {
@@ -7515,21 +7676,11 @@ unlock:
                if (start + len > i_size_read(inode))
                        i_size_write(inode, start + len);
 
-               /*
-                * If we have an outstanding_extents count still set then we're
-                * within our reservation, otherwise we need to adjust our inode
-                * counter appropriately.
-                */
-               if (*outstanding_extents) {
-                       (*outstanding_extents)--;
-               } else {
-                       spin_lock(&BTRFS_I(inode)->lock);
-                       BTRFS_I(inode)->outstanding_extents++;
-                       spin_unlock(&BTRFS_I(inode)->lock);
-               }
-
-               current->journal_info = outstanding_extents;
-               btrfs_free_reserved_data_space(inode, len);
+               adjust_dio_outstanding_extents(inode, dio_data, len);
+               btrfs_free_reserved_data_space(inode, start, len);
+               WARN_ON(dio_data->reserve < len);
+               dio_data->reserve -= len;
+               current->journal_info = dio_data;
        }
 
        /*
@@ -7552,8 +7703,17 @@ unlock:
 unlock_err:
        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                         unlock_bits, 1, 0, &cached_state, GFP_NOFS);
-       if (outstanding_extents)
-               current->journal_info = outstanding_extents;
+err:
+       if (dio_data)
+               current->journal_info = dio_data;
+       /*
+        * Compensate the delalloc release we do in btrfs_direct_IO() when we
+        * write less data than expected, so that we don't underflow our inode's
+        * outstanding extents counter.
+        */
+       if (create && dio_data)
+               adjust_dio_outstanding_extents(inode, dio_data, len);
+
        return ret;
 }
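
The dio_data/journal_info dance above is a side channel: btrfs_direct_IO() stashes a pointer in current->journal_info before calling into the generic DIO code, and btrfs_get_blocks_direct() retrieves it, since __blockdev_direct_IO() offers no private argument to thread through. The shape of the pattern, with hypothetical names:

struct dio_ctx {
        u64 reserve;
        u64 outstanding_extents;
};

static void submit_side(void)
{
        struct dio_ctx ctx = { 0 };

        current->journal_info = &ctx;   /* stash for the callback */
        generic_dio_entry();            /* hypothetical; ends up in callback() */
        current->journal_info = NULL;   /* always restore */
}

static void callback(void)
{
        struct dio_ctx *ctx = current->journal_info;

        /* Clear it so transaction code doesn't mistake it for a handle. */
        current->journal_info = NULL;
        /* ... adjust ctx->reserve / ctx->outstanding_extents ... */
        current->journal_info = ctx;    /* hand it back */
}
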
 
@@ -7671,13 +7831,13 @@ struct btrfs_retry_complete {
        int uptodate;
 };
 
-static void btrfs_retry_endio_nocsum(struct bio *bio, int err)
+static void btrfs_retry_endio_nocsum(struct bio *bio)
 {
        struct btrfs_retry_complete *done = bio->bi_private;
        struct bio_vec *bvec;
        int i;
 
-       if (err)
+       if (bio->bi_error)
                goto end;
 
        done->uptodate = 1;
@@ -7726,7 +7886,7 @@ try_again:
        return 0;
 }
 
-static void btrfs_retry_endio(struct bio *bio, int err)
+static void btrfs_retry_endio(struct bio *bio)
 {
        struct btrfs_retry_complete *done = bio->bi_private;
        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
@@ -7735,7 +7895,7 @@ static void btrfs_retry_endio(struct bio *bio, int err)
        int ret;
        int i;
 
-       if (err)
+       if (bio->bi_error)
                goto end;
 
        uptodate = 1;
@@ -7818,12 +7978,13 @@ static int btrfs_subio_endio_read(struct inode *inode,
        }
 }
 
-static void btrfs_endio_direct_read(struct bio *bio, int err)
+static void btrfs_endio_direct_read(struct bio *bio)
 {
        struct btrfs_dio_private *dip = bio->bi_private;
        struct inode *inode = dip->inode;
        struct bio *dio_bio;
        struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+       int err = bio->bi_error;
 
        if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
                err = btrfs_subio_endio_read(inode, io_bio, err);
@@ -7834,17 +7995,15 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 
        kfree(dip);
 
-       /* If we had a csum failure make sure to clear the uptodate flag */
-       if (err)
-               clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
-       dio_end_io(dio_bio, err);
+       dio_bio->bi_error = bio->bi_error;
+       dio_end_io(dio_bio, bio->bi_error);
 
        if (io_bio->end_io)
                io_bio->end_io(io_bio, err);
        bio_put(bio);
 }
 
-static void btrfs_endio_direct_write(struct bio *bio, int err)
+static void btrfs_endio_direct_write(struct bio *bio)
 {
        struct btrfs_dio_private *dip = bio->bi_private;
        struct inode *inode = dip->inode;
@@ -7855,12 +8014,11 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
        struct bio *dio_bio;
        int ret;
 
-       if (err)
-               goto out_done;
 again:
        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
                                                   &ordered_offset,
-                                                  ordered_bytes, !err);
+                                                  ordered_bytes,
+                                                  !bio->bi_error);
        if (!ret)
                goto out_test;
 
@@ -7879,15 +8037,12 @@ out_test:
                ordered = NULL;
                goto again;
        }
-out_done:
        dio_bio = dip->dio_bio;
 
        kfree(dip);
 
-       /* If we had an error make sure to clear the uptodate flag */
-       if (err)
-               clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
-       dio_end_io(dio_bio, err);
+       dio_bio->bi_error = bio->bi_error;
+       dio_end_io(dio_bio, bio->bi_error);
        bio_put(bio);
 }
 
@@ -7902,9 +8057,10 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
        return 0;
 }
 
-static void btrfs_end_dio_bio(struct bio *bio, int err)
+static void btrfs_end_dio_bio(struct bio *bio)
 {
        struct btrfs_dio_private *dip = bio->bi_private;
+       int err = bio->bi_error;
 
        if (err)
                btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
@@ -7933,8 +8089,8 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
        if (dip->errors) {
                bio_io_error(dip->orig_bio);
        } else {
-               set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
-               bio_endio(dip->orig_bio, 0);
+               dip->dio_bio->bi_error = 0;
+               bio_endio(dip->orig_bio);
        }
 out:
        bio_put(bio);
@@ -7943,8 +8099,11 @@ out:
 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
                                       u64 first_sector, gfp_t gfp_flags)
 {
-       int nr_vecs = bio_get_nr_vecs(bdev);
-       return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
+       struct bio *bio;
+       bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
+       if (bio)
+               bio_associate_current(bio);
+       return bio;
 }
 
 static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root,
@@ -8147,9 +8306,8 @@ out_err:
 static void btrfs_submit_direct(int rw, struct bio *dio_bio,
                                struct inode *inode, loff_t file_offset)
 {
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_dio_private *dip;
-       struct bio *io_bio;
+       struct btrfs_dio_private *dip = NULL;
+       struct bio *io_bio = NULL;
        struct btrfs_io_bio *btrfs_bio;
        int skip_sum;
        int write = rw & REQ_WRITE;
@@ -8166,7 +8324,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
        dip = kzalloc(sizeof(*dip), GFP_NOFS);
        if (!dip) {
                ret = -ENOMEM;
-               goto free_io_bio;
+               goto free_ordered;
        }
 
        dip->private = dio_bio->bi_private;
@@ -8194,25 +8352,56 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 
        if (btrfs_bio->end_io)
                btrfs_bio->end_io(btrfs_bio, ret);
-free_io_bio:
-       bio_put(io_bio);
 
 free_ordered:
        /*
-        * If this is a write, we need to clean up the reserved space and kill
-        * the ordered extent.
+        * If we arrived here it means either we failed to submit the dip
+        * or we failed to clone the dio_bio or to allocate the
+        * dip. If we cloned the dio_bio and allocated the dip, we can just
+        * call bio_endio against our io_bio so that we get proper resource
+        * cleanup if we fail to submit the dip, otherwise, we must do the
+        * same as btrfs_endio_direct_[write|read] because we can't call these
+        * callbacks - they require an allocated dip and a clone of dio_bio.
         */
-       if (write) {
-               struct btrfs_ordered_extent *ordered;
-               ordered = btrfs_lookup_ordered_extent(inode, file_offset);
-               if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
-                   !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
-                       btrfs_free_reserved_extent(root, ordered->start,
-                                                  ordered->disk_len, 1);
-               btrfs_put_ordered_extent(ordered);
-               btrfs_put_ordered_extent(ordered);
+       if (io_bio && dip) {
+               io_bio->bi_error = -EIO;
+               bio_endio(io_bio);
+               /*
+                * The end io callbacks free our dip, do the final put on io_bio
+                * and all the cleanup and final put for dio_bio (through
+                * dio_end_io()).
+                */
+               dip = NULL;
+               io_bio = NULL;
+       } else {
+               if (write) {
+                       struct btrfs_ordered_extent *ordered;
+
+                       ordered = btrfs_lookup_ordered_extent(inode,
+                                                             file_offset);
+                       set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+                       /*
+                        * Decrements our ref on the ordered extent and removes
+                        * the ordered extent from the inode's ordered tree,
+                        * doing all the proper resource cleanup such as for the
+                        * reserved space and waking up any waiters for this
+                        * ordered extent (through btrfs_remove_ordered_extent).
+                        */
+                       btrfs_finish_ordered_io(ordered);
+               } else {
+                       unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+                             file_offset + dio_bio->bi_iter.bi_size - 1);
+               }
+               dio_bio->bi_error = -EIO;
+               /*
+                * Releases and cleans up our dio_bio, no need to bio_put()
+                * nor bio_endio()/bio_io_error() against dio_bio.
+                */
+               dio_end_io(dio_bio, ret);
        }
-       bio_endio(dio_bio, ret);
+       if (io_bio)
+               bio_put(io_bio);
+       kfree(dip);
 }
 
 static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb,
@@ -8253,7 +8442,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
-       u64 outstanding_extents = 0;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_dio_data dio_data = { 0 };
        size_t count = 0;
        int flags = 0;
        bool wakeup = true;
@@ -8288,10 +8478,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                        mutex_unlock(&inode->i_mutex);
                        relock = true;
                }
-               ret = btrfs_delalloc_reserve_space(inode, count);
+               ret = btrfs_delalloc_reserve_space(inode, offset, count);
                if (ret)
                        goto out;
-               outstanding_extents = div64_u64(count +
+               dio_data.outstanding_extents = div64_u64(count +
                                                BTRFS_MAX_EXTENT_SIZE - 1,
                                                BTRFS_MAX_EXTENT_SIZE);
 
@@ -8300,7 +8490,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                 * do the accounting properly if we go over the number we
                 * originally calculated.  Abuse current->journal_info for this.
                 */
-               current->journal_info = &outstanding_extents;
+               dio_data.reserve = round_up(count, root->sectorsize);
+               current->journal_info = &dio_data;
        } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
                                     &BTRFS_I(inode)->runtime_flags)) {
                inode_dio_end(inode);
@@ -8314,10 +8505,12 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                                   btrfs_submit_direct, flags);
        if (iov_iter_rw(iter) == WRITE) {
                current->journal_info = NULL;
-               if (ret < 0 && ret != -EIOCBQUEUED)
-                       btrfs_delalloc_release_space(inode, count);
-               else if (ret >= 0 && (size_t)ret < count)
-                       btrfs_delalloc_release_space(inode,
+               if (ret < 0 && ret != -EIOCBQUEUED) {
+                       if (dio_data.reserve)
+                               btrfs_delalloc_release_space(inode, offset,
+                                                            dio_data.reserve);
+               } else if (ret >= 0 && (size_t)ret < count)
+                       btrfs_delalloc_release_space(inode, offset,
                                                     count - (size_t)ret);
        }
 out:
@@ -8353,15 +8546,28 @@ int btrfs_readpage(struct file *file, struct page *page)
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
-
+       struct inode *inode = page->mapping->host;
+       int ret;
 
        if (current->flags & PF_MEMALLOC) {
                redirty_page_for_writepage(wbc, page);
                unlock_page(page);
                return 0;
        }
+
+       /*
+        * If we are under memory pressure we will call this directly from the
+        * VM, we need to make sure we have the inode referenced for the ordered
+        * extent.  If not, just return as if we didn't do anything.
+        */
+       if (!igrab(inode)) {
+               redirty_page_for_writepage(wbc, page);
+               return AOP_WRITEPAGE_ACTIVATE;
+       }
        tree = &BTRFS_I(page->mapping->host)->io_tree;
-       return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+       ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
+       btrfs_add_delayed_iput(inode);
+       return ret;
 }
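
A note on the igrab() added above: igrab() returns NULL when the inode is already being torn down, in which case the page is redirtied and handed back to reclaim untouched. On success, the reference is dropped via btrfs_add_delayed_iput() instead of a direct iput(), keeping the final put out of the writeback path. The skeleton of the hunk:

if (!igrab(inode)) {                    /* inode being freed: don't touch it */
        redirty_page_for_writepage(wbc, page);
        return AOP_WRITEPAGE_ACTIVATE;
}
/* ... inode is pinned while the write is set up ... */
btrfs_add_delayed_iput(inode);          /* defer the put out of writeback */
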
 
 static int btrfs_writepages(struct address_space *mapping,
@@ -8476,6 +8682,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
                }
        }
 
+       /*
+        * Qgroup reserved space handler
+        * Page here will be either
+        * 1) Already written to disk
+        *    In this case, its reserved space is released from data rsv map
+        *    and will be freed by delayed_ref handler finally.
+        *    So even if we call qgroup_free_data(), it won't decrease the
+        *    reserved space.
+        * 2) Not written to disk
+        *    This means the reserved space should be freed here.
+        */
+       btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
        if (!inode_evicting) {
                clear_extent_bit(tree, page_start, page_end,
                                 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8526,7 +8744,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        u64 page_end;
 
        sb_start_pagefault(inode->i_sb);
-       ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+       page_start = page_offset(page);
+       page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+       ret = btrfs_delalloc_reserve_space(inode, page_start,
+                                          PAGE_CACHE_SIZE);
        if (!ret) {
                ret = file_update_time(vma->vm_file);
                reserved = 1;
@@ -8545,8 +8767,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 again:
        lock_page(page);
        size = i_size_read(inode);
-       page_start = page_offset(page);
-       page_end = page_start + PAGE_CACHE_SIZE - 1;
 
        if ((page->mapping != inode->i_mapping) ||
            (page_start >= size)) {
@@ -8623,7 +8843,7 @@ out_unlock:
        }
        unlock_page(page);
 out:
-       btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+       btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
 out_noreserve:
        sb_end_pagefault(inode->i_sb);
        return ret;
@@ -8912,6 +9132,7 @@ void btrfs_destroy_inode(struct inode *inode)
                        btrfs_put_ordered_extent(ordered);
                }
        }
+       btrfs_qgroup_check_reserved_leak(inode);
        inode_tree_del(inode);
        btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
@@ -9440,9 +9661,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        /*
         * 2 items for inode item and ref
         * 2 items for dir items
+        * 1 item for updating parent inode item
+        * 1 item for the inline extent item
         * 1 item for xattr if selinux is on
         */
-       trans = btrfs_start_transaction(root, 5);
+       trans = btrfs_start_transaction(root, 7);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
 
@@ -9548,6 +9771,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
        u64 cur_offset = start;
        u64 i_size;
        u64 cur_bytes;
+       u64 last_alloc = (u64)-1;
        int ret = 0;
        bool own_trans = true;
 
@@ -9564,6 +9788,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 
                cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
                cur_bytes = max(cur_bytes, min_size);
+               /*
+                * If we are severely fragmented we could end up with really
+                * small allocations, so if the allocator is returning small
+                * chunks lets make its job easier by only searching for those
+                * sized chunks.
+                */
+               cur_bytes = min(cur_bytes, last_alloc);
                ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
                                           *alloc_hint, &ins, 1, 0);
                if (ret) {
@@ -9572,6 +9803,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                        break;
                }
 
+               last_alloc = ins.offset;
                ret = insert_reserved_file_extent(trans, inode,
                                                  cur_offset, ins.objectid,
                                                  ins.offset, ins.offset,