These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] kernel/fs/btrfs/extent-tree.c
index 0ec3acd..2368cac 100644
@@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                              u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
-                               u64 bytenr, u64 num_bytes, u64 parent,
+                               struct btrfs_delayed_ref_node *node, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
-                               struct btrfs_delayed_extent_op *extra_op,
-                               int no_quota);
+                               struct btrfs_delayed_extent_op *extra_op);
 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
                                    struct extent_buffer *leaf,
                                    struct btrfs_extent_item *ei);
@@ -96,8 +95,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
-                                    int level, struct btrfs_key *ins,
-                                    int no_quota);
+                                    int level, struct btrfs_key *ins);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 flags,
                          int force);
@@ -126,7 +124,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
        return (cache->flags & bits) == bits;
 }
 
-static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
+void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 {
        atomic_inc(&cache->count);
 }
@@ -333,6 +331,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
                kfree(ctl);
 }
 
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_root *root,
+                               struct btrfs_block_group_cache *block_group)
+{
+       u64 start = block_group->key.objectid;
+       u64 len = block_group->key.offset;
+       u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+               root->nodesize : root->sectorsize;
+       u64 step = chunk << 1;
+
+       while (len > chunk) {
+               btrfs_remove_free_space(block_group, start, chunk);
+               start += step;
+               if (len < step)
+                       len = 0;
+               else
+                       len -= step;
+       }
+}
+#endif
+
 /*
  * this is only called by cache_block_group, since we could have freed extents
  * we need to check the pinned_extents for any extents that can't be used yet
@@ -389,6 +408,7 @@ static noinline void caching_thread(struct btrfs_work *work)
        u64 last = 0;
        u32 nritems;
        int ret = -ENOMEM;
+       bool wakeup = true;
 
        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
@@ -401,6 +421,15 @@ static noinline void caching_thread(struct btrfs_work *work)
 
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       /*
+        * If we're fragmenting we don't want to make anybody think we can
+        * allocate from this block group until we've had a chance to fragment
+        * the free space.
+        */
+       if (btrfs_should_fragment_free_space(extent_root, block_group))
+               wakeup = false;
+#endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
@@ -442,7 +471,8 @@ next:
 
                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
-                               caching_ctl->progress = last;
+                               if (wakeup)
+                                       caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
@@ -465,7 +495,8 @@ next:
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;
 
-                       caching_ctl->progress = last;
+                       if (wakeup)
+                               caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }
@@ -492,7 +523,8 @@ next:
 
                        if (total_found > (1024 * 1024 * 2)) {
                                total_found = 0;
-                               wake_up(&caching_ctl->wait);
+                               if (wakeup)
+                                       wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
@@ -502,13 +534,27 @@ next:
        total_found += add_new_free_space(block_group, fs_info, last,
                                          block_group->key.objectid +
                                          block_group->key.offset);
-       caching_ctl->progress = (u64)-1;
-
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+               u64 bytes_used;
+
+               spin_lock(&block_group->space_info->lock);
+               spin_lock(&block_group->lock);
+               bytes_used = block_group->key.offset -
+                       btrfs_block_group_used(&block_group->item);
+               block_group->space_info->bytes_used += bytes_used >> 1;
+               spin_unlock(&block_group->lock);
+               spin_unlock(&block_group->space_info->lock);
+               fragment_free_space(extent_root, block_group);
+       }
+#endif
+
+       caching_ctl->progress = (u64)-1;
 err:
        btrfs_free_path(path);
        up_read(&fs_info->commit_root_sem);
@@ -608,6 +654,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
                        }
                }
                spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+               if (ret == 1 &&
+                   btrfs_should_fragment_free_space(fs_info->extent_root,
+                                                    cache)) {
+                       u64 bytes_used;
+
+                       spin_lock(&cache->space_info->lock);
+                       spin_lock(&cache->lock);
+                       bytes_used = cache->key.offset -
+                               btrfs_block_group_used(&cache->item);
+                       cache->space_info->bytes_used += bytes_used >> 1;
+                       spin_unlock(&cache->lock);
+                       spin_unlock(&cache->space_info->lock);
+                       fragment_free_space(fs_info->extent_root, cache);
+               }
+#endif
                mutex_unlock(&caching_ctl->mutex);
 
                wake_up(&caching_ctl->wait);
@@ -1317,8 +1379,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static noinline u32 extent_data_ref_count(struct btrfs_root *root,
-                                         struct btrfs_path *path,
+static noinline u32 extent_data_ref_count(struct btrfs_path *path,
                                          struct btrfs_extent_inline_ref *iref)
 {
        struct btrfs_key key;
@@ -1884,10 +1945,77 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static int btrfs_issue_discard(struct block_device *bdev,
-                               u64 start, u64 len)
+#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
+static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
+                              u64 *discarded_bytes)
 {
-       return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
+       int j, ret = 0;
+       u64 bytes_left, end;
+       u64 aligned_start = ALIGN(start, 1 << 9);
+
+       if (WARN_ON(start != aligned_start)) {
+               len -= aligned_start - start;
+               len = round_down(len, 1 << 9);
+               start = aligned_start;
+       }
+
+       *discarded_bytes = 0;
+
+       if (!len)
+               return 0;
+
+       end = start + len;
+       bytes_left = len;
+
+       /* Skip any superblocks on this device. */
+       for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
+               u64 sb_start = btrfs_sb_offset(j);
+               u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
+               u64 size = sb_start - start;
+
+               if (!in_range(sb_start, start, bytes_left) &&
+                   !in_range(sb_end, start, bytes_left) &&
+                   !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
+                       continue;
+
+               /*
+                * Superblock spans beginning of range.  Adjust start and
+                * try again.
+                */
+               if (sb_start <= start) {
+                       start += sb_end - start;
+                       if (start > end) {
+                               bytes_left = 0;
+                               break;
+                       }
+                       bytes_left = end - start;
+                       continue;
+               }
+
+               if (size) {
+                       ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
+                                                  GFP_NOFS, 0);
+                       if (!ret)
+                               *discarded_bytes += size;
+                       else if (ret != -EOPNOTSUPP)
+                               return ret;
+               }
+
+               start = sb_end;
+               if (start > end) {
+                       bytes_left = 0;
+                       break;
+               }
+               bytes_left = end - start;
+       }
+
+       if (bytes_left) {
+               ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
+                                          GFP_NOFS, 0);
+               if (!ret)
+                       *discarded_bytes += bytes_left;
+       }
+       return ret;
 }
 
 int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
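
For reference (not part of the patch): a minimal userspace sketch of the superblock-skipping split that the new btrfs_issue_discard() above performs. The mirror offsets, SB_SIZE, and the issue_discard() stand-in are assumptions chosen for the example, not kernel API; the kernel version additionally handles sector alignment and accumulates *discarded_bytes for the caller.

/*
 * Illustrative sketch, not kernel code: split a discard request around the
 * fixed superblock mirror locations and report how much was actually
 * discarded.
 */
#include <stdint.h>
#include <stdio.h>

#define SB_COUNT 3
#define SB_SIZE  (64 * 1024ULL)                 /* assumed superblock size */

/* Assumed mirror offsets (64KiB, 64MiB, 256GiB), in ascending order. */
static const uint64_t sb_offsets[SB_COUNT] = {
        64 * 1024ULL,
        64 * 1024ULL * 1024ULL,
        256ULL * 1024 * 1024 * 1024,
};

/* Stand-in for blkdev_issue_discard(). */
static void issue_discard(uint64_t start, uint64_t len)
{
        printf("discard [%llu, %llu)\n",
               (unsigned long long)start, (unsigned long long)(start + len));
}

static uint64_t discard_skipping_sbs(uint64_t start, uint64_t len)
{
        uint64_t end = start + len, discarded = 0;

        for (int i = 0; i < SB_COUNT && start < end; i++) {
                uint64_t sb_start = sb_offsets[i];
                uint64_t sb_end = sb_start + SB_SIZE;

                if (sb_end <= start || sb_start >= end)
                        continue;               /* no overlap with this mirror */
                if (sb_start > start) {
                        issue_discard(start, sb_start - start);
                        discarded += sb_start - start;
                }
                start = sb_end;                 /* resume after the superblock */
        }
        if (start < end) {
                issue_discard(start, end - start);
                discarded += end - start;
        }
        return discarded;
}

int main(void)
{
        /* A range straddling the second mirror gets split into two pieces. */
        discard_skipping_sbs(60ULL * 1024 * 1024, 16ULL * 1024 * 1024);
        return 0;
}
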
@@ -1908,14 +2036,16 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 
 
                for (i = 0; i < bbio->num_stripes; i++, stripe++) {
+                       u64 bytes;
                        if (!stripe->dev->can_discard)
                                continue;
 
                        ret = btrfs_issue_discard(stripe->dev->bdev,
                                                  stripe->physical,
-                                                 stripe->length);
+                                                 stripe->length,
+                                                 &bytes);
                        if (!ret)
-                               discarded_bytes += stripe->length;
+                               discarded_bytes += bytes;
                        else if (ret != -EOPNOTSUPP)
                                break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
 
@@ -1942,8 +2072,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
-                        u64 root_objectid, u64 owner, u64 offset,
-                        int no_quota)
+                        u64 root_objectid, u64 owner, u64 offset)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1955,22 +2084,21 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+                                       BTRFS_ADD_DELAYED_REF, NULL);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
-                                       num_bytes,
-                                       parent, root_objectid, owner, offset,
-                                       BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+                                       num_bytes, parent, root_objectid,
+                                       owner, offset, 0,
+                                       BTRFS_ADD_DELAYED_REF, NULL);
        }
        return ret;
 }
 
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
-                                 u64 bytenr, u64 num_bytes,
+                                 struct btrfs_delayed_ref_node *node,
                                  u64 parent, u64 root_objectid,
                                  u64 owner, u64 offset, int refs_to_add,
-                                 int no_quota,
                                  struct btrfs_delayed_extent_op *extent_op)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -1978,17 +2106,15 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        struct extent_buffer *leaf;
        struct btrfs_extent_item *item;
        struct btrfs_key key;
+       u64 bytenr = node->bytenr;
+       u64 num_bytes = node->num_bytes;
        u64 refs;
        int ret;
-       enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL;
 
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
 
-       if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
-               no_quota = 1;
-
        path->reada = 1;
        path->leave_spinning = 1;
        /* this will setup the path even if it fails to insert the back ref */
@@ -1996,26 +2122,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                           bytenr, num_bytes, parent,
                                           root_objectid, owner, offset,
                                           refs_to_add, extent_op);
-       if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota))
+       if ((ret < 0 && ret != -EAGAIN) || !ret)
                goto out;
-       /*
-        * Ok we were able to insert an inline extent and it appears to be a new
-        * reference, deal with the qgroup accounting.
-        */
-       if (!ret && !no_quota) {
-               ASSERT(root->fs_info->quota_enabled);
-               leaf = path->nodes[0];
-               btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-               item = btrfs_item_ptr(leaf, path->slots[0],
-                                     struct btrfs_extent_item);
-               if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add)
-                       type = BTRFS_QGROUP_OPER_ADD_SHARED;
-               btrfs_release_path(path);
-
-               ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-                                             bytenr, num_bytes, type, 0);
-               goto out;
-       }
 
        /*
         * Ok we had -EAGAIN which means we didn't have space to insert and
@@ -2026,8 +2134,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, item);
-       if (refs)
-               type = BTRFS_QGROUP_OPER_ADD_SHARED;
        btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
        if (extent_op)
                __run_delayed_extent_op(extent_op, leaf, item);
@@ -2035,13 +2141,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(path);
 
-       if (!no_quota) {
-               ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-                                             bytenr, num_bytes, type, 0);
-               if (ret)
-                       goto out;
-       }
-
        path->reada = 1;
        path->leave_spinning = 1;
        /* now insert the actual backref */
@@ -2087,17 +2186,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
                                                 ref->objectid, ref->offset,
                                                 &ins, node->ref_mod);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
-               ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
-                                            node->num_bytes, parent,
+               ret = __btrfs_inc_extent_ref(trans, root, node, parent,
                                             ref_root, ref->objectid,
                                             ref->offset, node->ref_mod,
-                                            node->no_quota, extent_op);
+                                            extent_op);
        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
-               ret = __btrfs_free_extent(trans, root, node->bytenr,
-                                         node->num_bytes, parent,
+               ret = __btrfs_free_extent(trans, root, node, parent,
                                          ref_root, ref->objectid,
                                          ref->offset, node->ref_mod,
-                                         extent_op, node->no_quota);
+                                         extent_op);
        } else {
                BUG();
        }
@@ -2252,18 +2349,16 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
                                                parent, ref_root,
                                                extent_op->flags_to_set,
                                                &extent_op->key,
-                                               ref->level, &ins,
-                                               node->no_quota);
+                                               ref->level, &ins);
        } else if (node->action == BTRFS_ADD_DELAYED_REF) {
-               ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
-                                            node->num_bytes, parent, ref_root,
-                                            ref->level, 0, 1, node->no_quota,
+               ret = __btrfs_inc_extent_ref(trans, root, node,
+                                            parent, ref_root,
+                                            ref->level, 0, 1,
                                             extent_op);
        } else if (node->action == BTRFS_DROP_DELAYED_REF) {
-               ret = __btrfs_free_extent(trans, root, node->bytenr,
-                                         node->num_bytes, parent, ref_root,
-                                         ref->level, 0, 1, extent_op,
-                                         node->no_quota);
+               ret = __btrfs_free_extent(trans, root, node,
+                                         parent, ref_root,
+                                         ref->level, 0, 1, extent_op);
        } else {
                BUG();
        }
@@ -2307,6 +2402,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                                                      node->num_bytes);
                        }
                }
+
+               /* Also free its reserved qgroup space */
+               btrfs_qgroup_free_delayed_ref(root->fs_info,
+                                             head->qgroup_ref_root,
+                                             head->qgroup_reserved);
                return ret;
        }
 
@@ -2323,28 +2423,27 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-static noinline struct btrfs_delayed_ref_node *
+static inline struct btrfs_delayed_ref_node *
 select_delayed_ref(struct btrfs_delayed_ref_head *head)
 {
-       struct rb_node *node;
-       struct btrfs_delayed_ref_node *ref, *last = NULL;;
+       struct btrfs_delayed_ref_node *ref;
+
+       if (list_empty(&head->ref_list))
+               return NULL;
 
        /*
-        * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
-        * this prevents ref count from going down to zero when
-        * there still are pending delayed ref.
+        * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
+        * This is to prevent a ref count from going down to zero, which deletes
+        * the extent item from the extent tree, when there still are references
+        * to add, which would fail because they would not find the extent item.
         */
-       node = rb_first(&head->ref_root);
-       while (node) {
-               ref = rb_entry(node, struct btrfs_delayed_ref_node,
-                               rb_node);
+       list_for_each_entry(ref, &head->ref_list, list) {
                if (ref->action == BTRFS_ADD_DELAYED_REF)
                        return ref;
-               else if (last == NULL)
-                       last = ref;
-               node = rb_next(node);
        }
-       return last;
+
+       return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
+                         list);
 }
 
 /*
@@ -2402,6 +2501,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                 * and then it being added back again before the drop can
                 * finish.  If we merged anything we need to re-loop so we can
                 * get a good ref.
+                * Or we can get node references of the same type that weren't
+                * merged when created due to bumps in the tree mod seq, and
+                * we need to merge them to prevent adding an inline extent
+                * backref before dropping it (triggering a BUG_ON at
+                * insert_inline_extent_backref()).
                 */
                spin_lock(&locked_ref->lock);
                btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
@@ -2482,7 +2586,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                        spin_unlock(&locked_ref->lock);
                        spin_lock(&delayed_refs->lock);
                        spin_lock(&locked_ref->lock);
-                       if (rb_first(&locked_ref->ref_root) ||
+                       if (!list_empty(&locked_ref->ref_list) ||
                            locked_ref->extent_op) {
                                spin_unlock(&locked_ref->lock);
                                spin_unlock(&delayed_refs->lock);
@@ -2496,7 +2600,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                } else {
                        actual_count++;
                        ref->in_tree = 0;
-                       rb_erase(&ref->rb_node, &locked_ref->ref_root);
+                       list_del(&ref->list);
                }
                atomic_dec(&delayed_refs->num_entries);
 
@@ -2800,6 +2904,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_head *head;
        int ret;
        int run_all = count == (unsigned long)-1;
+       bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 
        /* We'll clean this up in btrfs_cleanup_transaction */
        if (trans->aborted)
@@ -2816,6 +2921,7 @@ again:
 #ifdef SCRAMBLE_DELAYED_REFS
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
+       trans->can_flush_pending_bgs = false;
        ret = __btrfs_run_delayed_refs(trans, root, count);
        if (ret < 0) {
                btrfs_abort_transaction(trans, root, ret);
@@ -2864,10 +2970,8 @@ again:
                goto again;
        }
 out:
-       ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info);
-       if (ret)
-               return ret;
        assert_qgroups_uptodate(trans);
+       trans->can_flush_pending_bgs = can_flush_pending_bgs;
        return 0;
 }
 
@@ -2905,7 +3009,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_node *ref;
        struct btrfs_delayed_data_ref *data_ref;
        struct btrfs_delayed_ref_root *delayed_refs;
-       struct rb_node *node;
        int ret = 0;
 
        delayed_refs = &trans->transaction->delayed_refs;
@@ -2934,11 +3037,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
        spin_unlock(&delayed_refs->lock);
 
        spin_lock(&head->lock);
-       node = rb_first(&head->ref_root);
-       while (node) {
-               ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-               node = rb_next(node);
-
+       list_for_each_entry(ref, &head->ref_list, list) {
                /* If it's a shared ref we know a cross reference exists */
                if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
                        ret = 1;
@@ -3086,7 +3185,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
        int level;
        int ret = 0;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                           u64, u64, u64, u64, u64, u64, int);
+                           u64, u64, u64, u64, u64, u64);
 
 
        if (btrfs_test_is_dummy_root(root))
@@ -3127,15 +3226,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
                        key.offset -= btrfs_file_extent_offset(buf, fi);
                        ret = process_func(trans, root, bytenr, num_bytes,
                                           parent, ref_root, key.objectid,
-                                          key.offset, 1);
+                                          key.offset);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, i);
                        num_bytes = root->nodesize;
                        ret = process_func(trans, root, bytenr, num_bytes,
-                                          parent, ref_root, level - 1, 0,
-                                          1);
+                                          parent, ref_root, level - 1, 0);
                        if (ret)
                                goto fail;
                }
@@ -3315,6 +3413,15 @@ again:
        }
        spin_unlock(&block_group->lock);
 
+       /*
+        * We hit an ENOSPC when setting up the cache in this transaction, just
+        * skip doing the setup, we've already cleared the cache so we're safe.
+        */
+       if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+               ret = -ENOSPC;
+               goto out_put;
+       }
+
        /*
         * Try to preallocate enough space based on how big the block group is.
         * Keep in mind this has to include any pinned space which could end up
@@ -3328,16 +3435,26 @@ again:
        num_pages *= 16;
        num_pages *= PAGE_CACHE_SIZE;
 
-       ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+       ret = btrfs_check_data_free_space(inode, 0, num_pages);
        if (ret)
                goto out_put;
 
        ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
                                              num_pages, num_pages,
                                              &alloc_hint);
+       /*
+        * Our cache requires contiguous chunks so that we don't modify a bunch
+        * of metadata or split extents when writing the cache out, which means
+        * we can enospc if we are heavily fragmented in addition to just normal
+        * out of space conditions.  So if we hit this just skip setting up any
+        * other block groups for this transaction, maybe we'll unpin enough
+        * space the next time around.
+        */
        if (!ret)
                dcs = BTRFS_DC_SETUP;
-       btrfs_free_reserved_data_space(inode, num_pages);
+       else if (ret == -ENOSPC)
+               set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+       btrfs_free_reserved_data_space(inode, 0, num_pages);
 
 out_put:
        iput(inode);
@@ -3693,7 +3810,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                found->disk_total += total_bytes * factor;
                found->bytes_used += bytes_used;
                found->disk_used += bytes_used * factor;
-               found->full = 0;
+               if (total_bytes > 0)
+                       found->full = 0;
                spin_unlock(&found->lock);
                *space_info = found;
                return 0;
@@ -3722,6 +3840,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        found->bytes_readonly = 0;
        found->bytes_may_use = 0;
        found->full = 0;
+       found->max_extent_size = 0;
        found->force_alloc = CHUNK_ALLOC_NO_FORCE;
        found->chunk_alloc = 0;
        found->flush = 0;
@@ -3798,7 +3917,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        u64 num_devices = root->fs_info->fs_devices->rw_devices;
        u64 target;
-       u64 tmp;
+       u64 raid_type;
+       u64 allowed = 0;
 
        /*
         * see if restripe for this chunk_type is in progress, if so
@@ -3816,31 +3936,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        spin_unlock(&root->fs_info->balance_lock);
 
        /* First, mask out the RAID levels which aren't possible */
-       if (num_devices == 1)
-               flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
-                          BTRFS_BLOCK_GROUP_RAID5);
-       if (num_devices < 3)
-               flags &= ~BTRFS_BLOCK_GROUP_RAID6;
-       if (num_devices < 4)
-               flags &= ~BTRFS_BLOCK_GROUP_RAID10;
-
-       tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
-                      BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
-                      BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
-       flags &= ~tmp;
-
-       if (tmp & BTRFS_BLOCK_GROUP_RAID6)
-               tmp = BTRFS_BLOCK_GROUP_RAID6;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
-               tmp = BTRFS_BLOCK_GROUP_RAID5;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
-               tmp = BTRFS_BLOCK_GROUP_RAID10;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
-               tmp = BTRFS_BLOCK_GROUP_RAID1;
-       else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
-               tmp = BTRFS_BLOCK_GROUP_RAID0;
-
-       return extended_to_chunk(flags | tmp);
+       for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+               if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+                       allowed |= btrfs_raid_group[raid_type];
+       }
+       allowed &= flags;
+
+       if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+               allowed = BTRFS_BLOCK_GROUP_RAID6;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+               allowed = BTRFS_BLOCK_GROUP_RAID5;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+               allowed = BTRFS_BLOCK_GROUP_RAID10;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+               allowed = BTRFS_BLOCK_GROUP_RAID1;
+       else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+               allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+       flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+       return extended_to_chunk(flags | allowed);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
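
For reference (not part of the patch): a userspace sketch of the table-driven profile reduction that replaces the hand-rolled num_devices checks above. Build the set of profiles the device count can support, intersect it with the requested flags, then keep only the strongest remaining profile. The flag values and devs_min numbers here are assumptions for the example; the kernel walks btrfs_raid_array and the BTRFS_BLOCK_GROUP_* bits instead.

/*
 * Illustrative sketch, not kernel code: reduce a set of requested RAID
 * profiles to the single strongest profile the current device count allows.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum { RAID10, RAID1, DUP, RAID0, SINGLE, RAID5, RAID6, NR_RAID_TYPES };

static const struct {
        uint64_t flag;
        int devs_min;
} raid_array[NR_RAID_TYPES] = {
        [RAID10] = { 1 << 0, 4 },
        [RAID1]  = { 1 << 1, 2 },
        [DUP]    = { 1 << 2, 1 },
        [RAID0]  = { 1 << 3, 2 },
        [SINGLE] = { 0,      1 },
        [RAID5]  = { 1 << 4, 2 },
        [RAID6]  = { 1 << 5, 3 },
};

/* Preference order when several profiles remain possible. */
static const int preference[] = { RAID6, RAID5, RAID10, RAID1, RAID0 };

static uint64_t reduce_profile(uint64_t flags, int num_devices)
{
        uint64_t allowed = 0;

        /* First, mask out the RAID levels which aren't possible. */
        for (int i = 0; i < NR_RAID_TYPES; i++)
                if (num_devices >= raid_array[i].devs_min)
                        allowed |= raid_array[i].flag;
        allowed &= flags;

        /* Then keep only the strongest remaining profile. */
        for (size_t i = 0; i < sizeof(preference) / sizeof(preference[0]); i++)
                if (allowed & raid_array[preference[i]].flag)
                        return raid_array[preference[i]].flag;
        return 0;       /* single/dup handled by the caller */
}

int main(void)
{
        /* RAID6 or RAID5 requested with only 2 devices: reduced to RAID5. */
        printf("0x%llx\n", (unsigned long long)reduce_profile(
               raid_array[RAID6].flag | raid_array[RAID5].flag, 2));
        return 0;
}
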
@@ -3879,11 +3994,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
        return ret;
 }
 
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 {
        struct btrfs_space_info *data_sinfo;
        struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3975,21 +4086,28 @@ commit_trans:
                    !atomic_read(&root->fs_info->open_ioctl_trans)) {
                        need_commit--;
 
+                       if (need_commit > 0) {
+                               btrfs_start_delalloc_roots(fs_info, 0, -1);
+                               btrfs_wait_ordered_roots(fs_info, -1);
+                       }
+
                        trans = btrfs_join_transaction(root);
                        if (IS_ERR(trans))
                                return PTR_ERR(trans);
                        if (have_pinned_space >= 0 ||
-                           trans->transaction->have_free_bgs ||
+                           test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+                                    &trans->transaction->flags) ||
                            need_commit > 0) {
                                ret = btrfs_commit_transaction(trans, root);
                                if (ret)
                                        return ret;
                                /*
-                                * make sure that all running delayed iput are
-                                * done
+                                * The cleaner kthread might still be doing iput
+                                * operations. Wait for it to finish so that
+                                * more space is released.
                                 */
-                               down_write(&root->fs_info->delayed_iput_sem);
-                               up_write(&root->fs_info->delayed_iput_sem);
+                               mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
+                               mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
                                goto again;
                        } else {
                                btrfs_end_transaction(trans, root);
@@ -4001,38 +4119,86 @@ commit_trans:
                                              data_sinfo->flags, bytes, 1);
                return -ENOSPC;
        }
-       ret = btrfs_qgroup_reserve(root, write_bytes);
-       if (ret)
-               goto out;
        data_sinfo->bytes_may_use += bytes;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
                                      data_sinfo->flags, bytes, 1);
-out:
        spin_unlock(&data_sinfo->lock);
 
        return ret;
 }
 
 /*
- * Called if we need to clear a data reservation for this inode.
+ * New check_data_free_space() with the ability for precise data reservation.
+ * Will replace the old btrfs_check_data_free_space(), but for the sake of the
+ * patch split, add a new function first and then replace it.
  */
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+
+       /* align the range */
+       len = round_up(start + len, root->sectorsize) -
+             round_down(start, root->sectorsize);
+       start = round_down(start, root->sectorsize);
+
+       ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+       if (ret < 0)
+               return ret;
+
+       /*
+        * Use the new btrfs_qgroup_reserve_data() to reserve precise data space.
+        *
+        * TODO: Find a good method to avoid reserve data space for NOCOW
+        * range, but don't impact performance on quota disable case.
+        */
+       ret = btrfs_qgroup_reserve_data(inode, start, len);
+       return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one will *NOT* use the accurate qgroup reserved space API, just for
+ * cases in which we can't sleep and are sure it won't affect qgroup reserved
+ * space.
+ * Like clear_bit_hook().
+ */
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+                                           u64 len)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_space_info *data_sinfo;
 
-       /* make sure bytes are sectorsize aligned */
-       bytes = ALIGN(bytes, root->sectorsize);
+       /* Make sure the range is aligned to sectorsize */
+       len = round_up(start + len, root->sectorsize) -
+             round_down(start, root->sectorsize);
+       start = round_down(start, root->sectorsize);
 
        data_sinfo = root->fs_info->data_sinfo;
        spin_lock(&data_sinfo->lock);
-       WARN_ON(data_sinfo->bytes_may_use < bytes);
-       data_sinfo->bytes_may_use -= bytes;
+       if (WARN_ON(data_sinfo->bytes_may_use < len))
+               data_sinfo->bytes_may_use = 0;
+       else
+               data_sinfo->bytes_may_use -= len;
        trace_btrfs_space_reservation(root->fs_info, "space_info",
-                                     data_sinfo->flags, bytes, 0);
+                                     data_sinfo->flags, len, 0);
        spin_unlock(&data_sinfo->lock);
 }
 
+/*
+ * Called if we need to clear a data reservation for this inode,
+ * normally in an error case.
+ *
+ * This one will handle the per-inode data rsv map for the accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+       btrfs_free_reserved_data_space_noquota(inode, start, len);
+       btrfs_qgroup_free_data(inode, start, len);
+}
+
 static void force_metadata_allocation(struct btrfs_fs_info *info)
 {
        struct list_head *head = &info->space_info;
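
For reference (not part of the patch): the sector-alignment arithmetic applied by the new range-based helpers above (btrfs_check_data_free_space() and btrfs_free_reserved_data_space_noquota()), as a standalone sketch. A byte range [start, start + len) is widened to whole sectors before reserving or releasing, so both sides always see the same aligned range. The 4096-byte sectorsize is an assumption for the demo.

/*
 * Illustrative sketch, not kernel code: align a byte range to sector
 * boundaries the same way the new reservation helpers do.
 */
#include <stdint.h>
#include <stdio.h>

#define SECTORSIZE 4096ULL

static uint64_t round_down_u64(uint64_t x, uint64_t align)
{
        return x - (x % align);
}

static uint64_t round_up_u64(uint64_t x, uint64_t align)
{
        return round_down_u64(x + align - 1, align);
}

int main(void)
{
        uint64_t start = 5000, len = 4000;      /* spans two 4KiB sectors */

        /* Same order as the kernel helpers: widen len first, then start. */
        uint64_t alen = round_up_u64(start + len, SECTORSIZE) -
                        round_down_u64(start, SECTORSIZE);
        uint64_t astart = round_down_u64(start, SECTORSIZE);

        printf("[%llu, +%llu) -> [%llu, +%llu)\n",
               (unsigned long long)start, (unsigned long long)len,
               (unsigned long long)astart, (unsigned long long)alen);
        return 0;
}
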
@@ -4088,7 +4254,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
        return 1;
 }
 
-static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
 {
        u64 num_dev;
 
@@ -4102,24 +4268,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
        else
                num_dev = 1;    /* DUP or single */
 
-       /* metadata for updaing devices and chunk tree */
-       return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+       return num_dev;
 }
 
-static void check_system_chunk(struct btrfs_trans_handle *trans,
-                              struct btrfs_root *root, u64 type)
+/*
+ * If @is_allocation is true, reserve space in the system space info necessary
+ * for allocating a chunk, otherwise if it's false, reserve space necessary for
+ * removing a chunk.
+ */
+void check_system_chunk(struct btrfs_trans_handle *trans,
+                       struct btrfs_root *root,
+                       u64 type)
 {
        struct btrfs_space_info *info;
        u64 left;
        u64 thresh;
+       int ret = 0;
+       u64 num_devs;
+
+       /*
+        * Needed because we can end up allocating a system chunk and for an
+        * atomic and race free space reservation in the chunk block reserve.
+        */
+       ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
 
        info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
        spin_lock(&info->lock);
        left = info->total_bytes - info->bytes_used - info->bytes_pinned -
-               info->bytes_reserved - info->bytes_readonly;
+               info->bytes_reserved - info->bytes_readonly -
+               info->bytes_may_use;
        spin_unlock(&info->lock);
 
-       thresh = get_system_chunk_thresh(root, type);
+       num_devs = get_profile_num_devs(root, type);
+
+       /* num_devs device items to update and 1 chunk item to add or remove */
+       thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
+               btrfs_calc_trans_metadata_size(root, 1);
+
        if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
                btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
                        left, thresh, type);
@@ -4130,7 +4315,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans,
                u64 flags;
 
                flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
-               btrfs_alloc_chunk(trans, root, flags);
+               /*
+                * Ignore failure to create system chunk. We might end up not
+                * needing it, as we might not need to COW all nodes/leafs from
+                * the paths we visit in the chunk tree (they were already COWed
+                * or created in the current transaction for example).
+                */
+               ret = btrfs_alloc_chunk(trans, root, flags);
+       }
+
+       if (!ret) {
+               ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
+                                         &root->fs_info->chunk_block_rsv,
+                                         thresh, BTRFS_RESERVE_NO_FLUSH);
+               if (!ret)
+                       trans->chunk_bytes_reserved += thresh;
        }
 }
 
@@ -4235,6 +4434,25 @@ out:
        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
        mutex_unlock(&fs_info->chunk_mutex);
+       /*
+        * When we allocate a new chunk we reserve space in the chunk block
+        * reserve to make sure we can COW nodes/leafs in the chunk tree or
+        * add new nodes/leafs to it if we end up needing to do it when
+        * inserting the chunk item and updating device items as part of the
+        * second phase of chunk allocation, performed by
+        * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
+        * large number of new block groups to create in our transaction
+        * handle's new_bgs list to avoid exhausting the chunk block reserve
+        * in extreme cases - like having a single transaction create many new
+        * block groups when starting to write out the free space caches of all
+        * the block groups that were made dirty during the lifetime of the
+        * transaction.
+        */
+       if (trans->can_flush_pending_bgs &&
+           trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+               btrfs_create_pending_block_groups(trans, trans->root);
+               btrfs_trans_release_chunk_metadata(trans);
+       }
        return ret;
 }
 
@@ -4812,13 +5030,9 @@ static struct btrfs_block_rsv *get_block_rsv(
 {
        struct btrfs_block_rsv *block_rsv = NULL;
 
-       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
-               block_rsv = trans->block_rsv;
-
-       if (root == root->fs_info->csum_root && trans->adding_csums)
-               block_rsv = trans->block_rsv;
-
-       if (root == root->fs_info->uuid_root)
+       if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+           (root == root->fs_info->csum_root && trans->adding_csums) ||
+            (root == root->fs_info->uuid_root))
                block_rsv = trans->block_rsv;
 
        if (!block_rsv)
@@ -5188,6 +5402,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
        trans->bytes_reserved = 0;
 }
 
+/*
+ * To be called after all the new block groups attached to the transaction
+ * handle have been created (btrfs_create_pending_block_groups()).
+ */
+void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
+{
+       struct btrfs_fs_info *fs_info = trans->root->fs_info;
+
+       if (!trans->chunk_bytes_reserved)
+               return;
+
+       WARN_ON_ONCE(!list_empty(&trans->new_bgs));
+
+       block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
+                               trans->chunk_bytes_reserved);
+       trans->chunk_bytes_reserved = 0;
+}
+
 /* Can only return 0 or -ENOSPC */
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
                                  struct inode *inode)
@@ -5243,7 +5475,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (root->fs_info->quota_enabled) {
                /* One for parent inode, two for dir entries */
                num_bytes = 3 * root->nodesize;
-               ret = btrfs_qgroup_reserve(root, num_bytes);
+               ret = btrfs_qgroup_reserve_meta(root, num_bytes);
                if (ret)
                        return ret;
        } else {
@@ -5261,10 +5493,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
        if (ret == -ENOSPC && use_global_rsv)
                ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
 
-       if (ret) {
-               if (*qgroup_reserved)
-                       btrfs_qgroup_free(root, *qgroup_reserved);
-       }
+       if (ret && *qgroup_reserved)
+               btrfs_qgroup_free_meta(root, *qgroup_reserved);
 
        return ret;
 }
@@ -5425,15 +5655,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
        spin_unlock(&BTRFS_I(inode)->lock);
 
        if (root->fs_info->quota_enabled) {
-               ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+               ret = btrfs_qgroup_reserve_meta(root,
+                               nr_extents * root->nodesize);
                if (ret)
                        goto out_fail;
        }
 
        ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
        if (unlikely(ret)) {
-               if (root->fs_info->quota_enabled)
-                       btrfs_qgroup_free(root, nr_extents * root->nodesize);
+               btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
                goto out_fail;
        }
 
@@ -5556,41 +5786,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 }
 
 /**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
  * @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
+ * @start: start of the range we are writing to
+ * @len: length of the range we are writing to
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
  *
  * This will do the following things
  *
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
+ * o reserve space in the data space info for num bytes
+ *   and reserve the corresponding precise qgroup space
+ *   (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
  *   extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
+ *   also reserve metadata space in a per root over-reserve method.
+ * o add to the inodes->delalloc_bytes
  * o add it to the fs_info's delalloc inodes list.
+ *   (Above 3 all done in delalloc_reserve_metadata)
  *
- * This will return 0 for success and -ENOSPC if there is no space left.
+ * Return 0 for success
+ * Return <0 for error (-ENOSPC or -EDQUOT)
  */
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
 {
        int ret;
 
-       ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
-       if (ret)
-               return ret;
-
-       ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
-       if (ret) {
-               btrfs_free_reserved_data_space(inode, num_bytes);
+       ret = btrfs_check_data_free_space(inode, start, len);
+       if (ret < 0)
                return ret;
-       }
-
-       return 0;
+       ret = btrfs_delalloc_reserve_metadata(inode, len);
+       if (ret < 0)
+               btrfs_free_reserved_data_space(inode, start, len);
+       return ret;
 }
 
 /**
  * btrfs_delalloc_release_space - release data and metadata space for delalloc
  * @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
+ * @start: start position of the space already reserved
+ * @len: the length of the space already reserved
  *
  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
  * called in the case that we don't need the metadata AND data reservations
@@ -5599,11 +5836,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
  * This function will release the metadata space that was not used and will
  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
  * list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
  */
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
 {
-       btrfs_delalloc_release_metadata(inode, num_bytes);
-       btrfs_free_reserved_data_space(inode, num_bytes);
+       btrfs_delalloc_release_metadata(inode, len);
+       btrfs_free_reserved_data_space(inode, start, len);
 }
 
 static int update_block_group(struct btrfs_trans_handle *trans,
@@ -5680,19 +5918,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        set_extent_dirty(info->pinned_extents,
                                         bytenr, bytenr + num_bytes - 1,
                                         GFP_NOFS | __GFP_NOFAIL);
-                       /*
-                        * No longer have used bytes in this block group, queue
-                        * it for deletion.
-                        */
-                       if (old_val == 0) {
-                               spin_lock(&info->unused_bgs_lock);
-                               if (list_empty(&cache->bg_list)) {
-                                       btrfs_get_block_group(cache);
-                                       list_add_tail(&cache->bg_list,
-                                                     &info->unused_bgs);
-                               }
-                               spin_unlock(&info->unused_bgs_lock);
-                       }
                }
 
                spin_lock(&trans->transaction->dirty_bgs_lock);
@@ -5704,6 +5929,22 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                }
                spin_unlock(&trans->transaction->dirty_bgs_lock);
 
+               /*
+                * No longer have used bytes in this block group, queue it for
+                * deletion. We do this after adding the block group to the
+                * dirty list to avoid races between cleaner kthread and space
+                * cache writeout.
+                */
+               if (!alloc && old_val == 0) {
+                       spin_lock(&info->unused_bgs_lock);
+                       if (list_empty(&cache->bg_list)) {
+                               btrfs_get_block_group(cache);
+                               list_add_tail(&cache->bg_list,
+                                             &info->unused_bgs);
+                       }
+                       spin_unlock(&info->unused_bgs_lock);
+               }
+
                btrfs_put_block_group(cache);
                total -= num_bytes;
                bytenr += num_bytes;
@@ -5968,6 +6209,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
        update_global_block_rsv(fs_info);
 }
 
+/*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+static struct btrfs_free_cluster *
+fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+                  u64 *empty_cluster)
+{
+       struct btrfs_free_cluster *ret = NULL;
+       bool ssd = btrfs_test_opt(root, SSD);
+
+       *empty_cluster = 0;
+       if (btrfs_mixed_space_info(space_info))
+               return ret;
+
+       if (ssd)
+               *empty_cluster = 2 * 1024 * 1024;
+       if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+               ret = &root->fs_info->meta_alloc_cluster;
+               if (!ssd)
+                       *empty_cluster = 64 * 1024;
+       } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+               ret = &root->fs_info->data_alloc_cluster;
+       }
+
+       return ret;
+}
+
 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                              const bool return_free_space)
 {
@@ -5975,7 +6244,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
        struct btrfs_block_group_cache *cache = NULL;
        struct btrfs_space_info *space_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+       struct btrfs_free_cluster *cluster = NULL;
        u64 len;
+       u64 total_unpinned = 0;
+       u64 empty_cluster = 0;
        bool readonly;
 
        while (start <= end) {
@@ -5984,8 +6256,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                    start >= cache->key.objectid + cache->key.offset) {
                        if (cache)
                                btrfs_put_block_group(cache);
+                       total_unpinned = 0;
                        cache = btrfs_lookup_block_group(fs_info, start);
                        BUG_ON(!cache); /* Logic error */
+
+                       cluster = fetch_cluster_info(root,
+                                                    cache->space_info,
+                                                    &empty_cluster);
+                       empty_cluster <<= 1;
                }
 
                len = cache->key.objectid + cache->key.offset - start;
@@ -5998,12 +6276,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
                }
 
                start += len;
+               total_unpinned += len;
                space_info = cache->space_info;
 
+               /*
+                * If this space cluster has been marked as fragmented and we've
+                * unpinned enough in this block group to potentially allow a
+                * cluster to be created inside of it go ahead and clear the
+                * fragmented check.
+                */
+               if (cluster && cluster->fragmented &&
+                   total_unpinned > empty_cluster) {
+                       spin_lock(&cluster->lock);
+                       cluster->fragmented = 0;
+                       spin_unlock(&cluster->lock);
+               }
+
                spin_lock(&space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
                space_info->bytes_pinned -= len;
+               space_info->max_extent_size = 0;
                percpu_counter_add(&space_info->total_bytes_pinned, -len);
                if (cache->ro) {
                        space_info->bytes_readonly += len;
@@ -6034,20 +6327,19 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_block_group_cache *block_group, *tmp;
+       struct list_head *deleted_bgs;
        struct extent_io_tree *unpin;
        u64 start;
        u64 end;
        int ret;
 
-       if (trans->aborted)
-               return 0;
-
        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
                unpin = &fs_info->freed_extents[1];
        else
                unpin = &fs_info->freed_extents[0];
 
-       while (1) {
+       while (!trans->aborted) {
                mutex_lock(&fs_info->unused_bg_unpin_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY, NULL);
@@ -6066,6 +6358,34 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                cond_resched();
        }
 
+       /*
+        * Transaction is finished.  We don't need the lock anymore.  We
+        * do need to clean up the block groups in case of a transaction
+        * abort.
+        */
+       deleted_bgs = &trans->transaction->deleted_bgs;
+       list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
+               u64 trimmed = 0;
+
+               ret = -EROFS;
+               if (!trans->aborted)
+                       ret = btrfs_discard_extent(root,
+                                                  block_group->key.objectid,
+                                                  block_group->key.offset,
+                                                  &trimmed);
+
+               list_del_init(&block_group->bg_list);
+               btrfs_put_block_group_trimming(block_group);
+               btrfs_put_block_group(block_group);
+
+               if (ret) {
+                       const char *errstr = btrfs_decode_error(ret);
+                       btrfs_warn(fs_info,
+                                  "Discard failed while removing block group: errno=%d %s",
+                                  ret, errstr);
+               }
+       }
+
        return 0;
 }
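/*
 * Editor's sketch, not part of this patch: the shape of the cleanup loop
 * added above, modeled in user space.  Every block group deleted during the
 * transaction is discarded once the commit has finished; if the transaction
 * aborted, the discard is skipped and the failure is only reported.  The
 * list and discard helper below are illustrative stand-ins for deleted_bgs
 * and btrfs_discard_extent().
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct deleted_bg {
	uint64_t objectid;
	uint64_t length;
	struct deleted_bg *next;
};

/* stand-in for btrfs_discard_extent(); always succeeds in this model */
static int discard_bg(const struct deleted_bg *bg, uint64_t *trimmed)
{
	*trimmed = bg->length;
	return 0;
}

static void cleanup_deleted_bgs(struct deleted_bg *list, int aborted)
{
	struct deleted_bg *bg, *next;

	for (bg = list; bg; bg = next) {
		uint64_t trimmed = 0;
		int ret = -EROFS;

		next = bg->next;
		if (!aborted)
			ret = discard_bg(bg, &trimmed);
		/* block group references would be dropped here */
		if (ret)
			fprintf(stderr, "discard failed: errno=%d\n", ret);
	}
}

int main(void)
{
	cleanup_deleted_bgs(NULL, 0);	/* empty list: nothing to do */
	return 0;
}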
 
@@ -6092,11 +6412,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
 
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
-                               u64 bytenr, u64 num_bytes, u64 parent,
+                               struct btrfs_delayed_ref_node *node, u64 parent,
                                u64 root_objectid, u64 owner_objectid,
                                u64 owner_offset, int refs_to_drop,
-                               struct btrfs_delayed_extent_op *extent_op,
-                               int no_quota)
+                               struct btrfs_delayed_extent_op *extent_op)
 {
        struct btrfs_key key;
        struct btrfs_path *path;
@@ -6112,14 +6431,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        int num_to_del = 1;
        u32 item_size;
        u64 refs;
+       u64 bytenr = node->bytenr;
+       u64 num_bytes = node->num_bytes;
        int last_ref = 0;
-       enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL;
        bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
                                                 SKINNY_METADATA);
 
-       if (!info->quota_enabled || !is_fstree(root_objectid))
-               no_quota = 1;
-
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
@@ -6294,7 +6611,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        refs -= refs_to_drop;
 
        if (refs > 0) {
-               type = BTRFS_QGROUP_OPER_SUB_SHARED;
                if (extent_op)
                        __run_delayed_extent_op(extent_op, leaf, ei);
                /*
@@ -6321,7 +6637,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        } else {
                if (found_extent) {
                        BUG_ON(is_data && refs_to_drop !=
-                              extent_data_ref_count(root, path, iref));
+                              extent_data_ref_count(path, iref));
                        if (iref) {
                                BUG_ON(path->slots[0] != extent_slot);
                        } else {
@@ -6356,18 +6672,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
        }
        btrfs_release_path(path);
 
-       /* Deal with the quota accounting */
-       if (!ret && last_ref && !no_quota) {
-               int mod_seq = 0;
-
-               if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID &&
-                   type == BTRFS_QGROUP_OPER_SUB_SHARED)
-                       mod_seq = 1;
-
-               ret = btrfs_qgroup_record_ref(trans, info, root_objectid,
-                                             bytenr, num_bytes, type,
-                                             mod_seq);
-       }
 out:
        btrfs_free_path(path);
        return ret;
@@ -6393,7 +6697,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
                goto out_delayed_unlock;
 
        spin_lock(&head->lock);
-       if (rb_first(&head->ref_root))
+       if (!list_empty(&head->ref_list))
                goto out;
 
        if (head->extent_op) {
@@ -6458,7 +6762,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                                        buf->start, buf->len,
                                        parent, root->root_key.objectid,
                                        btrfs_header_level(buf),
-                                       BTRFS_DROP_DELAYED_REF, NULL, 0);
+                                       BTRFS_DROP_DELAYED_REF, NULL);
                BUG_ON(ret); /* -ENOMEM */
        }
 
@@ -6506,7 +6810,7 @@ out:
 /* Can return -ENOMEM */
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-                     u64 owner, u64 offset, int no_quota)
+                     u64 owner, u64 offset)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6529,13 +6833,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
                                        num_bytes,
                                        parent, root_objectid, (int)owner,
-                                       BTRFS_DROP_DELAYED_REF, NULL, no_quota);
+                                       BTRFS_DROP_DELAYED_REF, NULL);
        } else {
                ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
                                                num_bytes,
                                                parent, root_objectid, owner,
-                                               offset, BTRFS_DROP_DELAYED_REF,
-                                               NULL, no_quota);
+                                               offset, 0,
+                                               BTRFS_DROP_DELAYED_REF, NULL);
        }
        return ret;
 }
@@ -6721,7 +7025,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        struct btrfs_block_group_cache *block_group = NULL;
        u64 search_start = 0;
        u64 max_extent_size = 0;
-       int empty_cluster = 2 * 1024 * 1024;
+       u64 empty_cluster = 0;
        struct btrfs_space_info *space_info;
        int loop = 0;
        int index = __get_raid_index(flags);
@@ -6731,6 +7035,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        bool failed_alloc = false;
        bool use_cluster = true;
        bool have_caching_bg = false;
+       bool orig_have_caching_bg = false;
+       bool full_search = false;
 
        WARN_ON(num_bytes < root->sectorsize);
        ins->type = BTRFS_EXTENT_ITEM_KEY;
@@ -6746,36 +7052,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        }
 
        /*
-        * If the space info is for both data and metadata it means we have a
-        * small filesystem and we can't use the clustering stuff.
+        * If our free space is heavily fragmented we may not be able to make
+        * big contiguous allocations, so instead of doing the expensive search
+        * for free space, simply return ENOSPC with our max_extent_size so we
+        * can go ahead and search for a more manageable chunk.
+        *
+        * If our max_extent_size is large enough for our allocation, simply
+        * disable clustering, since we will likely not be able to find enough
+        * space to create a cluster and would only induce latency trying.
         */
-       if (btrfs_mixed_space_info(space_info))
-               use_cluster = false;
-
-       if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
-               last_ptr = &root->fs_info->meta_alloc_cluster;
-               if (!btrfs_test_opt(root, SSD))
-                       empty_cluster = 64 * 1024;
-       }
-
-       if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
-           btrfs_test_opt(root, SSD)) {
-               last_ptr = &root->fs_info->data_alloc_cluster;
+       if (unlikely(space_info->max_extent_size)) {
+               spin_lock(&space_info->lock);
+               if (space_info->max_extent_size &&
+                   num_bytes > space_info->max_extent_size) {
+                       ins->offset = space_info->max_extent_size;
+                       spin_unlock(&space_info->lock);
+                       return -ENOSPC;
+               } else if (space_info->max_extent_size) {
+                       use_cluster = false;
+               }
+               spin_unlock(&space_info->lock);
        }
 
+       last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
        if (last_ptr) {
                spin_lock(&last_ptr->lock);
                if (last_ptr->block_group)
                        hint_byte = last_ptr->window_start;
+               if (last_ptr->fragmented) {
+                       /*
+                        * We still set window_start so we can keep track of the
+                        * last place we found an allocation to try and save
+                        * some time.
+                        */
+                       hint_byte = last_ptr->window_start;
+                       use_cluster = false;
+               }
                spin_unlock(&last_ptr->lock);
        }
 
        search_start = max(search_start, first_logical_byte(root, 0));
        search_start = max(search_start, hint_byte);
-
-       if (!last_ptr)
-               empty_cluster = 0;
-
        if (search_start == hint_byte) {
                block_group = btrfs_lookup_block_group(root->fs_info,
                                                       search_start);
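/*
 * Editor's sketch, not part of this patch: a user-space model of the early
 * bail-out added in the hunk above.  If a previous search recorded the
 * largest contiguous free extent for this space_info (max_extent_size) and
 * the current request is bigger, fail immediately with ENOSPC and report
 * that size; if the request fits, only clustering is disabled for this
 * attempt.  Names are illustrative.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

struct space_hint {
	uint64_t max_extent_size;	/* 0 means no hint recorded yet */
};

static int check_fragmentation_hint(const struct space_hint *s,
				    uint64_t num_bytes, bool *use_cluster,
				    uint64_t *ins_offset)
{
	if (!s->max_extent_size)
		return 0;			/* no hint, do the full search */
	if (num_bytes > s->max_extent_size) {
		*ins_offset = s->max_extent_size;	/* best we can offer */
		return -ENOSPC;
	}
	*use_cluster = false;			/* fits, but skip clustering */
	return 0;
}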
@@ -6810,6 +7127,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
        }
 search:
        have_caching_bg = false;
+       if (index == 0 || index == __get_raid_index(flags))
+               full_search = true;
        down_read(&space_info->groups_sem);
        list_for_each_entry(block_group, &space_info->block_groups[index],
                            list) {
@@ -6843,6 +7162,7 @@ search:
 have_block_group:
                cached = block_group_cache_done(block_group);
                if (unlikely(!cached)) {
+                       have_caching_bg = true;
                        ret = cache_block_group(block_group, 0);
                        BUG_ON(ret < 0);
                        ret = 0;
@@ -6857,7 +7177,7 @@ have_block_group:
                 * Ok we want to try and use the cluster allocator, so
                 * lets look there
                 */
-               if (last_ptr) {
+               if (last_ptr && use_cluster) {
                        struct btrfs_block_group_cache *used_block_group;
                        unsigned long aligned_cluster;
                        /*
@@ -6983,6 +7303,16 @@ refill_cluster:
                }
 
 unclustered_alloc:
+               /*
+                * We are doing an unclustered alloc; set the fragmented flag so
+                * we don't bother trying to set up a cluster again until we get
+                * more space.
+                */
+               if (unlikely(last_ptr)) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->fragmented = 1;
+                       spin_unlock(&last_ptr->lock);
+               }
                spin_lock(&block_group->free_space_ctl->tree_lock);
                if (cached &&
                    block_group->free_space_ctl->free_space <
@@ -7015,8 +7345,6 @@ unclustered_alloc:
                        failed_alloc = true;
                        goto have_block_group;
                } else if (!offset) {
-                       if (!cached)
-                               have_caching_bg = true;
                        goto loop;
                }
 checks:
@@ -7057,6 +7385,10 @@ loop:
        }
        up_read(&space_info->groups_sem);
 
+       if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg &&
+           !orig_have_caching_bg)
+               orig_have_caching_bg = true;
+
        if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
                goto search;
 
@@ -7073,7 +7405,20 @@ loop:
         */
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
                index = 0;
-               loop++;
+               if (loop == LOOP_CACHING_NOWAIT) {
+                       /*
+                        * We want to skip the LOOP_CACHING_WAIT step if we
+                        * don't have any uncached bgs and we've already done a
+                        * full search through.
+                        */
+                       if (orig_have_caching_bg || !full_search)
+                               loop = LOOP_CACHING_WAIT;
+                       else
+                               loop = LOOP_ALLOC_CHUNK;
+               } else {
+                       loop++;
+               }
+
                if (loop == LOOP_ALLOC_CHUNK) {
                        struct btrfs_trans_handle *trans;
                        int exist = 0;
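/*
 * Editor's sketch, not part of this patch: the loop-state advance in the
 * hunk above, written as a standalone helper.  The enum below is a
 * simplified assumption mirroring the LOOP_* states this function uses:
 * after the no-wait caching pass we either wait for caching block groups
 * or, if everything was already cached and a full search was done, jump
 * straight to chunk allocation.
 */
enum alloc_loop {
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

static enum alloc_loop next_loop(enum alloc_loop loop, int have_caching_bg,
				 int full_search)
{
	if (loop == LOOP_CACHING_NOWAIT) {
		if (have_caching_bg || !full_search)
			return LOOP_CACHING_WAIT;
		return LOOP_ALLOC_CHUNK;
	}
	return loop + 1;
}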
@@ -7091,6 +7436,15 @@ loop:
 
                        ret = do_chunk_alloc(trans, root, flags,
                                             CHUNK_ALLOC_FORCE);
+
+                       /*
+                        * If we can't allocate a new chunk, we've already looped
+                        * through at least once, so move on to the NO_EMPTY_SIZE
+                        * case.
+                        */
+                       if (ret == -ENOSPC)
+                               loop = LOOP_NO_EMPTY_SIZE;
+
                        /*
                         * Do not bail out on ENOSPC since we
                         * can do more things.
@@ -7107,6 +7461,15 @@ loop:
                }
 
                if (loop == LOOP_NO_EMPTY_SIZE) {
+                       /*
+                        * Don't loop again if we already have no empty_size and
+                        * no empty_cluster.
+                        */
+                       if (empty_size == 0 &&
+                           empty_cluster == 0) {
+                               ret = -ENOSPC;
+                               goto out;
+                       }
                        empty_size = 0;
                        empty_cluster = 0;
                }
@@ -7115,11 +7478,20 @@ loop:
        } else if (!ins->objectid) {
                ret = -ENOSPC;
        } else if (ins->objectid) {
+               if (!use_cluster && last_ptr) {
+                       spin_lock(&last_ptr->lock);
+                       last_ptr->window_start = ins->objectid;
+                       spin_unlock(&last_ptr->lock);
+               }
                ret = 0;
        }
 out:
-       if (ret == -ENOSPC)
+       if (ret == -ENOSPC) {
+               spin_lock(&space_info->lock);
+               space_info->max_extent_size = max_extent_size;
+               spin_unlock(&space_info->lock);
                ins->offset = max_extent_size;
+       }
        return ret;
 }
 
@@ -7168,7 +7540,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
                         u64 empty_size, u64 hint_byte,
                         struct btrfs_key *ins, int is_data, int delalloc)
 {
-       bool final_tried = false;
+       bool final_tried = num_bytes == min_alloc_size;
        u64 flags;
        int ret;
 
@@ -7303,13 +7675,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
 
-       /* Always set parent to 0 here since its exclusive anyway. */
-       ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-                                     ins->objectid, ins->offset,
-                                     BTRFS_QGROUP_OPER_ADD_EXCL, 0);
-       if (ret)
-               return ret;
-
        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
        if (ret) { /* -ENOENT, logic error */
                btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7324,8 +7689,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 parent, u64 root_objectid,
                                     u64 flags, struct btrfs_disk_key *key,
-                                    int level, struct btrfs_key *ins,
-                                    int no_quota)
+                                    int level, struct btrfs_key *ins)
 {
        int ret;
        struct btrfs_fs_info *fs_info = root->fs_info;
@@ -7391,14 +7755,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
 
-       if (!no_quota) {
-               ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid,
-                                             ins->objectid, num_bytes,
-                                             BTRFS_QGROUP_OPER_ADD_EXCL, 0);
-               if (ret)
-                       return ret;
-       }
-
        ret = update_block_group(trans, root, ins->objectid, root->nodesize,
                                 1);
        if (ret) { /* -ENOENT, logic error */
@@ -7414,7 +7770,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root,
                                     u64 root_objectid, u64 owner,
-                                    u64 offset, struct btrfs_key *ins)
+                                    u64 offset, u64 ram_bytes,
+                                    struct btrfs_key *ins)
 {
        int ret;
 
@@ -7423,7 +7780,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
                                         ins->offset, 0,
                                         root_objectid, owner, offset,
-                                        BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
+                                        ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
+                                        NULL);
        return ret;
 }
 
@@ -7566,9 +7924,6 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
 
 /*
  * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
  * returns the tree buffer or an ERR_PTR on error.
  */
 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
@@ -7640,7 +7995,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
                                                 ins.objectid, ins.offset,
                                                 parent, root_objectid, level,
                                                 BTRFS_ADD_DELAYED_EXTENT,
-                                                extent_op, 0);
+                                                extent_op);
                if (ret)
                        goto out_free_delayed;
        }
@@ -7755,6 +8110,34 @@ reada:
        wc->reada_slot = slot;
 }
 
+/*
+ * These subtree extents may not be seen by the usual inc/dec ref code, so
+ * we have to add them here.
+ */
+static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
+                                    struct btrfs_root *root, u64 bytenr,
+                                    u64 num_bytes)
+{
+       struct btrfs_qgroup_extent_record *qrecord;
+       struct btrfs_delayed_ref_root *delayed_refs;
+
+       qrecord = kmalloc(sizeof(*qrecord), GFP_NOFS);
+       if (!qrecord)
+               return -ENOMEM;
+
+       qrecord->bytenr = bytenr;
+       qrecord->num_bytes = num_bytes;
+       qrecord->old_roots = NULL;
+
+       delayed_refs = &trans->transaction->delayed_refs;
+       spin_lock(&delayed_refs->lock);
+       if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+               kfree(qrecord);
+       spin_unlock(&delayed_refs->lock);
+
+       return 0;
+}
+
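/*
 * Editor's sketch, not part of this patch: a user-space model of the
 * insert-or-free pattern used by record_one_subtree_extent() above.
 * A record is allocated up front, inserted into a set keyed by bytenr,
 * and freed again if an entry for that bytenr already exists; the kernel
 * version does the same under delayed_refs->lock.  The flat table below
 * is an illustrative stand-in for the qgroup dirty-extent tree.
 */
#include <stdint.h>
#include <stdlib.h>

struct qrecord {
	uint64_t bytenr;
	uint64_t num_bytes;
};

static struct qrecord *dirty[128];
static int ndirty;

static int record_dirty_extent(uint64_t bytenr, uint64_t num_bytes)
{
	struct qrecord *rec = malloc(sizeof(*rec));
	int i;

	if (!rec)
		return -1;		/* -ENOMEM in the kernel version */
	rec->bytenr = bytenr;
	rec->num_bytes = num_bytes;

	for (i = 0; i < ndirty; i++) {
		if (dirty[i]->bytenr == bytenr) {
			free(rec);	/* duplicate: drop the new record */
			return 0;
		}
	}
	if (ndirty == (int)(sizeof(dirty) / sizeof(dirty[0]))) {
		free(rec);
		return -1;		/* toy table is full */
	}
	dirty[ndirty++] = rec;
	return 0;
}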
 static int account_leaf_items(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct extent_buffer *eb)
@@ -7765,6 +8148,10 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
        struct btrfs_file_extent_item *fi;
        u64 bytenr, num_bytes;
 
+       /* We can be called directly from walk_up_proc() */
+       if (!root->fs_info->quota_enabled)
+               return 0;
+
        for (i = 0; i < nr; i++) {
                btrfs_item_key_to_cpu(eb, &key, i);
 
@@ -7784,10 +8171,7 @@ static int account_leaf_items(struct btrfs_trans_handle *trans,
 
                num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
 
-               ret = btrfs_qgroup_record_ref(trans, root->fs_info,
-                                             root->objectid,
-                                             bytenr, num_bytes,
-                                             BTRFS_QGROUP_OPER_SUB_SUBTREE, 0);
+               ret = record_one_subtree_extent(trans, root, bytenr, num_bytes);
                if (ret)
                        return ret;
        }
@@ -7920,7 +8304,11 @@ walk_down:
                        child_gen = btrfs_node_ptr_generation(eb, parent_slot);
 
                        eb = read_tree_block(root, child_bytenr, child_gen);
-                       if (!eb || !extent_buffer_uptodate(eb)) {
+                       if (IS_ERR(eb)) {
+                               ret = PTR_ERR(eb);
+                               goto out;
+                       } else if (!extent_buffer_uptodate(eb)) {
+                               free_extent_buffer(eb);
                                ret = -EIO;
                                goto out;
                        }
@@ -7932,15 +8320,10 @@ walk_down:
                        btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
                        path->locks[level] = BTRFS_READ_LOCK_BLOCKING;
 
-                       ret = btrfs_qgroup_record_ref(trans, root->fs_info,
-                                               root->objectid,
-                                               child_bytenr,
-                                               root->nodesize,
-                                               BTRFS_QGROUP_OPER_SUB_SUBTREE,
-                                               0);
+                       ret = record_one_subtree_extent(trans, root, child_bytenr,
+                                                       root->nodesize);
                        if (ret)
                                goto out;
-
                }
 
                if (level == 0) {
@@ -8151,7 +8534,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
                if (reada && level == 1)
                        reada_walk_down(trans, root, wc, path);
                next = read_tree_block(root, bytenr, generation);
-               if (!next || !extent_buffer_uptodate(next)) {
+               if (IS_ERR(next)) {
+                       return PTR_ERR(next);
+               } else if (!extent_buffer_uptodate(next)) {
                        free_extent_buffer(next);
                        return -EIO;
                }
@@ -8184,14 +8569,15 @@ skip:
                        ret = account_shared_subtree(trans, root, next,
                                                     generation, level - 1);
                        if (ret) {
-                               printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+                               btrfs_err_rl(root->fs_info,
+                                       "Error "
                                        "%d accounting shared subtree. Quota "
-                                       "is out of sync, rescan required.\n",
-                                       root->fs_info->sb->s_id, ret);
+                                       "is out of sync, rescan required.",
+                                       ret);
                        }
                }
                ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-                               root->root_key.objectid, level - 1, 0, 0);
+                               root->root_key.objectid, level - 1, 0);
                BUG_ON(ret); /* -ENOMEM */
        }
        btrfs_tree_unlock(next);
@@ -8276,10 +8662,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                        BUG_ON(ret); /* -ENOMEM */
                        ret = account_leaf_items(trans, root, eb);
                        if (ret) {
-                               printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+                               btrfs_err_rl(root->fs_info,
+                                       "error "
                                        "%d accounting leaf items. Quota "
-                                       "is out of sync, rescan required.\n",
-                                       root->fs_info->sb->s_id, ret);
+                                       "is out of sync, rescan required.",
+                                       ret);
                        }
                }
                /* make block locked assertion in clean_tree_block happy */
@@ -8533,24 +8920,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                                goto out_end_trans;
                        }
 
-                       /*
-                        * Qgroup update accounting is run from
-                        * delayed ref handling. This usually works
-                        * out because delayed refs are normally the
-                        * only way qgroup updates are added. However,
-                        * we may have added updates during our tree
-                        * walk so run qgroups here to make sure we
-                        * don't lose any updates.
-                        */
-                       ret = btrfs_delayed_qgroup_accounting(trans,
-                                                             root->fs_info);
-                       if (ret)
-                               printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
-                                                  "running qgroup updates "
-                                                  "during snapshot delete. "
-                                                  "Quota is out of sync, "
-                                                  "rescan required.\n", ret);
-
                        btrfs_end_transaction_throttle(trans, tree_root);
                        if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
                                pr_debug("BTRFS: drop snapshot early exit\n");
@@ -8596,7 +8965,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        }
 
        if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
-               btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
+               btrfs_add_dropped_root(trans, root);
        } else {
                free_extent_buffer(root->node);
                free_extent_buffer(root->commit_root);
@@ -8604,14 +8973,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        }
        root_dropped = true;
 out_end_trans:
-       ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info);
-       if (ret)
-               printk_ratelimited(KERN_ERR "BTRFS: Failure %d "
-                                  "running qgroup updates "
-                                  "during snapshot delete. "
-                                  "Quota is out of sync, "
-                                  "rescan required.\n", ret);
-
        btrfs_end_transaction_throttle(trans, tree_root);
 out_free:
        kfree(wc);
@@ -8627,7 +8988,7 @@ out:
        if (!for_reloc && root_dropped == false)
                btrfs_add_dead_root(root);
        if (err && err != -EAGAIN)
-               btrfs_std_error(root->fs_info, err);
+               btrfs_std_error(root->fs_info, err, NULL);
        return err;
 }
 
@@ -8751,14 +9112,13 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        return flags;
 }
 
-static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
+static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
        u64 num_bytes;
        u64 min_allocable_bytes;
        int ret = -ENOSPC;
 
-
        /*
         * We need some metadata space and system metadata space for
         * allocating chunks in some corner cases until we force to set
@@ -8775,6 +9135,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
        spin_lock(&cache->lock);
 
        if (cache->ro) {
+               cache->ro++;
                ret = 0;
                goto out;
        }
@@ -8786,7 +9147,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
            sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
            min_allocable_bytes <= sinfo->total_bytes) {
                sinfo->bytes_readonly += num_bytes;
-               cache->ro = 1;
+               cache->ro++;
                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
                ret = 0;
        }
@@ -8796,7 +9157,7 @@ out:
        return ret;
 }
 
-int btrfs_set_block_group_ro(struct btrfs_root *root,
+int btrfs_inc_block_group_ro(struct btrfs_root *root,
                             struct btrfs_block_group_cache *cache)
 
 {
@@ -8804,8 +9165,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
        u64 alloc_flags;
        int ret;
 
-       BUG_ON(cache->ro);
-
 again:
        trans = btrfs_join_transaction(root);
        if (IS_ERR(trans))
@@ -8817,7 +9176,7 @@ again:
         * back off and let this transaction commit
         */
        mutex_lock(&root->fs_info->ro_block_group_mutex);
-       if (trans->transaction->dirty_bg_run) {
+       if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
                u64 transid = trans->transid;
 
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@ -8848,7 +9207,7 @@ again:
                        goto out;
        }
 
-       ret = set_block_group_ro(cache, 0);
+       ret = inc_block_group_ro(cache, 0);
        if (!ret)
                goto out;
        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -8856,7 +9215,7 @@ again:
                             CHUNK_ALLOC_FORCE);
        if (ret < 0)
                goto out;
-       ret = set_block_group_ro(cache, 0);
+       ret = inc_block_group_ro(cache, 0);
 out:
        if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
                alloc_flags = update_block_group_flags(root, cache->flags);
@@ -8919,7 +9278,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
        return free_bytes;
 }
 
-void btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_dec_block_group_ro(struct btrfs_root *root,
                              struct btrfs_block_group_cache *cache)
 {
        struct btrfs_space_info *sinfo = cache->space_info;
@@ -8929,11 +9288,13 @@ void btrfs_set_block_group_rw(struct btrfs_root *root,
 
        spin_lock(&sinfo->lock);
        spin_lock(&cache->lock);
-       num_bytes = cache->key.offset - cache->reserved - cache->pinned -
-                   cache->bytes_super - btrfs_block_group_used(&cache->item);
-       sinfo->bytes_readonly -= num_bytes;
-       cache->ro = 0;
-       list_del_init(&cache->ro_list);
+       if (!--cache->ro) {
+               num_bytes = cache->key.offset - cache->reserved -
+                           cache->pinned - cache->bytes_super -
+                           btrfs_block_group_used(&cache->item);
+               sinfo->bytes_readonly -= num_bytes;
+               list_del_init(&cache->ro_list);
+       }
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
 }
@@ -9449,7 +9810,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 
                set_avail_alloc_bits(root->fs_info, cache->flags);
                if (btrfs_chunk_readonly(root, cache->key.objectid)) {
-                       set_block_group_ro(cache, 1);
+                       inc_block_group_ro(cache, 1);
                } else if (btrfs_block_group_used(&cache->item) == 0) {
                        spin_lock(&info->unused_bgs_lock);
                        /* Should always be true but just in case. */
@@ -9477,11 +9838,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                list_for_each_entry(cache,
                                &space_info->block_groups[BTRFS_RAID_RAID0],
                                list)
-                       set_block_group_ro(cache, 1);
+                       inc_block_group_ro(cache, 1);
                list_for_each_entry(cache,
                                &space_info->block_groups[BTRFS_RAID_SINGLE],
                                list)
-                       set_block_group_ro(cache, 1);
+                       inc_block_group_ro(cache, 1);
        }
 
        init_global_block_rsv(info);
@@ -9499,7 +9860,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
        struct btrfs_block_group_item item;
        struct btrfs_key key;
        int ret = 0;
+       bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
 
+       trans->can_flush_pending_bgs = false;
        list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
                if (ret)
                        goto next;
@@ -9520,6 +9883,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 next:
                list_del_init(&block_group->bg_list);
        }
+       trans->can_flush_pending_bgs = can_flush_pending_bgs;
 }
 
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
@@ -9562,6 +9926,27 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 
        free_excluded_extents(root, cache);
 
+#ifdef CONFIG_BTRFS_DEBUG
+       if (btrfs_should_fragment_free_space(root, cache)) {
+               u64 new_bytes_used = size - bytes_used;
+
+               bytes_used += new_bytes_used >> 1;
+               fragment_free_space(root, cache);
+       }
+#endif
+       /*
+        * Call to ensure the corresponding space_info object is created and
+        * assigned to our block group, but don't update its counters just yet.
+        * We want our bg to be added to the rbtree with its ->space_info set.
+        */
+       ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+                               &cache->space_info);
+       if (ret) {
+               btrfs_remove_free_space_cache(cache);
+               btrfs_put_block_group(cache);
+               return ret;
+       }
+
        ret = btrfs_add_block_group_cache(root->fs_info, cache);
        if (ret) {
                btrfs_remove_free_space_cache(cache);
@@ -9569,6 +9954,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                return ret;
        }
 
+       /*
+        * Now that our block group has its ->space_info set and is inserted in
+        * the rbtree, update the space info's counters.
+        */
        ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
                                &cache->space_info);
        if (ret) {
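/*
 * Editor's sketch, not part of this patch: the debug-only accounting bump
 * in the hunk above, as plain arithmetic.  fragment_free_space() removes
 * every other chunk, i.e. roughly half of the group's free space, so the
 * same amount is added to bytes_used to keep the space_info counters
 * consistent with the fragmented free space.  Values are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t size = 1024ULL * 1024 * 1024;	/* 1 GiB block group */
	uint64_t bytes_used = 0;		/* freshly created group */
	uint64_t new_bytes_used = size - bytes_used;

	bytes_used += new_bytes_used >> 1;	/* half turned into debug holes */
	printf("accounted as used: %llu\n", (unsigned long long)bytes_used);
	return 0;
}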
@@ -9845,6 +10234,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
         * currently running transaction might finish and a new one start,
         * allowing for new block groups to be created that can reuse the same
         * physical device locations unless we take this special care.
+        *
+        * There may also be an implicit trim operation if the file system
+        * is mounted with -odiscard. The same protections must remain
+        * in place until the extents have been discarded completely, which
+        * only happens once the transaction commit has completed.
         */
        remove_em = (atomic_read(&block_group->trimming) == 0);
        /*
@@ -9901,6 +10295,47 @@ out:
        return ret;
 }
 
+struct btrfs_trans_handle *
+btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
+                                    const u64 chunk_offset)
+{
+       struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
+       struct extent_map *em;
+       struct map_lookup *map;
+       unsigned int num_items;
+
+       read_lock(&em_tree->lock);
+       em = lookup_extent_mapping(em_tree, chunk_offset, 1);
+       read_unlock(&em_tree->lock);
+       ASSERT(em && em->start == chunk_offset);
+
+       /*
+        * We need to reserve 3 + N units from the metadata space info in order
+        * to remove a block group (done at btrfs_remove_chunk() and at
+        * btrfs_remove_block_group()), which are used for:
+        *
+        * 1 unit for adding the free space inode's orphan (located in the tree
+        * of tree roots).
+        * 1 unit for deleting the block group item (located in the extent
+        * tree).
+        * 1 unit for deleting the free space item (located in tree of tree
+        * roots).
+        * N units for deleting N device extent items corresponding to each
+        * stripe (located in the device tree).
+        *
+        * In order to remove a block group we also need to reserve units in the
+        * system space info in order to update the chunk tree (update one or
+        * more device items and remove one chunk item), but this is done at
+        * btrfs_remove_chunk() through a call to check_system_chunk().
+        */
+       map = (struct map_lookup *)em->bdev;
+       num_items = 3 + map->num_stripes;
+       free_extent_map(em);
+
+       return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
+                                                          num_items, 1);
+}
+
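/*
 * Editor's sketch, not part of this patch: the reservation sizing described
 * in the comment above, as plain arithmetic.  Removing a block group needs
 * one metadata unit each for the free space inode's orphan, the block group
 * item and the free space item, plus one unit per device-extent stripe of
 * the chunk.  Values are illustrative.
 */
#include <stdio.h>

static unsigned int remove_bg_metadata_units(unsigned int num_stripes)
{
	return 3 + num_stripes;
}

int main(void)
{
	/* e.g. a two-stripe (RAID1) chunk needs 3 + 2 = 5 units */
	printf("%u\n", remove_bg_metadata_units(2));
	return 0;
}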
 /*
  * Process the unused_bgs list and remove any that don't have any allocated
  * space inside of them.
@@ -9919,24 +10354,30 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
        spin_lock(&fs_info->unused_bgs_lock);
        while (!list_empty(&fs_info->unused_bgs)) {
                u64 start, end;
+               int trimming;
 
                block_group = list_first_entry(&fs_info->unused_bgs,
                                               struct btrfs_block_group_cache,
                                               bg_list);
-               space_info = block_group->space_info;
                list_del_init(&block_group->bg_list);
+
+               space_info = block_group->space_info;
+
                if (ret || btrfs_mixed_space_info(space_info)) {
                        btrfs_put_block_group(block_group);
                        continue;
                }
                spin_unlock(&fs_info->unused_bgs_lock);
 
+               mutex_lock(&fs_info->delete_unused_bgs_mutex);
+
                /* Don't want to race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);
                spin_lock(&block_group->lock);
                if (block_group->reserved ||
                    btrfs_block_group_used(&block_group->item) ||
-                   block_group->ro) {
+                   block_group->ro ||
+                   list_is_singular(&block_group->list)) {
                        /*
                         * We want to bail if we made new allocations or have
                         * outstanding allocations in this block group.  We do
@@ -9950,7 +10391,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_unlock(&block_group->lock);
 
                /* We don't want to force the issue, only flip if it's ok. */
-               ret = set_block_group_ro(block_group, 0);
+               ret = inc_block_group_ro(block_group, 0);
                up_write(&space_info->groups_sem);
                if (ret < 0) {
                        ret = 0;
@@ -9961,10 +10402,10 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                 * Want to do this before we do anything else so we can recover
                 * properly if we fail to join the transaction.
                 */
-               /* 1 for btrfs_orphan_reserve_metadata() */
-               trans = btrfs_start_transaction(root, 1);
+               trans = btrfs_start_trans_remove_block_group(fs_info,
+                                                    block_group->key.objectid);
                if (IS_ERR(trans)) {
-                       btrfs_set_block_group_rw(root, block_group);
+                       btrfs_dec_block_group_ro(root, block_group);
                        ret = PTR_ERR(trans);
                        goto next;
                }
@@ -9991,14 +10432,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-                       btrfs_set_block_group_rw(root, block_group);
+                       btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
                                  EXTENT_DIRTY, GFP_NOFS);
                if (ret) {
                        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
-                       btrfs_set_block_group_rw(root, block_group);
+                       btrfs_dec_block_group_ro(root, block_group);
                        goto end_trans;
                }
                mutex_unlock(&fs_info->unused_bg_unpin_mutex);
@@ -10016,15 +10457,47 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
                spin_unlock(&block_group->lock);
                spin_unlock(&space_info->lock);
 
+               /* DISCARD can flip during remount */
+               trimming = btrfs_test_opt(root, DISCARD);
+
+               /* Implicit trim during transaction commit. */
+               if (trimming)
+                       btrfs_get_block_group_trimming(block_group);
+
                /*
                 * Btrfs_remove_chunk will abort the transaction if things go
                 * horribly wrong.
                 */
                ret = btrfs_remove_chunk(trans, root,
                                         block_group->key.objectid);
+
+               if (ret) {
+                       if (trimming)
+                               btrfs_put_block_group_trimming(block_group);
+                       goto end_trans;
+               }
+
+               /*
+                * If we're not mounted with -odiscard, we can just forget
+                * about this block group. Otherwise we'll need to wait
+                * until transaction commit to do the actual discard.
+                */
+               if (trimming) {
+                       spin_lock(&fs_info->unused_bgs_lock);
+                       /*
+                        * A concurrent scrub might have added us to the list
+                        * fs_info->unused_bgs, so use a list_move operation
+                        * to add the block group to the deleted_bgs list.
+                        */
+                       list_move(&block_group->bg_list,
+                                 &trans->transaction->deleted_bgs);
+                       spin_unlock(&fs_info->unused_bgs_lock);
+                       btrfs_get_block_group(block_group);
+               }
 end_trans:
                btrfs_end_transaction(trans, root);
 next:
+               mutex_unlock(&fs_info->delete_unused_bgs_mutex);
                btrfs_put_block_group(block_group);
                spin_lock(&fs_info->unused_bgs_lock);
        }
@@ -10074,10 +10547,99 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
        return unpin_extent_range(root, start, end, false);
 }
 
+/*
+ * It used to be that old block groups would be left around forever.
+ * Iterating over them would be enough to trim unused space.  Since we
+ * now automatically remove them, we also need to iterate over unallocated
+ * space.
+ *
+ * We don't want a transaction for this since the discard may take a
+ * substantial amount of time.  We don't require that a transaction be
+ * running, but we do need to take a running transaction into account
+ * to ensure that we're not discarding chunks that were released in
+ * the current transaction.
+ *
+ * Holding the chunks lock will prevent other threads from allocating
+ * or releasing chunks, but it won't prevent a running transaction
+ * from committing and releasing the memory that the pending chunks
+ * list head uses.  For that, we need to take a reference to the
+ * transaction.
+ */
+static int btrfs_trim_free_extents(struct btrfs_device *device,
+                                  u64 minlen, u64 *trimmed)
+{
+       u64 start = 0, len = 0;
+       int ret;
+
+       *trimmed = 0;
+
+       /* Not writeable = nothing to do. */
+       if (!device->writeable)
+               return 0;
+
+       /* No free space = nothing to do. */
+       if (device->total_bytes <= device->bytes_used)
+               return 0;
+
+       ret = 0;
+
+       while (1) {
+               struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
+               struct btrfs_transaction *trans;
+               u64 bytes;
+
+               ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
+               if (ret)
+                       return ret;
+
+               down_read(&fs_info->commit_root_sem);
+
+               spin_lock(&fs_info->trans_lock);
+               trans = fs_info->running_transaction;
+               if (trans)
+                       atomic_inc(&trans->use_count);
+               spin_unlock(&fs_info->trans_lock);
+
+               ret = find_free_dev_extent_start(trans, device, minlen, start,
+                                                &start, &len);
+               if (trans)
+                       btrfs_put_transaction(trans);
+
+               if (ret) {
+                       up_read(&fs_info->commit_root_sem);
+                       mutex_unlock(&fs_info->chunk_mutex);
+                       if (ret == -ENOSPC)
+                               ret = 0;
+                       break;
+               }
+
+               ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
+               up_read(&fs_info->commit_root_sem);
+               mutex_unlock(&fs_info->chunk_mutex);
+
+               if (ret)
+                       break;
+
+               start += len;
+               *trimmed += bytes;
+
+               if (fatal_signal_pending(current)) {
+                       ret = -ERESTARTSYS;
+                       break;
+               }
+
+               cond_resched();
+       }
+
+       return ret;
+}
+
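/*
 * Editor's sketch, not part of this patch: the cursor-driven trim loop used
 * by btrfs_trim_free_extents() above, modeled in user space.  It repeatedly
 * looks up the next free (start, len) hole at or after the cursor, "discards"
 * it, and advances the cursor past it; -ENOSPC from the lookup means there
 * are no more holes.  The hole table and helpers below are illustrative
 * stand-ins for find_free_dev_extent_start() and btrfs_issue_discard().
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct hole { uint64_t start, len; };

static const struct hole holes[] = {
	{ 1 << 20, 4 << 20 },
	{ 16 << 20, 8 << 20 },
};

static int find_next_hole(uint64_t cursor, uint64_t minlen,
			  uint64_t *start, uint64_t *len)
{
	for (size_t i = 0; i < sizeof(holes) / sizeof(holes[0]); i++) {
		if (holes[i].start >= cursor && holes[i].len >= minlen) {
			*start = holes[i].start;
			*len = holes[i].len;
			return 0;
		}
	}
	return -ENOSPC;
}

static int trim_free_extents(uint64_t minlen, uint64_t *trimmed)
{
	uint64_t start = 0, len = 0;
	int ret;

	*trimmed = 0;
	while (1) {
		ret = find_next_hole(start, minlen, &start, &len);
		if (ret) {
			if (ret == -ENOSPC)	/* no more holes: success */
				ret = 0;
			return ret;
		}
		*trimmed += len;	/* a real discard would happen here */
		start += len;		/* advance the cursor past this hole */
	}
}

int main(void)
{
	uint64_t trimmed;

	trim_free_extents(1 << 20, &trimmed);
	printf("trimmed %llu bytes\n", (unsigned long long)trimmed);
	return 0;
}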
 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_group_cache *cache = NULL;
+       struct btrfs_device *device;
+       struct list_head *devices;
        u64 group_trimmed;
        u64 start;
        u64 end;
@@ -10132,6 +10694,18 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
                cache = next_block_group(fs_info->tree_root, cache);
        }
 
+       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+       devices = &root->fs_info->fs_devices->alloc_list;
+       list_for_each_entry(device, devices, dev_alloc_list) {
+               ret = btrfs_trim_free_extents(device, range->minlen,
+                                             &group_trimmed);
+               if (ret)
+                       break;
+
+               trimmed += group_trimmed;
+       }
+       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
        range->len = trimmed;
        return ret;
 }
@@ -10148,8 +10722,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
 {
        percpu_counter_dec(&root->subv_writers->counter);
        /*
-        * Make sure counter is updated before we wake up
-        * waiters.
+        * Make sure counter is updated before we wake up waiters.
         */
        smp_mb();
        if (waitqueue_active(&root->subv_writers->wait))