These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
kernel/fs/btrfs/qgroup.c
index 3d65465..5279fda 100644
@@ -34,6 +34,7 @@
 #include "extent_io.h"
 #include "qgroup.h"
 
+
 /* TODO XXX FIXME
  *  - subvol delete -> delete when ref goes to 0? delete limits also?
  *  - reorganize keys
@@ -84,11 +85,42 @@ struct btrfs_qgroup {
 
        /*
         * temp variables for accounting operations
+        * Refer to qgroup_shared_accounting() for details.
         */
        u64 old_refcnt;
        u64 new_refcnt;
 };
 
+static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq,
+                                          int mod)
+{
+       if (qg->old_refcnt < seq)
+               qg->old_refcnt = seq;
+       qg->old_refcnt += mod;
+}
+
+static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq,
+                                          int mod)
+{
+       if (qg->new_refcnt < seq)
+               qg->new_refcnt = seq;
+       qg->new_refcnt += mod;
+}
+
+static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+       if (qg->old_refcnt < seq)
+               return 0;
+       return qg->old_refcnt - seq;
+}
+
+static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq)
+{
+       if (qg->new_refcnt < seq)
+               return 0;
+       return qg->new_refcnt - seq;
+}
+
 /*
  * glue structure to represent the relations between qgroups.
  */
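
The update/get helper pairs added above rely on a sequence trick: the caller's seq acts as an implicit zero, so any refcnt at or below the current seq reads back as zero, and bumping fs_info->qgroup_seq between accounting rounds "resets" every qgroup without walking the whole tree (the comment deleted from qgroup_calc_old_refcnt() further down describes the same idea). A minimal user-space sketch of the trick, with a hypothetical stand-in struct instead of the real btrfs_qgroup:

    #include <assert.h>
    #include <stdint.h>

    struct qg { uint64_t old_refcnt; };   /* stand-in for btrfs_qgroup */

    static void update_old_refcnt(struct qg *qg, uint64_t seq, int mod)
    {
            if (qg->old_refcnt < seq)
                    qg->old_refcnt = seq;   /* lazily treat stale value as 0 */
            qg->old_refcnt += mod;
    }

    static uint64_t get_old_refcnt(struct qg *qg, uint64_t seq)
    {
            return qg->old_refcnt < seq ? 0 : qg->old_refcnt - seq;
    }

    int main(void)
    {
            struct qg qg = { 0 };
            uint64_t seq = 100;                     /* accounting round 1 */

            assert(get_old_refcnt(&qg, seq) == 0);  /* stale value reads as 0 */
            update_old_refcnt(&qg, seq, 1);
            update_old_refcnt(&qg, seq, 1);
            assert(get_old_refcnt(&qg, seq) == 2);

            seq += 3;                               /* round 2: implicit reset */
            assert(get_old_refcnt(&qg, seq) == 0);
            return 0;
    }
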
@@ -344,7 +376,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
                qgroup = find_qgroup_rb(fs_info, found_key.offset);
                if ((qgroup && found_key.type == BTRFS_QGROUP_INFO_KEY) ||
                    (!qgroup && found_key.type == BTRFS_QGROUP_LIMIT_KEY)) {
-                       btrfs_err(fs_info, "inconsitent qgroup config");
+                       btrfs_err(fs_info, "inconsistent qgroup config");
                        flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
                }
                if (!qgroup) {
@@ -961,9 +993,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
        mutex_lock(&fs_info->qgroup_ioctl_lock);
        if (!fs_info->quota_root)
                goto out;
-       spin_lock(&fs_info->qgroup_lock);
        fs_info->quota_enabled = 0;
        fs_info->pending_quota_state = 0;
+       btrfs_qgroup_wait_for_completion(fs_info);
+       spin_lock(&fs_info->qgroup_lock);
        quota_root = fs_info->quota_root;
        fs_info->quota_root = NULL;
        fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
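
The reordering in this hunk appears to be driven by locking rules: the newly added btrfs_qgroup_wait_for_completion() waits for the rescan worker and may sleep, so it has to run before the fs_info->qgroup_lock spinlock is taken. A comment-only sketch of the hazard the new ordering avoids (a hypothetical mis-ordering, not code from the patch):

    /*
     * spin_lock(&fs_info->qgroup_lock);
     * btrfs_qgroup_wait_for_completion(fs_info);  <- may sleep: sleeping
     * spin_unlock(&fs_info->qgroup_lock);            under a spinlock is a bug
     *
     * Moving the wait before spin_lock(), as the hunk above does, keeps the
     * sleepable call outside the spinlock-protected section.
     */
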
@@ -1115,14 +1148,14 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
        struct ulist *tmp;
        int ret = 0;
 
-       tmp = ulist_alloc(GFP_NOFS);
-       if (!tmp)
-               return -ENOMEM;
-
        /* Check the level of src and dst first */
        if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst))
                return -EINVAL;
 
+       tmp = ulist_alloc(GFP_NOFS);
+       if (!tmp)
+               return -ENOMEM;
+
        mutex_lock(&fs_info->qgroup_ioctl_lock);
        quota_root = fs_info->quota_root;
        if (!quota_root) {
@@ -1317,6 +1350,11 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
        int ret = 0;
+       /* Sometimes we would want to clear the limit on this qgroup.
+        * To meet this requirement, we treat -1 as a special value
+        * which tells the kernel to clear the limit on this qgroup.
+        */
+       const u64 CLEAR_VALUE = -1;
 
        mutex_lock(&fs_info->qgroup_ioctl_lock);
        quota_root = fs_info->quota_root;
@@ -1332,14 +1370,42 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
        }
 
        spin_lock(&fs_info->qgroup_lock);
-       if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER)
-               qgroup->max_rfer = limit->max_rfer;
-       if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL)
-               qgroup->max_excl = limit->max_excl;
-       if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER)
-               qgroup->rsv_rfer = limit->rsv_rfer;
-       if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL)
-               qgroup->rsv_excl = limit->rsv_excl;
+       if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) {
+               if (limit->max_rfer == CLEAR_VALUE) {
+                       qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+                       limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER;
+                       qgroup->max_rfer = 0;
+               } else {
+                       qgroup->max_rfer = limit->max_rfer;
+               }
+       }
+       if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
+               if (limit->max_excl == CLEAR_VALUE) {
+                       qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+                       limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL;
+                       qgroup->max_excl = 0;
+               } else {
+                       qgroup->max_excl = limit->max_excl;
+               }
+       }
+       if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) {
+               if (limit->rsv_rfer == CLEAR_VALUE) {
+                       qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+                       limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER;
+                       qgroup->rsv_rfer = 0;
+               } else {
+                       qgroup->rsv_rfer = limit->rsv_rfer;
+               }
+       }
+       if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) {
+               if (limit->rsv_excl == CLEAR_VALUE) {
+                       qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+                       limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL;
+                       qgroup->rsv_excl = 0;
+               } else {
+                       qgroup->rsv_excl = limit->rsv_excl;
+               }
+       }
        qgroup->lim_flags |= limit->flags;
 
        spin_unlock(&fs_info->qgroup_lock);
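
For context, here is a hedged user-space sketch of how a limit would be cleared through the qgroup-limit ioctl by passing (u64)-1, matching the CLEAR_VALUE convention above. The helper name is made up and error handling is elided; fd is assumed to be an open descriptor on the mounted filesystem:

    #include <stdint.h>
    #include <string.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    /* Clear the max_rfer limit on a qgroup by sending CLEAR_VALUE (-1). */
    static int clear_rfer_limit(int fd, uint64_t qgroupid)
    {
            struct btrfs_ioctl_qgroup_limit_args args;

            memset(&args, 0, sizeof(args));
            args.qgroupid = qgroupid;
            args.lim.flags = BTRFS_QGROUP_LIMIT_MAX_RFER;
            args.lim.max_rfer = (uint64_t)-1;       /* CLEAR_VALUE */
            return ioctl(fd, BTRFS_IOC_QGROUP_LIMIT, &args);
    }
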
@@ -1356,239 +1422,88 @@ out:
        return ret;
 }
 
-static int comp_oper_exist(struct btrfs_qgroup_operation *oper1,
-                          struct btrfs_qgroup_operation *oper2)
+int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
+                                        struct btrfs_fs_info *fs_info)
 {
-       /*
-        * Ignore seq and type here, we're looking for any operation
-        * at all related to this extent on that root.
-        */
-       if (oper1->bytenr < oper2->bytenr)
-               return -1;
-       if (oper1->bytenr > oper2->bytenr)
-               return 1;
-       if (oper1->ref_root < oper2->ref_root)
-               return -1;
-       if (oper1->ref_root > oper2->ref_root)
-               return 1;
-       return 0;
-}
+       struct btrfs_qgroup_extent_record *record;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct rb_node *node;
+       u64 qgroup_to_skip;
+       int ret = 0;
 
-static int qgroup_oper_exists(struct btrfs_fs_info *fs_info,
-                             struct btrfs_qgroup_operation *oper)
-{
-       struct rb_node *n;
-       struct btrfs_qgroup_operation *cur;
-       int cmp;
+       delayed_refs = &trans->transaction->delayed_refs;
+       qgroup_to_skip = delayed_refs->qgroup_to_skip;
 
-       spin_lock(&fs_info->qgroup_op_lock);
-       n = fs_info->qgroup_op_tree.rb_node;
-       while (n) {
-               cur = rb_entry(n, struct btrfs_qgroup_operation, n);
-               cmp = comp_oper_exist(cur, oper);
-               if (cmp < 0) {
-                       n = n->rb_right;
-               } else if (cmp) {
-                       n = n->rb_left;
-               } else {
-                       spin_unlock(&fs_info->qgroup_op_lock);
-                       return -EEXIST;
-               }
+       /*
+        * No need to take any lock, since this function will only be
+        * called in btrfs_commit_transaction().
+        */
+       node = rb_first(&delayed_refs->dirty_extent_root);
+       while (node) {
+               record = rb_entry(node, struct btrfs_qgroup_extent_record,
+                                 node);
+               ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0,
+                                          &record->old_roots);
+               if (ret < 0)
+                       break;
+               if (qgroup_to_skip)
+                       ulist_del(record->old_roots, qgroup_to_skip, 0);
+               node = rb_next(node);
        }
-       spin_unlock(&fs_info->qgroup_op_lock);
-       return 0;
+       return ret;
 }
 
-static int comp_oper(struct btrfs_qgroup_operation *oper1,
-                    struct btrfs_qgroup_operation *oper2)
+struct btrfs_qgroup_extent_record
+*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
+                                 struct btrfs_qgroup_extent_record *record)
 {
-       if (oper1->bytenr < oper2->bytenr)
-               return -1;
-       if (oper1->bytenr > oper2->bytenr)
-               return 1;
-       if (oper1->ref_root < oper2->ref_root)
-               return -1;
-       if (oper1->ref_root > oper2->ref_root)
-               return 1;
-       if (oper1->seq < oper2->seq)
-               return -1;
-       if (oper1->seq > oper2->seq)
-               return 1;
-       if (oper1->type < oper2->type)
-               return -1;
-       if (oper1->type > oper2->type)
-               return 1;
-       return 0;
-}
+       struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
+       struct rb_node *parent_node = NULL;
+       struct btrfs_qgroup_extent_record *entry;
+       u64 bytenr = record->bytenr;
 
-static int insert_qgroup_oper(struct btrfs_fs_info *fs_info,
-                             struct btrfs_qgroup_operation *oper)
-{
-       struct rb_node **p;
-       struct rb_node *parent = NULL;
-       struct btrfs_qgroup_operation *cur;
-       int cmp;
+       assert_spin_locked(&delayed_refs->lock);
 
-       spin_lock(&fs_info->qgroup_op_lock);
-       p = &fs_info->qgroup_op_tree.rb_node;
        while (*p) {
-               parent = *p;
-               cur = rb_entry(parent, struct btrfs_qgroup_operation, n);
-               cmp = comp_oper(cur, oper);
-               if (cmp < 0) {
-                       p = &(*p)->rb_right;
-               } else if (cmp) {
+               parent_node = *p;
+               entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record,
+                                node);
+               if (bytenr < entry->bytenr)
                        p = &(*p)->rb_left;
-               } else {
-                       spin_unlock(&fs_info->qgroup_op_lock);
-                       return -EEXIST;
-               }
-       }
-       rb_link_node(&oper->n, parent, p);
-       rb_insert_color(&oper->n, &fs_info->qgroup_op_tree);
-       spin_unlock(&fs_info->qgroup_op_lock);
-       return 0;
-}
-
-/*
- * Record a quota operation for processing later on.
- * @trans: the transaction we are adding the delayed op to.
- * @fs_info: the fs_info for this fs.
- * @ref_root: the root of the reference we are acting on,
- * @bytenr: the bytenr we are acting on.
- * @num_bytes: the number of bytes in the reference.
- * @type: the type of operation this is.
- * @mod_seq: do we need to get a sequence number for looking up roots.
- *
- * We just add it to our trans qgroup_ref_list and carry on and process these
- * operations in order at some later point.  If the reference root isn't a fs
- * root then we don't bother with doing anything.
- *
- * MUST BE HOLDING THE REF LOCK.
- */
-int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
-                           struct btrfs_fs_info *fs_info, u64 ref_root,
-                           u64 bytenr, u64 num_bytes,
-                           enum btrfs_qgroup_operation_type type, int mod_seq)
-{
-       struct btrfs_qgroup_operation *oper;
-       int ret;
-
-       if (!is_fstree(ref_root) || !fs_info->quota_enabled)
-               return 0;
-
-       oper = kmalloc(sizeof(*oper), GFP_NOFS);
-       if (!oper)
-               return -ENOMEM;
-
-       oper->ref_root = ref_root;
-       oper->bytenr = bytenr;
-       oper->num_bytes = num_bytes;
-       oper->type = type;
-       oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq);
-       INIT_LIST_HEAD(&oper->elem.list);
-       oper->elem.seq = 0;
-
-       trace_btrfs_qgroup_record_ref(oper);
-
-       if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) {
-               /*
-                * If any operation for this bytenr/ref_root combo
-                * exists, then we know it's not exclusively owned and
-                * shouldn't be queued up.
-                *
-                * This also catches the case where we have a cloned
-                * extent that gets queued up multiple times during
-                * drop snapshot.
-                */
-               if (qgroup_oper_exists(fs_info, oper)) {
-                       kfree(oper);
-                       return 0;
-               }
-       }
-
-       ret = insert_qgroup_oper(fs_info, oper);
-       if (ret) {
-               /* Shouldn't happen so have an assert for developers */
-               ASSERT(0);
-               kfree(oper);
-               return ret;
+               else if (bytenr > entry->bytenr)
+                       p = &(*p)->rb_right;
+               else
+                       return entry;
        }
-       list_add_tail(&oper->list, &trans->qgroup_ref_list);
-
-       if (mod_seq)
-               btrfs_get_tree_mod_seq(fs_info, &oper->elem);
 
-       return 0;
-}
-
-static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
-                                 struct btrfs_qgroup_operation *oper)
-{
-       struct ulist *tmp;
-       int sign = 0;
-       int ret = 0;
-
-       tmp = ulist_alloc(GFP_NOFS);
-       if (!tmp)
-               return -ENOMEM;
-
-       spin_lock(&fs_info->qgroup_lock);
-       if (!fs_info->quota_root)
-               goto out;
-
-       switch (oper->type) {
-       case BTRFS_QGROUP_OPER_ADD_EXCL:
-               sign = 1;
-               break;
-       case BTRFS_QGROUP_OPER_SUB_EXCL:
-               sign = -1;
-               break;
-       default:
-               ASSERT(0);
-       }
-       ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root,
-                                      oper->num_bytes, sign);
-out:
-       spin_unlock(&fs_info->qgroup_lock);
-       ulist_free(tmp);
-       return ret;
+       rb_link_node(&record->node, parent_node, p);
+       rb_insert_color(&record->node, &delayed_refs->dirty_extent_root);
+       return NULL;
 }
 
+#define UPDATE_NEW     0
+#define UPDATE_OLD     1
 /*
- * Walk all of the roots that pointed to our bytenr and adjust their refcnts as
- * properly.
+ * Walk all of the roots that point to the bytenr and adjust their refcnts.
  */
-static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
-                                 u64 root_to_skip, struct ulist *tmp,
-                                 struct ulist *roots, struct ulist *qgroups,
-                                 u64 seq, int *old_roots, int rescan)
+static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
+                               struct ulist *roots, struct ulist *tmp,
+                               struct ulist *qgroups, u64 seq, int update_old)
 {
        struct ulist_node *unode;
        struct ulist_iterator uiter;
        struct ulist_node *tmp_unode;
        struct ulist_iterator tmp_uiter;
        struct btrfs_qgroup *qg;
-       int ret;
+       int ret = 0;
 
+       if (!roots)
+               return 0;
        ULIST_ITER_INIT(&uiter);
        while ((unode = ulist_next(roots, &uiter))) {
-               /* We don't count our current root here */
-               if (unode->val == root_to_skip)
-                       continue;
                qg = find_qgroup_rb(fs_info, unode->val);
                if (!qg)
                        continue;
-               /*
-                * We could have a pending removal of this same ref so we may
-                * not have actually found our ref root when doing
-                * btrfs_find_all_roots, so we need to keep track of how many
-                * old roots we find in case we removed ours and added a
-                * different one at the same time.  I don't think this could
-                * happen in practice but that sort of thinking leads to pain
-                * and suffering and to the dark side.
-                */
-               (*old_roots)++;
 
                ulist_reinit(tmp);
                ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
@@ -1603,29 +1518,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
                        struct btrfs_qgroup_list *glist;
 
                        qg = u64_to_ptr(tmp_unode->aux);
-                       /*
-                        * We use this sequence number to keep from having to
-                        * run the whole list and 0 out the refcnt every time.
-                        * We basically use sequnce as the known 0 count and
-                        * then add 1 everytime we see a qgroup.  This is how we
-                        * get how many of the roots actually point up to the
-                        * upper level qgroups in order to determine exclusive
-                        * counts.
-                        *
-                        * For rescan we want to set old_refcnt to seq so our
-                        * exclusive calculations end up correct.
-                        */
-                       if (rescan)
-                               qg->old_refcnt = seq;
-                       else if (qg->old_refcnt < seq)
-                               qg->old_refcnt = seq + 1;
+                       if (update_old)
+                               btrfs_qgroup_update_old_refcnt(qg, seq, 1);
                        else
-                               qg->old_refcnt++;
-
-                       if (qg->new_refcnt < seq)
-                               qg->new_refcnt = seq + 1;
-                       else
-                               qg->new_refcnt++;
+                               btrfs_qgroup_update_new_refcnt(qg, seq, 1);
                        list_for_each_entry(glist, &qg->groups, next_group) {
                                ret = ulist_add(qgroups, glist->group->qgroupid,
                                                ptr_to_u64(glist->group),
@@ -1644,161 +1540,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * We need to walk forward in our operation tree and account for any roots that
- * were deleted after we made this operation.
- */
-static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info,
-                                      struct btrfs_qgroup_operation *oper,
-                                      struct ulist *tmp,
-                                      struct ulist *qgroups, u64 seq,
-                                      int *old_roots)
-{
-       struct ulist_node *unode;
-       struct ulist_iterator uiter;
-       struct btrfs_qgroup *qg;
-       struct btrfs_qgroup_operation *tmp_oper;
-       struct rb_node *n;
-       int ret;
-
-       ulist_reinit(tmp);
-
-       /*
-        * We only walk forward in the tree since we're only interested in
-        * removals that happened _after_  our operation.
-        */
-       spin_lock(&fs_info->qgroup_op_lock);
-       n = rb_next(&oper->n);
-       spin_unlock(&fs_info->qgroup_op_lock);
-       if (!n)
-               return 0;
-       tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
-       while (tmp_oper->bytenr == oper->bytenr) {
-               /*
-                * If it's not a removal we don't care, additions work out
-                * properly with our refcnt tracking.
-                */
-               if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED &&
-                   tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL)
-                       goto next;
-               qg = find_qgroup_rb(fs_info, tmp_oper->ref_root);
-               if (!qg)
-                       goto next;
-               ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg),
-                               GFP_ATOMIC);
-               if (ret) {
-                       if (ret < 0)
-                               return ret;
-                       /*
-                        * We only want to increase old_roots if this qgroup is
-                        * not already in the list of qgroups.  If it is already
-                        * there then that means it must have been re-added or
-                        * the delete will be discarded because we had an
-                        * existing ref that we haven't looked up yet.  In this
-                        * case we don't want to increase old_roots.  So if ret
-                        * == 1 then we know that this is the first time we've
-                        * seen this qgroup and we can bump the old_roots.
-                        */
-                       (*old_roots)++;
-                       ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg),
-                                       GFP_ATOMIC);
-                       if (ret < 0)
-                               return ret;
-               }
-next:
-               spin_lock(&fs_info->qgroup_op_lock);
-               n = rb_next(&tmp_oper->n);
-               spin_unlock(&fs_info->qgroup_op_lock);
-               if (!n)
-                       break;
-               tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n);
-       }
-
-       /* Ok now process the qgroups we found */
-       ULIST_ITER_INIT(&uiter);
-       while ((unode = ulist_next(tmp, &uiter))) {
-               struct btrfs_qgroup_list *glist;
-
-               qg = u64_to_ptr(unode->aux);
-               if (qg->old_refcnt < seq)
-                       qg->old_refcnt = seq + 1;
-               else
-                       qg->old_refcnt++;
-               if (qg->new_refcnt < seq)
-                       qg->new_refcnt = seq + 1;
-               else
-                       qg->new_refcnt++;
-               list_for_each_entry(glist, &qg->groups, next_group) {
-                       ret = ulist_add(qgroups, glist->group->qgroupid,
-                                       ptr_to_u64(glist->group), GFP_ATOMIC);
-                       if (ret < 0)
-                               return ret;
-                       ret = ulist_add(tmp, glist->group->qgroupid,
-                                       ptr_to_u64(glist->group), GFP_ATOMIC);
-                       if (ret < 0)
-                               return ret;
-               }
-       }
-       return 0;
-}
-
-/* Add refcnt for the newly added reference. */
-static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info,
-                                 struct btrfs_qgroup_operation *oper,
-                                 struct btrfs_qgroup *qgroup,
-                                 struct ulist *tmp, struct ulist *qgroups,
-                                 u64 seq)
-{
-       struct ulist_node *unode;
-       struct ulist_iterator uiter;
-       struct btrfs_qgroup *qg;
-       int ret;
-
-       ulist_reinit(tmp);
-       ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup),
-                       GFP_ATOMIC);
-       if (ret < 0)
-               return ret;
-       ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup),
-                       GFP_ATOMIC);
-       if (ret < 0)
-               return ret;
-       ULIST_ITER_INIT(&uiter);
-       while ((unode = ulist_next(tmp, &uiter))) {
-               struct btrfs_qgroup_list *glist;
-
-               qg = u64_to_ptr(unode->aux);
-               if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
-                       if (qg->new_refcnt < seq)
-                               qg->new_refcnt = seq + 1;
-                       else
-                               qg->new_refcnt++;
-               } else {
-                       if (qg->old_refcnt < seq)
-                               qg->old_refcnt = seq + 1;
-                       else
-                               qg->old_refcnt++;
-               }
-               list_for_each_entry(glist, &qg->groups, next_group) {
-                       ret = ulist_add(tmp, glist->group->qgroupid,
-                                       ptr_to_u64(glist->group), GFP_ATOMIC);
-                       if (ret < 0)
-                               return ret;
-                       ret = ulist_add(qgroups, glist->group->qgroupid,
-                                       ptr_to_u64(glist->group), GFP_ATOMIC);
-                       if (ret < 0)
-                               return ret;
-               }
-       }
-       return 0;
-}
-
-/*
- * This adjusts the counters for all referenced qgroups if need be.
+ * Update qgroup rfer/excl counters.
+ * Rfer update is easy, the code can explain itself.
+ *
+ * Excl update is tricky, the update is split into 2 parts.
+ * Part 1: Possible exclusive <-> sharing detect:
+ *     |       A       |       !A      |
+ *  -------------------------------------
+ *  B  |       *       |       -       |
+ *  -------------------------------------
+ *  !B |       +       |       **      |
+ *  -------------------------------------
+ *
+ * Conditions:
+ * A:  cur_old_roots < nr_old_roots    (not exclusive before)
+ * !A: cur_old_roots == nr_old_roots   (possible exclusive before)
+ * B:  cur_new_roots < nr_new_roots    (not exclusive now)
+ * !B: cur_new_roots == nr_new_roots   (possible exclusive now)
+ *
+ * Results:
+ * +: Possible sharing -> exclusive    -: Possible exclusive -> sharing
+ * *: Definitely not changed.          **: Possibly unchanged.
+ *
+ * For the !A and !B conditions, the exception is the cur_old/new_roots == 0
+ * case.
+ *
+ * To make the logic clear, we first use conditions A and B to split the
+ * combination into 4 results.
+ *
+ * Then, for results "+" and "-", check the old/new_roots == 0 case, as in
+ * them only one variant may be 0.
+ *
+ * Lastly, check result **; since there are 2 variants that may be 0, split
+ * them again (2x2).
+ * But this time we don't need to consider other things; the code and logic
+ * are easy to understand now.
  */
-static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
-                                 u64 root_to_skip, u64 num_bytes,
-                                 struct ulist *qgroups, u64 seq,
-                                 int old_roots, int new_roots, int rescan)
+static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
+                                 struct ulist *qgroups,
+                                 u64 nr_old_roots,
+                                 u64 nr_new_roots,
+                                 u64 num_bytes, u64 seq)
 {
        struct ulist_node *unode;
        struct ulist_iterator uiter;
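
The decision table above can be exercised in isolation. Below is a small, runnable user-space sketch (the helper name excl_delta is hypothetical) that mirrors the four cells and the zero-count special cases, returning +1 when a qgroup gains exclusive ownership of the extent, -1 when it loses it, and 0 when nothing changes:

    #include <assert.h>
    #include <stdint.h>

    static int excl_delta(uint64_t cur_old, uint64_t nr_old,
                          uint64_t cur_new, uint64_t nr_new)
    {
            if (cur_old == nr_old && cur_new < nr_new)      /* "-" cell */
                    return cur_old != 0 ? -1 : 0;
            if (cur_old < nr_old && cur_new == nr_new)      /* "+" cell */
                    return cur_new != 0 ? 1 : 0;
            if (cur_old == nr_old && cur_new == nr_new) {   /* "**" cell */
                    if (cur_old == 0)
                            return cur_new != 0 ? 1 : 0;    /* none -> excl */
                    return cur_new == 0 ? -1 : 0;           /* excl -> none */
            }
            return 0;                                       /* "*" cell */
    }

    int main(void)
    {
            /* was shared by 2 roots, now only this root: shared -> exclusive */
            assert(excl_delta(1, 2, 1, 1) == 1);
            /* was exclusive, a second root now references it */
            assert(excl_delta(1, 1, 1, 2) == -1);
            /* never referenced by this qgroup: nothing changes */
            assert(excl_delta(0, 2, 0, 2) == 0);
            return 0;
    }
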
@@ -1810,57 +1591,68 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
                bool dirty = false;
 
                qg = u64_to_ptr(unode->aux);
-               /*
-                * Wasn't referenced before but is now, add to the reference
-                * counters.
-                */
-               if (qg->old_refcnt <= seq && qg->new_refcnt > seq) {
+               cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
+               cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
+
+               /* Rfer update part */
+               if (cur_old_count == 0 && cur_new_count > 0) {
                        qg->rfer += num_bytes;
                        qg->rfer_cmpr += num_bytes;
                        dirty = true;
                }
-
-               /*
-                * Was referenced before but isn't now, subtract from the
-                * reference counters.
-                */
-               if (qg->old_refcnt > seq && qg->new_refcnt <= seq) {
+               if (cur_old_count > 0 && cur_new_count == 0) {
                        qg->rfer -= num_bytes;
                        qg->rfer_cmpr -= num_bytes;
                        dirty = true;
                }
 
-               if (qg->old_refcnt < seq)
-                       cur_old_count = 0;
-               else
-                       cur_old_count = qg->old_refcnt - seq;
-               if (qg->new_refcnt < seq)
-                       cur_new_count = 0;
-               else
-                       cur_new_count = qg->new_refcnt - seq;
+               /* Excl update part */
+               /* Exclusive/none -> shared case */
+               if (cur_old_count == nr_old_roots &&
+                   cur_new_count < nr_new_roots) {
+                       /* Exclusive -> shared */
+                       if (cur_old_count != 0) {
+                               qg->excl -= num_bytes;
+                               qg->excl_cmpr -= num_bytes;
+                               dirty = true;
+                       }
+               }
 
-               /*
-                * If our refcount was the same as the roots previously but our
-                * new count isn't the same as the number of roots now then we
-                * went from having a exclusive reference on this range to not.
-                */
-               if (old_roots && cur_old_count == old_roots &&
-                   (cur_new_count != new_roots || new_roots == 0)) {
-                       WARN_ON(cur_new_count != new_roots && new_roots == 0);
-                       qg->excl -= num_bytes;
-                       qg->excl_cmpr -= num_bytes;
-                       dirty = true;
+               /* Shared -> exclusive/none case */
+               if (cur_old_count < nr_old_roots &&
+                   cur_new_count == nr_new_roots) {
+                       /* Shared->exclusive */
+                       if (cur_new_count != 0) {
+                               qg->excl += num_bytes;
+                               qg->excl_cmpr += num_bytes;
+                               dirty = true;
+                       }
                }
 
-               /*
-                * If we didn't reference all the roots before but now we do we
-                * have an exclusive reference to this range.
-                */
-               if ((!old_roots || (old_roots && cur_old_count != old_roots))
-                   && cur_new_count == new_roots) {
-                       qg->excl += num_bytes;
-                       qg->excl_cmpr += num_bytes;
-                       dirty = true;
+               /* Exclusive/none -> exclusive/none case */
+               if (cur_old_count == nr_old_roots &&
+                   cur_new_count == nr_new_roots) {
+                       if (cur_old_count == 0) {
+                               /* None -> exclusive/none */
+
+                               if (cur_new_count != 0) {
+                                       /* None -> exclusive */
+                                       qg->excl += num_bytes;
+                                       qg->excl_cmpr += num_bytes;
+                                       dirty = true;
+                               }
+                               /* None -> none, nothing changed */
+                       } else {
+                               /* Exclusive -> exclusive/none */
+
+                               if (cur_new_count == 0) {
+                                       /* Exclusive -> none */
+                                       qg->excl -= num_bytes;
+                                       qg->excl_cmpr -= num_bytes;
+                                       dirty = true;
+                               }
+                               /* Exclusive -> exclusive, nothing changed */
+                       }
                }
 
                if (dirty)
@@ -1869,364 +1661,122 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info,
        return 0;
 }
 
-/*
- * If we removed a data extent and there were other references for that bytenr
- * then we need to lookup all referenced roots to make sure we still don't
- * reference this bytenr.  If we do then we can just discard this operation.
- */
-static int check_existing_refs(struct btrfs_trans_handle *trans,
-                              struct btrfs_fs_info *fs_info,
-                              struct btrfs_qgroup_operation *oper)
-{
-       struct ulist *roots = NULL;
-       struct ulist_node *unode;
-       struct ulist_iterator uiter;
-       int ret = 0;
-
-       ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
-                                  oper->elem.seq, &roots);
-       if (ret < 0)
-               return ret;
-       ret = 0;
-
-       ULIST_ITER_INIT(&uiter);
-       while ((unode = ulist_next(roots, &uiter))) {
-               if (unode->val == oper->ref_root) {
-                       ret = 1;
-                       break;
-               }
-       }
-       ulist_free(roots);
-       btrfs_put_tree_mod_seq(fs_info, &oper->elem);
-
-       return ret;
-}
-
-/*
- * If we share a reference across multiple roots then we may need to adjust
- * various qgroups referenced and exclusive counters.  The basic premise is this
- *
- * 1) We have seq to represent a 0 count.  Instead of looping through all of the
- * qgroups and resetting their refcount to 0 we just constantly bump this
- * sequence number to act as the base reference count.  This means that if
- * anybody is equal to or below this sequence they were never referenced.  We
- * jack this sequence up by the number of roots we found each time in order to
- * make sure we don't have any overlap.
- *
- * 2) We first search all the roots that reference the area _except_ the root
- * we're acting on currently.  This makes up the old_refcnt of all the qgroups
- * before.
- *
- * 3) We walk all of the qgroups referenced by the root we are currently acting
- * on, and will either adjust old_refcnt in the case of a removal or the
- * new_refcnt in the case of an addition.
- *
- * 4) Finally we walk all the qgroups that are referenced by this range
- * including the root we are acting on currently.  We will adjust the counters
- * based on the number of roots we had and will have after this operation.
- *
- * Take this example as an illustration
- *
- *                     [qgroup 1/0]
- *                  /         |          \
- *             [qg 0/0]   [qg 0/1]     [qg 0/2]
- *                \          |            /
- *               [        extent           ]
- *
- * Say we are adding a reference that is covered by qg 0/0.  The first step
- * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with
- * old_roots being 2.  Because it is adding new_roots will be 1.  We then go
- * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's
- * new_refcnt, bringing it to 3.  We then walk through all of the qgroups, we
- * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a
- * reference and thus must add the size to the referenced bytes.  Everything
- * else is the same so nothing else changes.
- */
-static int qgroup_shared_accounting(struct btrfs_trans_handle *trans,
-                                   struct btrfs_fs_info *fs_info,
-                                   struct btrfs_qgroup_operation *oper)
+int
+btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
+                           struct btrfs_fs_info *fs_info,
+                           u64 bytenr, u64 num_bytes,
+                           struct ulist *old_roots, struct ulist *new_roots)
 {
-       struct ulist *roots = NULL;
-       struct ulist *qgroups, *tmp;
-       struct btrfs_qgroup *qgroup;
-       struct seq_list elem = SEQ_LIST_INIT(elem);
+       struct ulist *qgroups = NULL;
+       struct ulist *tmp = NULL;
        u64 seq;
-       int old_roots = 0;
-       int new_roots = 0;
+       u64 nr_new_roots = 0;
+       u64 nr_old_roots = 0;
        int ret = 0;
 
-       if (oper->elem.seq) {
-               ret = check_existing_refs(trans, fs_info, oper);
-               if (ret < 0)
-                       return ret;
-               if (ret)
-                       return 0;
-       }
+       if (new_roots)
+               nr_new_roots = new_roots->nnodes;
+       if (old_roots)
+               nr_old_roots = old_roots->nnodes;
 
-       qgroups = ulist_alloc(GFP_NOFS);
-       if (!qgroups)
-               return -ENOMEM;
+       if (!fs_info->quota_enabled)
+               goto out_free;
+       BUG_ON(!fs_info->quota_root);
 
+       qgroups = ulist_alloc(GFP_NOFS);
+       if (!qgroups) {
+               ret = -ENOMEM;
+               goto out_free;
+       }
        tmp = ulist_alloc(GFP_NOFS);
        if (!tmp) {
-               ulist_free(qgroups);
-               return -ENOMEM;
+               ret = -ENOMEM;
+               goto out_free;
        }
 
-       btrfs_get_tree_mod_seq(fs_info, &elem);
-       ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq,
-                                  &roots);
-       btrfs_put_tree_mod_seq(fs_info, &elem);
-       if (ret < 0) {
-               ulist_free(qgroups);
-               ulist_free(tmp);
-               return ret;
+       mutex_lock(&fs_info->qgroup_rescan_lock);
+       if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+               if (fs_info->qgroup_rescan_progress.objectid <= bytenr) {
+                       mutex_unlock(&fs_info->qgroup_rescan_lock);
+                       ret = 0;
+                       goto out_free;
+               }
        }
+       mutex_unlock(&fs_info->qgroup_rescan_lock);
+
        spin_lock(&fs_info->qgroup_lock);
-       qgroup = find_qgroup_rb(fs_info, oper->ref_root);
-       if (!qgroup)
-               goto out;
        seq = fs_info->qgroup_seq;
 
-       /*
-        * So roots is the list of all the roots currently pointing at the
-        * bytenr, including the ref we are adding if we are adding, or not if
-        * we are removing a ref.  So we pass in the ref_root to skip that root
-        * in our calculations.  We set old_refnct and new_refcnt cause who the
-        * hell knows what everything looked like before, and it doesn't matter
-        * except...
-        */
-       ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups,
-                                    seq, &old_roots, 0);
-       if (ret < 0)
-               goto out;
-
-       /*
-        * Now adjust the refcounts of the qgroups that care about this
-        * reference, either the old_count in the case of removal or new_count
-        * in the case of an addition.
-        */
-       ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups,
-                                    seq);
+       /* Update old refcnts using old_roots */
+       ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq,
+                                  UPDATE_OLD);
        if (ret < 0)
                goto out;
 
-       /*
-        * ...in the case of removals.  If we had a removal before we got around
-        * to processing this operation then we need to find that guy and count
-        * his references as if they really existed so we don't end up screwing
-        * up the exclusive counts.  Then whenever we go to process the delete
-        * everything will be grand and we can account for whatever exclusive
-        * changes need to be made there.  We also have to pass in old_roots so
-        * we have an accurate count of the roots as it pertains to this
-        * operations view of the world.
-        */
-       ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq,
-                                         &old_roots);
+       /* Update new refcnts using new_roots */
+       ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq,
+                                  UPDATE_NEW);
        if (ret < 0)
                goto out;
 
-       /*
-        * We are adding our root, need to adjust up the number of roots,
-        * otherwise old_roots is the number of roots we want.
-        */
-       if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) {
-               new_roots = old_roots + 1;
-       } else {
-               new_roots = old_roots;
-               old_roots++;
-       }
-       fs_info->qgroup_seq += old_roots + 1;
-
+       qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
+                              num_bytes, seq);
 
        /*
-        * And now the magic happens, bless Arne for having a pretty elegant
-        * solution for this.
+        * Bump qgroup_seq to avoid seq overlap
         */
-       qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes,
-                              qgroups, seq, old_roots, new_roots, 0);
+       fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;
 out:
        spin_unlock(&fs_info->qgroup_lock);
-       ulist_free(qgroups);
-       ulist_free(roots);
+out_free:
        ulist_free(tmp);
+       ulist_free(qgroups);
+       ulist_free(old_roots);
+       ulist_free(new_roots);
        return ret;
 }
 
-/*
- * Process a reference to a shared subtree. This type of operation is
- * queued during snapshot removal when we encounter extents which are
- * shared between more than one root.
- */
-static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans,
-                                    struct btrfs_fs_info *fs_info,
-                                    struct btrfs_qgroup_operation *oper)
-{
-       struct ulist *roots = NULL;
-       struct ulist_node *unode;
-       struct ulist_iterator uiter;
-       struct btrfs_qgroup_list *glist;
-       struct ulist *parents;
-       int ret = 0;
-       int err;
-       struct btrfs_qgroup *qg;
-       u64 root_obj = 0;
-       struct seq_list elem = SEQ_LIST_INIT(elem);
-
-       parents = ulist_alloc(GFP_NOFS);
-       if (!parents)
-               return -ENOMEM;
-
-       btrfs_get_tree_mod_seq(fs_info, &elem);
-       ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr,
-                                  elem.seq, &roots);
-       btrfs_put_tree_mod_seq(fs_info, &elem);
-       if (ret < 0)
-               goto out;
-
-       if (roots->nnodes != 1)
-               goto out;
-
-       ULIST_ITER_INIT(&uiter);
-       unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */
-       /*
-        * If we find our ref root then that means all refs
-        * this extent has to the root have not yet been
-        * deleted. In that case, we do nothing and let the
-        * last ref for this bytenr drive our update.
-        *
-        * This can happen for example if an extent is
-        * referenced multiple times in a snapshot (clone,
-        * etc). If we are in the middle of snapshot removal,
-        * queued updates for such an extent will find the
-        * root if we have not yet finished removing the
-        * snapshot.
-        */
-       if (unode->val == oper->ref_root)
-               goto out;
-
-       root_obj = unode->val;
-       BUG_ON(!root_obj);
-
-       spin_lock(&fs_info->qgroup_lock);
-       qg = find_qgroup_rb(fs_info, root_obj);
-       if (!qg)
-               goto out_unlock;
-
-       qg->excl += oper->num_bytes;
-       qg->excl_cmpr += oper->num_bytes;
-       qgroup_dirty(fs_info, qg);
-
-       /*
-        * Adjust counts for parent groups. First we find all
-        * parents, then in the 2nd loop we do the adjustment
-        * while adding parents of the parents to our ulist.
-        */
-       list_for_each_entry(glist, &qg->groups, next_group) {
-               err = ulist_add(parents, glist->group->qgroupid,
-                               ptr_to_u64(glist->group), GFP_ATOMIC);
-               if (err < 0) {
-                       ret = err;
-                       goto out_unlock;
-               }
-       }
-
-       ULIST_ITER_INIT(&uiter);
-       while ((unode = ulist_next(parents, &uiter))) {
-               qg = u64_to_ptr(unode->aux);
-               qg->excl += oper->num_bytes;
-               qg->excl_cmpr += oper->num_bytes;
-               qgroup_dirty(fs_info, qg);
-
-               /* Add any parents of the parents */
-               list_for_each_entry(glist, &qg->groups, next_group) {
-                       err = ulist_add(parents, glist->group->qgroupid,
-                                       ptr_to_u64(glist->group), GFP_ATOMIC);
-                       if (err < 0) {
-                               ret = err;
-                               goto out_unlock;
-                       }
-               }
-       }
-
-out_unlock:
-       spin_unlock(&fs_info->qgroup_lock);
-
-out:
-       ulist_free(roots);
-       ulist_free(parents);
-       return ret;
-}
-
-/*
- * btrfs_qgroup_account_ref is called for every ref that is added to or deleted
- * from the fs. First, all roots referencing the extent are searched, and
- * then the space is accounted accordingly to the different roots. The
- * accounting algorithm works in 3 steps documented inline.
- */
-static int btrfs_qgroup_account(struct btrfs_trans_handle *trans,
-                               struct btrfs_fs_info *fs_info,
-                               struct btrfs_qgroup_operation *oper)
+int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
+                                struct btrfs_fs_info *fs_info)
 {
+       struct btrfs_qgroup_extent_record *record;
+       struct btrfs_delayed_ref_root *delayed_refs;
+       struct ulist *new_roots = NULL;
+       struct rb_node *node;
+       u64 qgroup_to_skip;
        int ret = 0;
 
-       if (!fs_info->quota_enabled)
-               return 0;
-
-       BUG_ON(!fs_info->quota_root);
+       delayed_refs = &trans->transaction->delayed_refs;
+       qgroup_to_skip = delayed_refs->qgroup_to_skip;
+       while ((node = rb_first(&delayed_refs->dirty_extent_root))) {
+               record = rb_entry(node, struct btrfs_qgroup_extent_record,
+                                 node);
 
-       mutex_lock(&fs_info->qgroup_rescan_lock);
-       if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
-               if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) {
-                       mutex_unlock(&fs_info->qgroup_rescan_lock);
-                       return 0;
+               if (!ret) {
+                       /*
+                        * Use (u64)-1 as time_seq to do a special search, which
+                        * doesn't lock the tree or delayed_refs and searches the
+                        * current root. It's safe inside commit_transaction().
+                        */
+                       ret = btrfs_find_all_roots(trans, fs_info,
+                                       record->bytenr, (u64)-1, &new_roots);
+                       if (ret < 0)
+                               goto cleanup;
+                       if (qgroup_to_skip)
+                               ulist_del(new_roots, qgroup_to_skip, 0);
+                       ret = btrfs_qgroup_account_extent(trans, fs_info,
+                                       record->bytenr, record->num_bytes,
+                                       record->old_roots, new_roots);
+                       record->old_roots = NULL;
+                       new_roots = NULL;
                }
-       }
-       mutex_unlock(&fs_info->qgroup_rescan_lock);
+cleanup:
+               ulist_free(record->old_roots);
+               ulist_free(new_roots);
+               new_roots = NULL;
+               rb_erase(node, &delayed_refs->dirty_extent_root);
+               kfree(record);
 
-       ASSERT(is_fstree(oper->ref_root));
-
-       trace_btrfs_qgroup_account(oper);
-
-       switch (oper->type) {
-       case BTRFS_QGROUP_OPER_ADD_EXCL:
-       case BTRFS_QGROUP_OPER_SUB_EXCL:
-               ret = qgroup_excl_accounting(fs_info, oper);
-               break;
-       case BTRFS_QGROUP_OPER_ADD_SHARED:
-       case BTRFS_QGROUP_OPER_SUB_SHARED:
-               ret = qgroup_shared_accounting(trans, fs_info, oper);
-               break;
-       case BTRFS_QGROUP_OPER_SUB_SUBTREE:
-               ret = qgroup_subtree_accounting(trans, fs_info, oper);
-               break;
-       default:
-               ASSERT(0);
-       }
-       return ret;
-}
-
-/*
- * Needs to be called everytime we run delayed refs, even if there is an error
- * in order to cleanup outstanding operations.
- */
-int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans,
-                                   struct btrfs_fs_info *fs_info)
-{
-       struct btrfs_qgroup_operation *oper;
-       int ret = 0;
-
-       while (!list_empty(&trans->qgroup_ref_list)) {
-               oper = list_first_entry(&trans->qgroup_ref_list,
-                                       struct btrfs_qgroup_operation, list);
-               list_del_init(&oper->list);
-               if (!ret || !trans->aborted)
-                       ret = btrfs_qgroup_account(trans, fs_info, oper);
-               spin_lock(&fs_info->qgroup_op_lock);
-               rb_erase(&oper->n, &fs_info->qgroup_op_tree);
-               spin_unlock(&fs_info->qgroup_op_lock);
-               btrfs_put_tree_mod_seq(fs_info, &oper->elem);
-               kfree(oper);
        }
        return ret;
 }
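
Taken together, the two entry points above replace the old per-operation accounting with a two-phase, per-transaction walk of the dirty extent records. A hedged sketch of the intended driving order (the real caller is the commit path; this fragment only paraphrases it and the wrapper name is made up):

    static int qgroup_settle_transaction(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info)
    {
            int ret;

            /* Phase 1: resolve old_roots for every dirty extent record. */
            ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
            if (ret < 0)
                    return ret;

            /*
             * Phase 2: resolve new_roots per record and settle rfer/excl
             * via btrfs_qgroup_account_extent() for each of them.
             */
            return btrfs_qgroup_account_extents(trans, fs_info);
    }
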
@@ -2484,7 +2034,7 @@ out:
        return ret;
 }
 
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
 {
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
@@ -2565,14 +2115,13 @@ out:
        return ret;
 }
 
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+                              u64 ref_root, u64 num_bytes)
 {
        struct btrfs_root *quota_root;
        struct btrfs_qgroup *qgroup;
-       struct btrfs_fs_info *fs_info = root->fs_info;
        struct ulist_node *unode;
        struct ulist_iterator uiter;
-       u64 ref_root = root->root_key.objectid;
        int ret = 0;
 
        if (!is_fstree(ref_root))
@@ -2618,6 +2167,11 @@ out:
        spin_unlock(&fs_info->qgroup_lock);
 }
 
+static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+       return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+                                        num_bytes);
+}
+
 void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
 {
        if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
@@ -2637,19 +2191,16 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
  */
 static int
 qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
-                  struct btrfs_trans_handle *trans, struct ulist *qgroups,
-                  struct ulist *tmp, struct extent_buffer *scratch_leaf)
+                  struct btrfs_trans_handle *trans)
 {
        struct btrfs_key found;
+       struct extent_buffer *scratch_leaf = NULL;
        struct ulist *roots = NULL;
        struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
        u64 num_bytes;
-       u64 seq;
-       int new_roots;
        int slot;
        int ret;
 
-       path->leave_spinning = 1;
        mutex_lock(&fs_info->qgroup_rescan_lock);
        ret = btrfs_search_slot_for_read(fs_info->extent_root,
                                         &fs_info->qgroup_rescan_progress,
@@ -2680,7 +2231,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
        fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
 
        btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
-       memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+       scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
+       if (!scratch_leaf) {
+               ret = -ENOMEM;
+               mutex_unlock(&fs_info->qgroup_rescan_lock);
+               goto out;
+       }
+       extent_buffer_get(scratch_leaf);
+       btrfs_tree_read_lock(scratch_leaf);
+       btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
        slot = path->slots[0];
        btrfs_release_path(path);
        mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2695,35 +2254,21 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                else
                        num_bytes = found.offset;
 
-               ulist_reinit(qgroups);
                ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
                                           &roots);
                if (ret < 0)
                        goto out;
-               spin_lock(&fs_info->qgroup_lock);
-               seq = fs_info->qgroup_seq;
-               fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */
-
-               new_roots = 0;
-               ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups,
-                                            seq, &new_roots, 1);
-               if (ret < 0) {
-                       spin_unlock(&fs_info->qgroup_lock);
-                       ulist_free(roots);
-                       goto out;
-               }
-
-               ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups,
-                                            seq, 0, new_roots, 1);
-               if (ret < 0) {
-                       spin_unlock(&fs_info->qgroup_lock);
-                       ulist_free(roots);
+               /* For rescan, just pass old_roots as NULL */
+               ret = btrfs_qgroup_account_extent(trans, fs_info,
+                               found.objectid, num_bytes, NULL, roots);
+               if (ret < 0)
                        goto out;
-               }
-               spin_unlock(&fs_info->qgroup_lock);
-               ulist_free(roots);
        }
 out:
+       if (scratch_leaf) {
+               btrfs_tree_read_unlock_blocking(scratch_leaf);
+               free_extent_buffer(scratch_leaf);
+       }
        btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 
        return ret;
@@ -2735,26 +2280,15 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
                                                     qgroup_rescan_work);
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans = NULL;
-       struct ulist *tmp = NULL, *qgroups = NULL;
-       struct extent_buffer *scratch_leaf = NULL;
        int err = -ENOMEM;
        int ret = 0;
 
        path = btrfs_alloc_path();
        if (!path)
                goto out;
-       qgroups = ulist_alloc(GFP_NOFS);
-       if (!qgroups)
-               goto out;
-       tmp = ulist_alloc(GFP_NOFS);
-       if (!tmp)
-               goto out;
-       scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
-       if (!scratch_leaf)
-               goto out;
 
        err = 0;
-       while (!err) {
+       while (!err && !btrfs_fs_closing(fs_info)) {
                trans = btrfs_start_transaction(fs_info->fs_root, 0);
                if (IS_ERR(trans)) {
                        err = PTR_ERR(trans);
@@ -2763,8 +2297,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
                if (!fs_info->quota_enabled) {
                        err = -EINTR;
                } else {
-                       err = qgroup_rescan_leaf(fs_info, path, trans,
-                                                qgroups, tmp, scratch_leaf);
+                       err = qgroup_rescan_leaf(fs_info, path, trans);
                }
                if (err > 0)
                        btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2773,13 +2306,11 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
        }
 
 out:
-       kfree(scratch_leaf);
-       ulist_free(qgroups);
-       ulist_free(tmp);
        btrfs_free_path(path);
 
        mutex_lock(&fs_info->qgroup_rescan_lock);
-       fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+       if (!btrfs_fs_closing(fs_info))
+               fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
 
        if (err > 0 &&
            fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
@@ -2808,7 +2339,9 @@ out:
        }
        btrfs_end_transaction(trans, fs_info->quota_root);
 
-       if (err >= 0) {
+       if (btrfs_fs_closing(fs_info)) {
+               btrfs_info(fs_info, "qgroup scan paused");
+       } else if (err >= 0) {
                btrfs_info(fs_info, "qgroup scan completed%s",
                        err > 0 ? " (inconsistency flag cleared)" : "");
        } else {
@@ -2856,12 +2389,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
        memset(&fs_info->qgroup_rescan_progress, 0,
                sizeof(fs_info->qgroup_rescan_progress));
        fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+       init_completion(&fs_info->qgroup_rescan_completion);
 
        spin_unlock(&fs_info->qgroup_lock);
        mutex_unlock(&fs_info->qgroup_rescan_lock);
 
-       init_completion(&fs_info->qgroup_rescan_completion);
-
        memset(&fs_info->qgroup_rescan_work, 0,
               sizeof(fs_info->qgroup_rescan_work));
        btrfs_init_work(&fs_info->qgroup_rescan_work,
@@ -2964,3 +2496,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
                btrfs_queue_work(fs_info->qgroup_rescan_workers,
                                 &fs_info->qgroup_rescan_work);
 }
+
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function either reserves space from the related qgroups or does
+ * nothing if the range is already reserved.
+ *
+ * Return 0 for a successful reservation
+ * Return <0 for error (including -EDQUOT)
+ *
+ * NOTE: this function may sleep for memory allocation.
+ */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct extent_changeset changeset;
+       struct ulist_node *unode;
+       struct ulist_iterator uiter;
+       int ret;
+
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+           len == 0)
+               return 0;
+
+       changeset.bytes_changed = 0;
+       changeset.range_changed = ulist_alloc(GFP_NOFS);
+       if (!changeset.range_changed)
+               return -ENOMEM;
+       ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+                       start + len - 1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+                       &changeset);
+       trace_btrfs_qgroup_reserve_data(inode, start, len,
+                                       changeset.bytes_changed,
+                                       QGROUP_RESERVE);
+       if (ret < 0)
+               goto cleanup;
+       ret = qgroup_reserve(root, changeset.bytes_changed);
+       if (ret < 0)
+               goto cleanup;
+
+       ulist_free(changeset.range_changed);
+       return ret;
+
+cleanup:
+       /* cleanup already reserved ranges */
+       ULIST_ITER_INIT(&uiter);
+       while ((unode = ulist_next(changeset.range_changed, &uiter)))
+               clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
+                                unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
+                                GFP_NOFS);
+       ulist_free(changeset.range_changed);
+       return ret;
+}
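
The cleanup path above is what makes the reservation safe: set_record_extent_bits() records in changeset.range_changed exactly which sub-ranges were newly marked EXTENT_QGROUP_RESERVED, so on failure only those bits are cleared and previously reserved ranges are left untouched. A minimal caller sketch, assuming a hypothetical write-path helper (example_prepare_write() and its page rounding are illustrative, not part of this patch):

/*
 * Illustration only: reserve qgroup space for the page-aligned range
 * being dirtied, and hand the error (e.g. -EDQUOT) back to the caller.
 */
static int example_prepare_write(struct inode *inode, u64 pos, u64 count)
{
        u64 start = round_down(pos, PAGE_CACHE_SIZE);
        u64 len = round_up(pos + count, PAGE_CACHE_SIZE) - start;
        int ret;

        ret = btrfs_qgroup_reserve_data(inode, start, len);
        if (ret < 0)
                return ret;
        /*
         * ...dirty the pages; if the write later fails before the data
         * reaches disk, btrfs_qgroup_free_data() must undo the reserve.
         */
        return 0;
}
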
+
+static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
+                                      int free)
+{
+       struct extent_changeset changeset;
+       int trace_op = QGROUP_RELEASE;
+       int ret;
+
+       changeset.bytes_changed = 0;
+       changeset.range_changed = ulist_alloc(GFP_NOFS);
+       if (!changeset.range_changed)
+               return -ENOMEM;
+
+       ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+                       start + len - 1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+                       &changeset);
+       if (ret < 0)
+               goto out;
+
+       if (free) {
+               qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+               trace_op = QGROUP_FREE;
+       }
+       trace_btrfs_qgroup_release_data(inode, start, len,
+                                       changeset.bytes_changed, trace_op);
+out:
+       ulist_free(changeset.range_changed);
+       return ret;
+}
+
+/*
+ * Free a reserved space range from io_tree and related qgroups
+ *
+ * Should be called when a range of pages gets invalidated before reaching
+ * disk, or for the error cleanup case.
+ *
+ * For data written to disk, use btrfs_qgroup_release_data().
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+{
+       return __btrfs_qgroup_release_data(inode, start, len, 1);
+}
+
+/*
+ * Release a reserved space range from io_tree only.
+ *
+ * Should be called when a range of pages has been written to disk and the
+ * corresponding FILE_EXTENT item is inserted into the corresponding root.
+ *
+ * Since the new qgroup accounting framework only updates qgroup numbers at
+ * commit_transaction() time, the reserved space shouldn't be freed from the
+ * related qgroups yet.
+ *
+ * But we should release the range from the io_tree, to allow further writes
+ * to be COWed.
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+{
+       return __btrfs_qgroup_release_data(inode, start, len, 0);
+}
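
Taken together, the two wrappers encode one rule: the io_tree record is always dropped, but the qgroup counters are only refunded when the data never made it to disk. A hedged sketch of how a caller might pick between them (example_finish_write() is hypothetical, not from this patch):

/*
 * Illustration only: on a failed or invalidated write the bytes go back
 * to the qgroup reserve; on success they stay reserved until the new
 * accounting settles them at commit_transaction() time.
 */
static int example_finish_write(struct inode *inode, u64 start, u64 len,
                                bool write_failed)
{
        if (write_failed)
                return btrfs_qgroup_free_data(inode, start, len);
        return btrfs_qgroup_release_data(inode, start, len);
}
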
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
+{
+       int ret;
+
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+           num_bytes == 0)
+               return 0;
+
+       BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+       ret = qgroup_reserve(root, num_bytes);
+       if (ret < 0)
+               return ret;
+       atomic_add(num_bytes, &root->qgroup_meta_rsv);
+       return ret;
+}
+
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+{
+       int reserved;
+
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+               return;
+
+       reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
+       if (reserved == 0)
+               return;
+       qgroup_free(root, reserved);
+}
+
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+{
+       if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+               return;
+
+       BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+       WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
+       atomic_sub(num_bytes, &root->qgroup_meta_rsv);
+       qgroup_free(root, num_bytes);
+}
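
The metadata helpers keep a per-root running total in root->qgroup_meta_rsv, so individual frees must stay nodesize-aligned while btrfs_qgroup_free_meta_all() can drop whatever remains in one atomic_xchg() at the bulk-cleanup point. A sketch of the expected pairing, assuming a hypothetical do_tree_operation() (not part of this patch):

/*
 * Illustration only: reserve nodesize-aligned metadata space up front,
 * and give it back if the tree operation fails.
 */
static int example_meta_op(struct btrfs_root *root)
{
        int nbytes = root->nodesize;    /* must be nodesize-aligned */
        int ret;

        ret = btrfs_qgroup_reserve_meta(root, nbytes);
        if (ret < 0)
                return ret;
        ret = do_tree_operation(root);  /* hypothetical */
        if (ret < 0)
                btrfs_qgroup_free_meta(root, nbytes);
        return ret;
}
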
+
+/*
+ * Check for leaked qgroup reserved space, normally at destroy inode
+ * time
+ */
+void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+{
+       struct extent_changeset changeset;
+       struct ulist_node *unode;
+       struct ulist_iterator iter;
+       int ret;
+
+       changeset.bytes_changed = 0;
+       changeset.range_changed = ulist_alloc(GFP_NOFS);
+       if (WARN_ON(!changeset.range_changed))
+               return;
+
+       ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+                       EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+
+       WARN_ON(ret < 0);
+       if (WARN_ON(changeset.bytes_changed)) {
+               ULIST_ITER_INIT(&iter);
+               while ((unode = ulist_next(changeset.range_changed, &iter))) {
+                       btrfs_warn(BTRFS_I(inode)->root->fs_info,
+                               "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
+                               inode->i_ino, unode->val, unode->aux);
+               }
+               qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+       }
+       ulist_free(changeset.range_changed);
+}
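
Because every data reserve leaves an EXTENT_QGROUP_RESERVED record in the io_tree, any record still present when the inode is torn down is, by construction, a reserve that was never freed or released. A sketch of the intended call site (example_destroy_inode() is illustrative; the real hook sits in the inode teardown path):

static void example_destroy_inode(struct inode *inode)
{
        /* warn about, and reclaim, any reserve that was never settled */
        btrfs_qgroup_check_reserved_leak(inode);
        /* ...the rest of inode teardown... */
}
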