These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] kernel/fs/btrfs/transaction.c
index 00d18c2..be8eae8 100644
@@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 static void clear_btree_io_tree(struct extent_io_tree *tree)
 {
        spin_lock(&tree->lock);
+       /*
+        * Do a single barrier for the waitqueue_active check here; the state
+        * of the waitqueue should not change once clear_btree_io_tree is
+        * called.
+        */
+       smp_mb();
        while (!RB_EMPTY_ROOT(&tree->state)) {
                struct rb_node *node;
                struct extent_state *state;
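
The smp_mb() added above is the waker side of the unlocked waitqueue_active()
pattern: the waker must make its prior stores visible before peeking at the
wait queue, while wait_event() supplies the matching barriers on the waiter
side. A minimal sketch of the pattern, with invented names (my_wq, my_flag),
not code from this patch:

    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_wq);
    static int my_flag;

    static void my_wake(void)           /* waker */
    {
            my_flag = 1;
            smp_mb();   /* publish my_flag before the unlocked check */
            if (waitqueue_active(&my_wq))
                    wake_up(&my_wq);
    }

    static void my_wait(void)           /* waiter */
    {
            wait_event(my_wq, my_flag != 0);
    }
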
@@ -117,6 +123,18 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
                        btrfs_unpin_free_ino(root);
                clear_btree_io_tree(&root->dirty_log_pages);
        }
+
+       /* We can free old roots now. */
+       spin_lock(&trans->dropped_roots_lock);
+       while (!list_empty(&trans->dropped_roots)) {
+               root = list_first_entry(&trans->dropped_roots,
+                                       struct btrfs_root, root_list);
+               list_del_init(&root->root_list);
+               spin_unlock(&trans->dropped_roots_lock);
+               btrfs_drop_and_free_fs_root(fs_info, root);
+               spin_lock(&trans->dropped_roots_lock);
+       }
+       spin_unlock(&trans->dropped_roots_lock);
        up_write(&fs_info->commit_root_sem);
 }
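
The drain loop above uses a standard kernel idiom: detach one entry while the
spinlock is held, drop the lock across the heavyweight call (here
btrfs_drop_and_free_fs_root(), which can sleep), then retake it before testing
the list again. A generic sketch with hypothetical types and helpers:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct item { struct list_head list; };     /* hypothetical */
    static void process(struct item *it);       /* may sleep */

    static void drain(struct list_head *head, spinlock_t *lock)
    {
            spin_lock(lock);
            while (!list_empty(head)) {
                    struct item *it =
                            list_first_entry(head, struct item, list);

                    list_del_init(&it->list);   /* detach while locked */
                    spin_unlock(lock);
                    process(it);                /* lock not held here */
                    spin_lock(lock);
            }
            spin_unlock(lock);
    }
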
 
@@ -214,23 +232,22 @@ loop:
        extwriter_counter_init(cur_trans, type);
        init_waitqueue_head(&cur_trans->writer_wait);
        init_waitqueue_head(&cur_trans->commit_wait);
+       init_waitqueue_head(&cur_trans->pending_wait);
        cur_trans->state = TRANS_STATE_RUNNING;
        /*
         * One for this trans handle, one so it will live on until we
         * commit the transaction.
         */
        atomic_set(&cur_trans->use_count, 2);
-       cur_trans->have_free_bgs = 0;
+       atomic_set(&cur_trans->pending_ordered, 0);
+       cur_trans->flags = 0;
        cur_trans->start_time = get_seconds();
-       cur_trans->dirty_bg_run = 0;
+
+       memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
 
        cur_trans->delayed_refs.href_root = RB_ROOT;
+       cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
        atomic_set(&cur_trans->delayed_refs.num_entries, 0);
-       cur_trans->delayed_refs.num_heads_ready = 0;
-       cur_trans->delayed_refs.pending_csums = 0;
-       cur_trans->delayed_refs.num_heads = 0;
-       cur_trans->delayed_refs.flushing = 0;
-       cur_trans->delayed_refs.run_delayed_start = 0;
 
        /*
         * although the tree mod log is per file system and not per transaction,
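
Replacing the per-field zeroing (the deleted "cur_trans->delayed_refs.x = 0"
lines above) with a single memset() means any member added to delayed_refs
later starts zero-initialized by default, and only explicit non-default setup
remains. The shape of the idiom, with an invented struct:

    #include <linux/rbtree.h>
    #include <linux/string.h>

    struct conf { unsigned long flags; struct rb_root root; int count; };

    static void conf_init(struct conf *c)
    {
            memset(c, 0, sizeof(*c));   /* future members start zeroed too */
            c->root = RB_ROOT;          /* explicit, mirroring the patch */
    }
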
@@ -250,12 +267,14 @@ loop:
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        INIT_LIST_HEAD(&cur_trans->pending_chunks);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
-       INIT_LIST_HEAD(&cur_trans->pending_ordered);
        INIT_LIST_HEAD(&cur_trans->dirty_bgs);
        INIT_LIST_HEAD(&cur_trans->io_bgs);
+       INIT_LIST_HEAD(&cur_trans->dropped_roots);
        mutex_init(&cur_trans->cache_write_mutex);
        cur_trans->num_dirty_bgs = 0;
        spin_lock_init(&cur_trans->dirty_bgs_lock);
+       INIT_LIST_HEAD(&cur_trans->deleted_bgs);
+       spin_lock_init(&cur_trans->dropped_roots_lock);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(&cur_trans->dirty_pages,
                             fs_info->btree_inode->i_mapping);
@@ -332,6 +351,24 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
 }
 
 
+void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root)
+{
+       struct btrfs_transaction *cur_trans = trans->transaction;
+
+       /* Add ourselves to the transaction dropped list */
+       spin_lock(&cur_trans->dropped_roots_lock);
+       list_add_tail(&root->root_list, &cur_trans->dropped_roots);
+       spin_unlock(&cur_trans->dropped_roots_lock);
+
+       /* Make sure we don't try to update the root at commit time */
+       spin_lock(&root->fs_info->fs_roots_radix_lock);
+       radix_tree_tag_clear(&root->fs_info->fs_roots_radix,
+                            (unsigned long)root->root_key.objectid,
+                            BTRFS_ROOT_TRANS_TAG);
+       spin_unlock(&root->fs_info->fs_roots_radix_lock);
+}
+
 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
 {
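
btrfs_add_dropped_root() queues the root for deferred freeing in
switch_commit_roots() and clears its radix-tree tag so that commit no longer
tries to update the dropped root. The tag API operates on (tree, index, tag)
triples; a hedged sketch with made-up names:

    #include <linux/radix-tree.h>

    static RADIX_TREE(my_roots, GFP_ATOMIC);    /* hypothetical tree */
    #define MY_DIRTY_TAG 0

    static void my_tag_example(unsigned long id, void *item)
    {
            /* an entry must exist before a tag can be set on it */
            if (radix_tree_insert(&my_roots, id, item))
                    return;
            radix_tree_tag_set(&my_roots, id, MY_DIRTY_TAG);
            /* hide it from tag-based scans without removing it */
            radix_tree_tag_clear(&my_roots, id, MY_DIRTY_TAG);
    }
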
@@ -411,8 +448,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
 }
 
 static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
-                 enum btrfs_reserve_flush_enum flush)
+start_transaction(struct btrfs_root *root, unsigned int num_items,
+                 unsigned int type, enum btrfs_reserve_flush_enum flush)
 {
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
@@ -442,13 +479,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
         * the appropriate flushing if need be.
         */
        if (num_items > 0 && root != root->fs_info->chunk_root) {
-               if (root->fs_info->quota_enabled &&
-                   is_fstree(root->root_key.objectid)) {
-                       qgroup_reserved = num_items * root->nodesize;
-                       ret = btrfs_qgroup_reserve(root, qgroup_reserved);
-                       if (ret)
-                               return ERR_PTR(ret);
-               }
+               qgroup_reserved = num_items * root->nodesize;
+               ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+               if (ret)
+                       return ERR_PTR(ret);
 
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
                /*
@@ -466,7 +500,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
                        goto reserve_fail;
        }
 again:
-       h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+       h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h) {
                ret = -ENOMEM;
                goto alloc_fail;
@@ -507,24 +541,13 @@ again:
 
        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
-       h->blocks_used = 0;
-       h->bytes_reserved = 0;
        h->root = root;
-       h->delayed_ref_updates = 0;
        h->use_count = 1;
-       h->adding_csums = 0;
-       h->block_rsv = NULL;
-       h->orig_rsv = NULL;
-       h->aborted = 0;
-       h->qgroup_reserved = 0;
-       h->delayed_ref_elem.seq = 0;
+
        h->type = type;
-       h->allocating_chunk = false;
-       h->reloc_reserved = false;
-       h->sync = false;
+       h->can_flush_pending_bgs = true;
        INIT_LIST_HEAD(&h->qgroup_ref_list);
        INIT_LIST_HEAD(&h->new_bgs);
-       INIT_LIST_HEAD(&h->ordered);
 
        smp_mb();
        if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -541,7 +564,6 @@ again:
                h->bytes_reserved = num_bytes;
                h->reloc_reserved = reloc_reserved;
        }
-       h->qgroup_reserved = qgroup_reserved;
 
 got_it:
        btrfs_record_root_in_trans(h, root);
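
The long run of "h->member = 0" assignments deleted above becomes redundant
because of the kmem_cache_zalloc() switch earlier in this function: zalloc is
alloc with __GFP_ZERO, so the handle arrives fully zeroed and only non-zero
defaults remain. A fragment mirroring the code above:

    /* kmem_cache_zalloc(c, f) == kmem_cache_alloc(c, f | __GFP_ZERO) */
    h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
    if (!h)
            return ERR_PTR(-ENOMEM);
    /* everything is zero/NULL/false; set only the exceptions */
    h->use_count = 1;
    h->can_flush_pending_bgs = true;
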
@@ -559,20 +581,52 @@ alloc_fail:
                btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
                                        num_bytes);
 reserve_fail:
-       if (qgroup_reserved)
-               btrfs_qgroup_free(root, qgroup_reserved);
+       btrfs_qgroup_free_meta(root, qgroup_reserved);
        return ERR_PTR(ret);
 }
 
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-                                                  int num_items)
+                                                  unsigned int num_items)
 {
        return start_transaction(root, num_items, TRANS_START,
                                 BTRFS_RESERVE_FLUSH_ALL);
 }
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+                                       struct btrfs_root *root,
+                                       unsigned int num_items,
+                                       int min_factor)
+{
+       struct btrfs_trans_handle *trans;
+       u64 num_bytes;
+       int ret;
+
+       trans = btrfs_start_transaction(root, num_items);
+       if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+               return trans;
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans))
+               return trans;
+
+       num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+       ret = btrfs_cond_migrate_bytes(root->fs_info,
+                                      &root->fs_info->trans_block_rsv,
+                                      num_bytes,
+                                      min_factor);
+       if (ret) {
+               btrfs_end_transaction(trans, root);
+               return ERR_PTR(ret);
+       }
+
+       trans->block_rsv = &root->fs_info->trans_block_rsv;
+       trans->bytes_reserved = num_bytes;
+
+       return trans;
+}
 
 struct btrfs_trans_handle *btrfs_start_transaction_lflush(
-                                       struct btrfs_root *root, int num_items)
+                                       struct btrfs_root *root,
+                                       unsigned int num_items)
 {
        return start_transaction(root, num_items, TRANS_START,
                                 BTRFS_RESERVE_FLUSH_LIMIT);
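
For callers, the new btrfs_start_transaction_fallback_global_rsv() behaves
like btrfs_start_transaction() except that on -ENOSPC it retries without an
item reservation and migrates bytes from the global reserve into
trans_block_rsv, provided the global reserve is sufficiently full (min_factor
scales that threshold). A hypothetical caller, with invented item counts:

    trans = btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
    if (IS_ERR(trans))
            return PTR_ERR(trans);
    /* ... metadata updates that must make progress under ENOSPC ... */
    return btrfs_end_transaction(trans, root);
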
@@ -756,12 +810,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
 
-       if (!list_empty(&trans->ordered)) {
-               spin_lock(&info->trans_lock);
-               list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
-               spin_unlock(&info->trans_lock);
-       }
-
        trans->delayed_ref_updates = 0;
        if (!trans->sync) {
                must_run_delayed_refs =
@@ -777,21 +825,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                        must_run_delayed_refs = 2;
        }
 
-       if (trans->qgroup_reserved) {
-               /*
-                * the same root has to be passed here between start_transaction
-                * and end_transaction. Subvolume quota depends on this.
-                */
-               btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
-
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
 
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
 
+       btrfs_trans_release_chunk_metadata(trans);
+
        if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
            should_end_transaction(trans, root) &&
            ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
@@ -816,6 +857,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        atomic_dec(&cur_trans->num_writers);
        extwriter_counter_dec(cur_trans, trans->type);
 
+       /*
+        * Make sure counter is updated before we wake up waiters.
+        */
        smp_mb();
        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
@@ -1198,6 +1242,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        spin_lock(&fs_info->fs_roots_radix_lock);
                        if (err)
                                break;
+                       btrfs_qgroup_free_meta_all(root);
                }
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1290,7 +1335,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        if (pending->error)
                goto no_free_objectid;
 
-       btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+       /*
+        * Make qgroup skip the new snapshot's qgroupid, as it is
+        * accounted later by btrfs_qgroup_inherit().
+        */
+       btrfs_set_skip_qgroup(trans, objectid);
+
+       btrfs_reloc_pre_snapshot(pending, &to_reserve);
 
        if (to_reserve > 0) {
                pending->error = btrfs_block_rsv_add(root,
@@ -1298,7 +1349,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                                     to_reserve,
                                                     BTRFS_RESERVE_NO_FLUSH);
                if (pending->error)
-                       goto no_free_objectid;
+                       goto clear_skip_qgroup;
        }
 
        key.objectid = objectid;
@@ -1396,25 +1447,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                btrfs_abort_transaction(trans, root, ret);
                goto fail;
        }
-
-       /*
-        * We need to flush delayed refs in order to make sure all of our quota
-        * operations have been done before we call btrfs_qgroup_inherit.
-        */
-       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-       if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
-               goto fail;
-       }
-
-       ret = btrfs_qgroup_inherit(trans, fs_info,
-                                  root->root_key.objectid,
-                                  objectid, pending->inherit);
-       if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
-               goto fail;
-       }
-
        /* see comments in should_cow_block() */
        set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
        smp_wmb();
@@ -1497,11 +1529,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                        goto fail;
                }
        }
+
+       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
+       }
+
+       /*
+        * account qgroup counters before qgroup_inherit()
+        */
+       ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+       if (ret)
+               goto fail;
+       ret = btrfs_qgroup_account_extents(trans, fs_info);
+       if (ret)
+               goto fail;
+       ret = btrfs_qgroup_inherit(trans, fs_info,
+                                  root->root_key.objectid,
+                                  objectid, pending->inherit);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
+       }
+
 fail:
        pending->error = ret;
 dir_item_existed:
        trans->block_rsv = rsv;
        trans->bytes_reserved = 0;
+clear_skip_qgroup:
+       btrfs_clear_skip_qgroup(trans);
 no_free_objectid:
        kfree(new_root_item);
 root_item_alloc_fail:
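
The new clear_skip_qgroup label slots into the usual kernel unwind ladder
visible above: each error path jumps to the label that tears down exactly what
has been set up so far, with the labels in reverse setup order. In the
abstract, with hypothetical setup/teardown steps:

    static int setup_a(void), setup_b(void), setup_c(void);  /* hypothetical */
    static void teardown_a(void), teardown_b(void);

    static int do_setup(void)
    {
            int ret;

            ret = setup_a();
            if (ret)
                    goto out;           /* nothing to undo yet */
            ret = setup_b();
            if (ret)
                    goto undo_a;        /* only A needs unwinding */
            ret = setup_c();
            if (ret)
                    goto undo_b;
            return 0;
    undo_b:
            teardown_b();
    undo_a:
            teardown_a();
    out:
            return ret;
    }
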
@@ -1620,9 +1678,7 @@ static void do_async_commit(struct work_struct *work)
         * Tell lockdep about it.
         */
        if (ac->newtrans->type & __TRANS_FREEZABLE)
-               rwsem_acquire_read(
-                    &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-                    0, 1, _THIS_IP_);
+               __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS);
 
        current->journal_info = ac->newtrans;
 
@@ -1661,9 +1717,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
         * async commit thread will be the one to unlock it.
         */
        if (ac->newtrans->type & __TRANS_FREEZABLE)
-               rwsem_release(
-                       &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-                       1, _THIS_IP_);
+               __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS);
 
        schedule_work(&ac->work);
 
@@ -1746,25 +1800,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 }
 
 static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
-                          struct btrfs_fs_info *fs_info)
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
 {
-       struct btrfs_ordered_extent *ordered;
-
-       spin_lock(&fs_info->trans_lock);
-       while (!list_empty(&cur_trans->pending_ordered)) {
-               ordered = list_first_entry(&cur_trans->pending_ordered,
-                                          struct btrfs_ordered_extent,
-                                          trans_list);
-               list_del_init(&ordered->trans_list);
-               spin_unlock(&fs_info->trans_lock);
-
-               wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
-                                                  &ordered->flags));
-               btrfs_put_ordered_extent(ordered);
-               spin_lock(&fs_info->trans_lock);
-       }
-       spin_unlock(&fs_info->trans_lock);
+       wait_event(cur_trans->pending_wait,
+                  atomic_read(&cur_trans->pending_ordered) == 0);
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
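
The rewrite above replaces a list walk over individual ordered extents with a
single counter: producers (not shown in this file) increment
cur_trans->pending_ordered when handing off an ordered extent, the completion
path decrements it and wakes pending_wait when it reaches zero, and commit
simply sleeps until the counter drains. The generic shape of the pattern, with
illustrative names:

    #include <linux/atomic.h>
    #include <linux/wait.h>

    static atomic_t pending = ATOMIC_INIT(0);
    static DECLARE_WAIT_QUEUE_HEAD(pending_wq);

    static void submit_one(void)
    {
            atomic_inc(&pending);
            /* ... hand the work off ... */
    }

    static void complete_one(void)
    {
            /* atomic_dec_and_test() orders the decrement before the wakeup */
            if (atomic_dec_and_test(&pending))
                    wake_up(&pending_wq);
    }

    static void wait_for_all(void)
    {
            wait_event(pending_wq, atomic_read(&pending) == 0);
    }
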
@@ -1793,10 +1832,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
-       if (trans->qgroup_reserved) {
-               btrfs_qgroup_free(root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
 
        cur_trans = trans->transaction;
 
@@ -1816,7 +1851,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                return ret;
        }
 
-       if (!cur_trans->dirty_bg_run) {
+       if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
                int run_it = 0;
 
                /* this mutex is also taken before trying to set
@@ -1825,18 +1860,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                 * after extents from that block group have been
                 * allocated for cache files.  btrfs_set_block_group_ro
                 * will wait for the transaction to commit if it
-                * finds dirty_bg_run = 1
+                * finds BTRFS_TRANS_DIRTY_BG_RUN set.
                 *
-                * The dirty_bg_run flag is also used to make sure only
-                * one process starts all the block group IO.  It wouldn't
+                * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+                * only one process starts all the block group IO.  It wouldn't
                 * hurt to have more than one go through, but there's no
                 * real advantage to it either.
                 */
                mutex_lock(&root->fs_info->ro_block_group_mutex);
-               if (!cur_trans->dirty_bg_run) {
+               if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+                                     &cur_trans->flags))
                        run_it = 1;
-                       cur_trans->dirty_bg_run = 1;
-               }
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
                if (run_it)
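
Moving dirty_bg_run into a flags word lets the check-and-claim above collapse
into one atomic test_and_set_bit(): exactly one caller sees the old value 0
and proceeds to start the block group IO, and every later caller sees the bit
already set. The idiom in isolation, with a hypothetical flag and worker:

    #include <linux/bitops.h>

    #define MY_WORK_STARTED 0
    static unsigned long my_flags;

    static void start_the_work(void);   /* hypothetical */

    static void maybe_start_work(void)
    {
            /* returns the old bit; only the first caller gets 0 */
            if (!test_and_set_bit(MY_WORK_STARTED, &my_flags))
                    start_the_work();
    }
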
@@ -1848,7 +1882,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        }
 
        spin_lock(&root->fs_info->trans_lock);
-       list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
        if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
                spin_unlock(&root->fs_info->trans_lock);
                atomic_inc(&cur_trans->use_count);
@@ -1907,7 +1940,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_wait_delalloc_flush(root->fs_info);
 
-       btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+       btrfs_wait_pending_ordered(cur_trans);
 
        btrfs_scrub_pause(root);
        /*
@@ -1966,6 +1999,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                goto scrub_continue;
        }
 
+       /* Record old roots for later qgroup accounting */
+       ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info);
+       if (ret) {
+               mutex_unlock(&root->fs_info->reloc_mutex);
+               goto scrub_continue;
+       }
+
        /*
         * make sure none of the code above managed to slip in a
         * delayed item
@@ -2007,6 +2047,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         */
        btrfs_free_log_root_tree(trans, root->fs_info);
 
+       /*
+        * Since fs roots are all committed, we can get a quite accurate
+        * new_roots. So let's do quota accounting.
+        */
+       ret = btrfs_qgroup_account_extents(trans, root->fs_info);
+       if (ret < 0) {
+               mutex_unlock(&root->fs_info->tree_log_mutex);
+               mutex_unlock(&root->fs_info->reloc_mutex);
+               goto scrub_continue;
+       }
+
        ret = commit_cowonly_roots(trans, root);
        if (ret) {
                mutex_unlock(&root->fs_info->tree_log_mutex);
@@ -2057,6 +2108,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
        clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
 
+       btrfs_trans_release_chunk_metadata(trans);
+
        spin_lock(&root->fs_info->trans_lock);
        cur_trans->state = TRANS_STATE_UNBLOCKED;
        root->fs_info->running_transaction = NULL;
@@ -2067,7 +2120,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        ret = btrfs_write_and_wait_transaction(trans, root);
        if (ret) {
-               btrfs_error(root->fs_info, ret,
+               btrfs_std_error(root->fs_info, ret,
                            "Error while writing out transaction");
                mutex_unlock(&root->fs_info->tree_log_mutex);
                goto scrub_continue;
@@ -2087,7 +2140,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_finish_extent_commit(trans, root);
 
-       if (cur_trans->have_free_bgs)
+       if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
                btrfs_clear_space_info_full(root->fs_info);
 
        root->fs_info->last_trans_committed = cur_trans->transid;
@@ -2117,7 +2170,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
-       if (current != root->fs_info->transaction_kthread)
+       if (current != root->fs_info->transaction_kthread &&
+           current != root->fs_info->cleaner_kthread)
                btrfs_run_delayed_iputs(root);
 
        return ret;
@@ -2126,11 +2180,8 @@ scrub_continue:
        btrfs_scrub_continue(root);
 cleanup_transaction:
        btrfs_trans_release_metadata(trans, root);
+       btrfs_trans_release_chunk_metadata(trans);
        trans->block_rsv = NULL;
-       if (trans->qgroup_reserved) {
-               btrfs_qgroup_free(root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
        btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
        if (current->journal_info == trans)
                current->journal_info = NULL;