These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] kernel/fs/btrfs/transaction.c
index 00d18c2..be8eae8 100644
@@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 static void clear_btree_io_tree(struct extent_io_tree *tree)
 {
        spin_lock(&tree->lock);
+       /*
+        * Do a single barrier for the waitqueue_active check here; the state
+        * of the waitqueue should not change once clear_btree_io_tree is
+        * called.
+        */
+       smp_mb();
        while (!RB_EMPTY_ROOT(&tree->state)) {
                struct rb_node *node;
                struct extent_state *state;
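
The smp_mb() added above is the waker side of the unlocked waitqueue_active()
pattern: the waker must make its prior stores visible before peeking at the
wait queue, while wait_event() supplies the matching barriers on the waiter
side. A minimal sketch of the pattern, with invented names (my_wq, my_flag),
not code from this patch:

    #include <linux/wait.h>

    static DECLARE_WAIT_QUEUE_HEAD(my_wq);
    static int my_flag;

    static void my_wake(void)           /* waker */
    {
            my_flag = 1;
            smp_mb();   /* publish my_flag before the unlocked check */
            if (waitqueue_active(&my_wq))
                    wake_up(&my_wq);
    }

    static void my_wait(void)           /* waiter */
    {
            wait_event(my_wq, my_flag != 0);
    }
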
@@ -117,6 +123,18 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
                        btrfs_unpin_free_ino(root);
                clear_btree_io_tree(&root->dirty_log_pages);
        }
+
+       /* We can free old roots now. */
+       spin_lock(&trans->dropped_roots_lock);
+       while (!list_empty(&trans->dropped_roots)) {
+               root = list_first_entry(&trans->dropped_roots,
+                                       struct btrfs_root, root_list);
+               list_del_init(&root->root_list);
+               spin_unlock(&trans->dropped_roots_lock);
+               btrfs_drop_and_free_fs_root(fs_info, root);
+               spin_lock(&trans->dropped_roots_lock);
+       }
+       spin_unlock(&trans->dropped_roots_lock);
        up_write(&fs_info->commit_root_sem);
 }
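
The drain loop above uses a standard kernel idiom: detach one entry while the
spinlock is held, drop the lock across the heavyweight call (here
btrfs_drop_and_free_fs_root(), which can sleep), then retake it before testing
the list again. A generic sketch with hypothetical types and helpers:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct item { struct list_head list; };     /* hypothetical */
    static void process(struct item *it);       /* may sleep */

    static void drain(struct list_head *head, spinlock_t *lock)
    {
            spin_lock(lock);
            while (!list_empty(head)) {
                    struct item *it =
                            list_first_entry(head, struct item, list);

                    list_del_init(&it->list);   /* detach while locked */
                    spin_unlock(lock);
                    process(it);                /* lock not held here */
                    spin_lock(lock);
            }
            spin_unlock(lock);
    }
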
 
@@ -214,23 +232,22 @@ loop:
        extwriter_counter_init(cur_trans, type);
        init_waitqueue_head(&cur_trans->writer_wait);
        init_waitqueue_head(&cur_trans->commit_wait);
+       init_waitqueue_head(&cur_trans->pending_wait);
        cur_trans->state = TRANS_STATE_RUNNING;
        /*
         * One for this trans handle, one so it will live on until we
         * commit the transaction.
         */
        atomic_set(&cur_trans->use_count, 2);
-       cur_trans->have_free_bgs = 0;
+       atomic_set(&cur_trans->pending_ordered, 0);
+       cur_trans->flags = 0;
        cur_trans->start_time = get_seconds();
-       cur_trans->dirty_bg_run = 0;
+
+       memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
 
        cur_trans->delayed_refs.href_root = RB_ROOT;
+       cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
        atomic_set(&cur_trans->delayed_refs.num_entries, 0);
-       cur_trans->delayed_refs.num_heads_ready = 0;
-       cur_trans->delayed_refs.pending_csums = 0;
-       cur_trans->delayed_refs.num_heads = 0;
-       cur_trans->delayed_refs.flushing = 0;
-       cur_trans->delayed_refs.run_delayed_start = 0;
 
        /*
         * although the tree mod log is per file system and not per transaction,
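
Replacing the per-field zeroing (the deleted "cur_trans->delayed_refs.x = 0"
lines above) with a single memset() means any member added to delayed_refs
later starts zero-initialized by default, and only explicit non-default setup
remains. The shape of the idiom, with an invented struct:

    #include <linux/rbtree.h>
    #include <linux/string.h>

    struct conf { unsigned long flags; struct rb_root root; int count; };

    static void conf_init(struct conf *c)
    {
            memset(c, 0, sizeof(*c));   /* future members start zeroed too */
            c->root = RB_ROOT;          /* explicit, mirroring the patch */
    }
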
@@ -250,12 +267,14 @@ loop:
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
        INIT_LIST_HEAD(&cur_trans->pending_chunks);
        INIT_LIST_HEAD(&cur_trans->switch_commits);
-       INIT_LIST_HEAD(&cur_trans->pending_ordered);
        INIT_LIST_HEAD(&cur_trans->dirty_bgs);
        INIT_LIST_HEAD(&cur_trans->io_bgs);
+       INIT_LIST_HEAD(&cur_trans->dropped_roots);
        mutex_init(&cur_trans->cache_write_mutex);
        cur_trans->num_dirty_bgs = 0;
        spin_lock_init(&cur_trans->dirty_bgs_lock);
+       INIT_LIST_HEAD(&cur_trans->deleted_bgs);
+       spin_lock_init(&cur_trans->dropped_roots_lock);
        list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(&cur_trans->dirty_pages,
                             fs_info->btree_inode->i_mapping);
@@ -332,6 +351,24 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
 }
 
 
+void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root)
+{
+       struct btrfs_transaction *cur_trans = trans->transaction;
+
+       /* Add ourselves to the transaction dropped list */
+       spin_lock(&cur_trans->dropped_roots_lock);
+       list_add_tail(&root->root_list, &cur_trans->dropped_roots);
+       spin_unlock(&cur_trans->dropped_roots_lock);
+
+       /* Make sure we don't try to update the root at commit time */
+       spin_lock(&root->fs_info->fs_roots_radix_lock);
+       radix_tree_tag_clear(&root->fs_info->fs_roots_radix,
+                            (unsigned long)root->root_key.objectid,
+                            BTRFS_ROOT_TRANS_TAG);
+       spin_unlock(&root->fs_info->fs_roots_radix_lock);
+}
+
 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root)
 {
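
btrfs_add_dropped_root() queues the root for deferred freeing in
switch_commit_roots() and clears its radix-tree tag so that commit no longer
tries to update the dropped root. The tag API operates on (tree, index, tag)
triples; a hedged sketch with made-up names:

    #include <linux/radix-tree.h>

    static RADIX_TREE(my_roots, GFP_ATOMIC);    /* hypothetical tree */
    #define MY_DIRTY_TAG 0

    static void my_tag_example(unsigned long id, void *item)
    {
            /* an entry must exist before a tag can be set on it */
            if (radix_tree_insert(&my_roots, id, item))
                    return;
            radix_tree_tag_set(&my_roots, id, MY_DIRTY_TAG);
            /* hide it from tag-based scans without removing it */
            radix_tree_tag_clear(&my_roots, id, MY_DIRTY_TAG);
    }
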
@@ -411,8 +448,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
 }
 
 static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
-                 enum btrfs_reserve_flush_enum flush)
+start_transaction(struct btrfs_root *root, unsigned int num_items,
+                 unsigned int type, enum btrfs_reserve_flush_enum flush)
 {
        struct btrfs_trans_handle *h;
        struct btrfs_transaction *cur_trans;
@@ -442,13 +479,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
         * the appropriate flushing if need be.
         */
        if (num_items > 0 && root != root->fs_info->chunk_root) {
-               if (root->fs_info->quota_enabled &&
-                   is_fstree(root->root_key.objectid)) {
-                       qgroup_reserved = num_items * root->nodesize;
-                       ret = btrfs_qgroup_reserve(root, qgroup_reserved);
-                       if (ret)
-                               return ERR_PTR(ret);
-               }
+               qgroup_reserved = num_items * root->nodesize;
+               ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+               if (ret)
+                       return ERR_PTR(ret);
 
                num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
                /*
@@ -466,7 +500,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
                        goto reserve_fail;
        }
 again:
-       h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+       h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
        if (!h) {
                ret = -ENOMEM;
                goto alloc_fail;
@@ -507,24 +541,13 @@ again:
 
        h->transid = cur_trans->transid;
        h->transaction = cur_trans;
-       h->blocks_used = 0;
-       h->bytes_reserved = 0;
        h->root = root;
-       h->delayed_ref_updates = 0;
        h->use_count = 1;
-       h->adding_csums = 0;
-       h->block_rsv = NULL;
-       h->orig_rsv = NULL;
-       h->aborted = 0;
-       h->qgroup_reserved = 0;
-       h->delayed_ref_elem.seq = 0;
+
        h->type = type;
-       h->allocating_chunk = false;
-       h->reloc_reserved = false;
-       h->sync = false;
+       h->can_flush_pending_bgs = true;
        INIT_LIST_HEAD(&h->qgroup_ref_list);
        INIT_LIST_HEAD(&h->new_bgs);
-       INIT_LIST_HEAD(&h->ordered);
 
        smp_mb();
        if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -541,7 +564,6 @@ again:
                h->bytes_reserved = num_bytes;
                h->reloc_reserved = reloc_reserved;
        }
-       h->qgroup_reserved = qgroup_reserved;
 
 got_it:
        btrfs_record_root_in_trans(h, root);
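
The long run of "h->member = 0" assignments deleted above becomes redundant
because of the kmem_cache_zalloc() switch earlier in this function: zalloc is
alloc with __GFP_ZERO, so the handle arrives fully zeroed and only non-zero
defaults remain. A fragment mirroring the code above:

    /* kmem_cache_zalloc(c, f) == kmem_cache_alloc(c, f | __GFP_ZERO) */
    h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
    if (!h)
            return ERR_PTR(-ENOMEM);
    /* everything is zero/NULL/false; set only the exceptions */
    h->use_count = 1;
    h->can_flush_pending_bgs = true;
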
@@ -559,20 +581,52 @@ alloc_fail:
                btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
                                        num_bytes);
 reserve_fail:
-       if (qgroup_reserved)
-               btrfs_qgroup_free(root, qgroup_reserved);
+       btrfs_qgroup_free_meta(root, qgroup_reserved);
        return ERR_PTR(ret);
 }
 
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-                                                  int num_items)
+                                                  unsigned int num_items)
 {
        return start_transaction(root, num_items, TRANS_START,
                                 BTRFS_RESERVE_FLUSH_ALL);
 }
+struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
+                                       struct btrfs_root *root,
+                                       unsigned int num_items,
+                                       int min_factor)
+{
+       struct btrfs_trans_handle *trans;
+       u64 num_bytes;
+       int ret;
+
+       trans = btrfs_start_transaction(root, num_items);
+       if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+               return trans;
+
+       trans = btrfs_start_transaction(root, 0);
+       if (IS_ERR(trans))
+               return trans;
+
+       num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
+       ret = btrfs_cond_migrate_bytes(root->fs_info,
+                                      &root->fs_info->trans_block_rsv,
+                                      num_bytes,
+                                      min_factor);
+       if (ret) {
+               btrfs_end_transaction(trans, root);
+               return ERR_PTR(ret);
+       }
+
+       trans->block_rsv = &root->fs_info->trans_block_rsv;
+       trans->bytes_reserved = num_bytes;
+
+       return trans;
+}
 
 struct btrfs_trans_handle *btrfs_start_transaction_lflush(
-                                       struct btrfs_root *root, int num_items)
+                                       struct btrfs_root *root,
+                                       unsigned int num_items)
 {
        return start_transaction(root, num_items, TRANS_START,
                                 BTRFS_RESERVE_FLUSH_LIMIT);
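
For callers, the new btrfs_start_transaction_fallback_global_rsv() behaves
like btrfs_start_transaction() except that on -ENOSPC it retries without an
item reservation and migrates bytes from the global reserve into
trans_block_rsv, provided the global reserve is sufficiently full (min_factor
scales that threshold). A hypothetical caller, with invented item counts:

    trans = btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
    if (IS_ERR(trans))
            return PTR_ERR(trans);
    /* ... metadata updates that must make progress under ENOSPC ... */
    return btrfs_end_transaction(trans, root);
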
@@ -756,12 +810,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
 
-       if (!list_empty(&trans->ordered)) {
-               spin_lock(&info->trans_lock);
-               list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
-               spin_unlock(&info->trans_lock);
-       }
-
        trans->delayed_ref_updates = 0;
        if (!trans->sync) {
                must_run_delayed_refs =
@@ -777,21 +825,14 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                        must_run_delayed_refs = 2;
        }
 
-       if (trans->qgroup_reserved) {
-               /*
-                * the same root has to be passed here between start_transaction
-                * and end_transaction. Subvolume quota depends on this.
-                */
-               btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
-
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
 
        if (!list_empty(&trans->new_bgs))
                btrfs_create_pending_block_groups(trans, root);
 
+       btrfs_trans_release_chunk_metadata(trans);
+
        if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
            should_end_transaction(trans, root) &&
            ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
@@ -816,6 +857,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
        atomic_dec(&cur_trans->num_writers);
        extwriter_counter_dec(cur_trans, trans->type);
 
+       /*
+        * Make sure counter is updated before we wake up waiters.
+        */
        smp_mb();
        if (waitqueue_active(&cur_trans->writer_wait))
                wake_up(&cur_trans->writer_wait);
@@ -1198,6 +1242,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        spin_lock(&fs_info->fs_roots_radix_lock);
                        if (err)
                                break;
+                       btrfs_qgroup_free_meta_all(root);
                }
        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1290,7 +1335,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        if (pending->error)
                goto no_free_objectid;
 
-       btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+       /*
+        * Make qgroup skip the new snapshot's qgroupid, as it is
+        * accounted later by btrfs_qgroup_inherit().
+        */
+       btrfs_set_skip_qgroup(trans, objectid);
+
+       btrfs_reloc_pre_snapshot(pending, &to_reserve);
 
        if (to_reserve > 0) {
                pending->error = btrfs_block_rsv_add(root,
@@ -1298,7 +1349,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                                                     to_reserve,
                                                     BTRFS_RESERVE_NO_FLUSH);
                if (pending->error)
-                       goto no_free_objectid;
+                       goto clear_skip_qgroup;
        }
 
        key.objectid = objectid;
@@ -1396,25 +1447,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                btrfs_abort_transaction(trans, root, ret);
                goto fail;
        }
-
-       /*
-        * We need to flush delayed refs in order to make sure all of our quota
-        * operations have been done before we call btrfs_qgroup_inherit.
-        */
-       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-       if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
-               goto fail;
-       }
-
-       ret = btrfs_qgroup_inherit(trans, fs_info,
-                                  root->root_key.objectid,
-                                  objectid, pending->inherit);
-       if (ret) {
-               btrfs_abort_transaction(trans, root, ret);
-               goto fail;
-       }
-
        /* see comments in should_cow_block() */
        set_bit(BTRFS_ROOT_FORCE_COW, &root->state);
        smp_wmb();
@@ -1497,11 +1529,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
                        goto fail;
                }
        }
+
+       ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
+       }
+
+       /*
+        * account qgroup counters before qgroup_inherit()
+        */
+       ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
+       if (ret)
+               goto fail;
+       ret = btrfs_qgroup_account_extents(trans, fs_info);
+       if (ret)
+               goto fail;
+       ret = btrfs_qgroup_inherit(trans, fs_info,
+                                  root->root_key.objectid,
+                                  objectid, pending->inherit);
+       if (ret) {
+               btrfs_abort_transaction(trans, root, ret);
+               goto fail;
+       }
+
 fail:
        pending->error = ret;
 dir_item_existed:
        trans->block_rsv = rsv;
        trans->bytes_reserved = 0;
+clear_skip_qgroup:
+       btrfs_clear_skip_qgroup(trans);
 no_free_objectid:
        kfree(new_root_item);
 root_item_alloc_fail:
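
The new clear_skip_qgroup label slots into the usual kernel unwind ladder
visible above: each error path jumps to the label that tears down exactly what
has been set up so far, with the labels in reverse setup order. In the
abstract, with hypothetical setup/teardown steps:

    static int setup_a(void), setup_b(void), setup_c(void);  /* hypothetical */
    static void teardown_a(void), teardown_b(void);

    static int do_setup(void)
    {
            int ret;

            ret = setup_a();
            if (ret)
                    goto out;           /* nothing to undo yet */
            ret = setup_b();
            if (ret)
                    goto undo_a;        /* only A needs unwinding */
            ret = setup_c();
            if (ret)
                    goto undo_b;
            return 0;
    undo_b:
            teardown_b();
    undo_a:
            teardown_a();
    out:
            return ret;
    }
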
@@ -1620,9 +1678,7 @@ static void do_async_commit(struct work_struct *work)
         * Tell lockdep about it.
         */
        if (ac->newtrans->type & __TRANS_FREEZABLE)
-               rwsem_acquire_read(
-                    &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-                    0, 1, _THIS_IP_);
+               __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS);
 
        current->journal_info = ac->newtrans;
 
@@ -1661,9 +1717,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
         * async commit thread will be the one to unlock it.
         */
        if (ac->newtrans->type & __TRANS_FREEZABLE)
-               rwsem_release(
-                       &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
-                       1, _THIS_IP_);
+               __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS);
 
        schedule_work(&ac->work);
 
@@ -1746,25 +1800,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
 }
 
 static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
-                          struct btrfs_fs_info *fs_info)
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
 {
-       struct btrfs_ordered_extent *ordered;
-
-       spin_lock(&fs_info->trans_lock);
-       while (!list_empty(&cur_trans->pending_ordered)) {
-               ordered = list_first_entry(&cur_trans->pending_ordered,
-                                          struct btrfs_ordered_extent,
-                                          trans_list);
-               list_del_init(&ordered->trans_list);
-               spin_unlock(&fs_info->trans_lock);
-
-               wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
-                                                  &ordered->flags));
-               btrfs_put_ordered_extent(ordered);
-               spin_lock(&fs_info->trans_lock);
-       }
-       spin_unlock(&fs_info->trans_lock);
+       wait_event(cur_trans->pending_wait,
+                  atomic_read(&cur_trans->pending_ordered) == 0);
 }
 
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
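
The rewrite above replaces a list walk over individual ordered extents with a
single counter: producers (not shown in this file) increment
cur_trans->pending_ordered when handing off an ordered extent, the completion
path decrements it and wakes pending_wait when it reaches zero, and commit
simply sleeps until the counter drains. The generic shape of the pattern, with
illustrative names:

    #include <linux/atomic.h>
    #include <linux/wait.h>

    static atomic_t pending = ATOMIC_INIT(0);
    static DECLARE_WAIT_QUEUE_HEAD(pending_wq);

    static void submit_one(void)
    {
            atomic_inc(&pending);
            /* ... hand the work off ... */
    }

    static void complete_one(void)
    {
            /* atomic_dec_and_test() orders the decrement before the wakeup */
            if (atomic_dec_and_test(&pending))
                    wake_up(&pending_wq);
    }

    static void wait_for_all(void)
    {
            wait_event(pending_wq, atomic_read(&pending) == 0);
    }
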
@@ -1793,10 +1832,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_trans_release_metadata(trans, root);
        trans->block_rsv = NULL;
-       if (trans->qgroup_reserved) {
-               btrfs_qgroup_free(root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
 
        cur_trans = trans->transaction;
 
@@ -1816,7 +1851,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                return ret;
        }
 
-       if (!cur_trans->dirty_bg_run) {
+       if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
                int run_it = 0;
 
                /* this mutex is also taken before trying to set
@@ -1825,18 +1860,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                 * after extents from that block group have been
                 * allocated for cache files.  btrfs_set_block_group_ro
                 * will wait for the transaction to commit if it
-                * finds dirty_bg_run = 1
+                * finds BTRFS_TRANS_DIRTY_BG_RUN set.
                 *
-                * The dirty_bg_run flag is also used to make sure only
-                * one process starts all the block group IO.  It wouldn't
+                * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+                * only one process starts all the block group IO.  It wouldn't
                 * hurt to have more than one go through, but there's no
                 * real advantage to it either.
                 */
                mutex_lock(&root->fs_info->ro_block_group_mutex);
-               if (!cur_trans->dirty_bg_run) {
+               if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+                                     &cur_trans->flags))
                        run_it = 1;
-                       cur_trans->dirty_bg_run = 1;
-               }
                mutex_unlock(&root->fs_info->ro_block_group_mutex);
 
                if (run_it)
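
Moving dirty_bg_run into a flags word lets the check-and-claim above collapse
into one atomic test_and_set_bit(): exactly one caller sees the old value 0
and proceeds to start the block group IO, and every later caller sees the bit
already set. The idiom in isolation, with a hypothetical flag and worker:

    #include <linux/bitops.h>

    #define MY_WORK_STARTED 0
    static unsigned long my_flags;

    static void start_the_work(void);   /* hypothetical */

    static void maybe_start_work(void)
    {
            /* returns the old bit; only the first caller gets 0 */
            if (!test_and_set_bit(MY_WORK_STARTED, &my_flags))
                    start_the_work();
    }
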
@@ -1848,7 +1882,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        }
 
        spin_lock(&root->fs_info->trans_lock);
-       list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
        if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
                spin_unlock(&root->fs_info->trans_lock);
                atomic_inc(&cur_trans->use_count);
@@ -1907,7 +1940,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_wait_delalloc_flush(root->fs_info);
 
-       btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+       btrfs_wait_pending_ordered(cur_trans);
 
        btrfs_scrub_pause(root);
        /*
@@ -1966,6 +1999,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                goto scrub_continue;
        }
 
+       /* Record old roots for later qgroup accounting */
+       ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info);
+       if (ret) {
+               mutex_unlock(&root->fs_info->reloc_mutex);
+               goto scrub_continue;
+       }
+
        /*
         * make sure none of the code above managed to slip in a
         * delayed item
@@ -2007,6 +2047,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
         */
        btrfs_free_log_root_tree(trans, root->fs_info);
 
+       /*
+        * Since fs roots are all committed, we can get a quite accurate
+        * new_roots. So let's do quota accounting.
+        */
+       ret = btrfs_qgroup_account_extents(trans, root->fs_info);
+       if (ret < 0) {
+               mutex_unlock(&root->fs_info->tree_log_mutex);
+               mutex_unlock(&root->fs_info->reloc_mutex);
+               goto scrub_continue;
+       }
+
        ret = commit_cowonly_roots(trans, root);
        if (ret) {
                mutex_unlock(&root->fs_info->tree_log_mutex);
@@ -2057,6 +2108,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags);
        clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags);
 
+       btrfs_trans_release_chunk_metadata(trans);
+
        spin_lock(&root->fs_info->trans_lock);
        cur_trans->state = TRANS_STATE_UNBLOCKED;
        root->fs_info->running_transaction = NULL;
@@ -2067,7 +2120,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        ret = btrfs_write_and_wait_transaction(trans, root);
        if (ret) {
-               btrfs_error(root->fs_info, ret,
+               btrfs_std_error(root->fs_info, ret,
                            "Error while writing out transaction");
                mutex_unlock(&root->fs_info->tree_log_mutex);
                goto scrub_continue;
@@ -2087,7 +2140,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        btrfs_finish_extent_commit(trans, root);
 
-       if (cur_trans->have_free_bgs)
+       if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
                btrfs_clear_space_info_full(root->fs_info);
 
        root->fs_info->last_trans_committed = cur_trans->transid;
@@ -2117,7 +2170,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
        kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
-       if (current != root->fs_info->transaction_kthread)
+       if (current != root->fs_info->transaction_kthread &&
+           current != root->fs_info->cleaner_kthread)
                btrfs_run_delayed_iputs(root);
 
        return ret;
@@ -2126,11 +2180,8 @@ scrub_continue:
        btrfs_scrub_continue(root);
 cleanup_transaction:
        btrfs_trans_release_metadata(trans, root);
+       btrfs_trans_release_chunk_metadata(trans);
        trans->block_rsv = NULL;
-       if (trans->qgroup_reserved) {
-               btrfs_qgroup_free(root, trans->qgroup_reserved);
-               trans->qgroup_reserved = 0;
-       }
        btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
        if (current->journal_info == trans)
                current->journal_info = NULL;