These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] kernel/block/blk-cgroup.c
index 6817e28..5a37188 100644
@@ -9,29 +9,46 @@
  *
  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
  *                   Nauman Rafique <nauman@google.com>
+ *
+ * For policy-specific per-blkcg data:
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *                    Arianna Avanzini <avanzini.arianna@gmail.com>
  */
 #include <linux/ioprio.h>
 #include <linux/kdev_t.h>
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/slab.h>
 #include <linux/genhd.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
-#include "blk-cgroup.h"
+#include <linux/ctype.h>
+#include <linux/blk-cgroup.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
 
+/*
+ * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
+ * blkcg_pol_register_mutex nests outside of it and synchronizes entire
+ * policy [un]register operations including cgroup file additions /
+ * removals.  Putting cgroup file registration outside blkcg_pol_mutex
+ * allows grabbing it from cgroup callbacks.
+ */
+static DEFINE_MUTEX(blkcg_pol_register_mutex);
 static DEFINE_MUTEX(blkcg_pol_mutex);
 
-struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
-                           .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
+struct blkcg blkcg_root;
 EXPORT_SYMBOL_GPL(blkcg_root);
 
+struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
+static LIST_HEAD(all_blkcgs);          /* protected by blkcg_pol_mutex */
+
 static bool blkcg_policy_enabled(struct request_queue *q,
                                 const struct blkcg_policy *pol)
 {
@@ -52,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg)
                return;
 
        for (i = 0; i < BLKCG_MAX_POLS; i++)
-               kfree(blkg->pd[i]);
+               if (blkg->pd[i])
+                       blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
 
-       blk_exit_rl(&blkg->rl);
+       if (blkg->blkcg != &blkcg_root)
+               blk_exit_rl(&blkg->rl);
+
+       blkg_rwstat_exit(&blkg->stat_ios);
+       blkg_rwstat_exit(&blkg->stat_bytes);
        kfree(blkg);
 }
 
@@ -77,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
        if (!blkg)
                return NULL;
 
+       if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
+           blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+               goto err_free;
+
        blkg->q = q;
        INIT_LIST_HEAD(&blkg->q_node);
        blkg->blkcg = blkcg;
@@ -97,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
                        continue;
 
                /* alloc per-policy data and attach it to blkg */
-               pd = kzalloc_node(pol->pd_size, gfp_mask, q->node);
+               pd = pol->pd_alloc_fn(gfp_mask, q->node);
                if (!pd)
                        goto err_free;
 
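With this series a policy supplies its own allocator and destructor for per-blkg data: blkg_alloc() calls pol->pd_alloc_fn() where it used to kzalloc() pol->pd_size bytes, and blkg_free() hands the data back to pol->pd_free_fn(). A minimal sketch of such a pair for a hypothetical policy (the example_* names are illustrative, not part of this patch):

#include <linux/slab.h>
#include <linux/blk-cgroup.h>

/* hypothetical per-blkg state; the blkg_policy_data is embedded in it */
struct example_grp {
        struct blkg_policy_data pd;
        u64 nr_dispatched;
};

static struct blkg_policy_data *example_pd_alloc(gfp_t gfp, int node)
{
        struct example_grp *eg = kzalloc_node(sizeof(*eg), gfp, node);

        return eg ? &eg->pd : NULL;     /* NULL makes blkg_alloc() bail out */
}

static void example_pd_free(struct blkg_policy_data *pd)
{
        kfree(container_of(pd, struct example_grp, pd));
}

static struct blkcg_policy example_policy = {
        .pd_alloc_fn    = example_pd_alloc,
        .pd_free_fn     = example_pd_free,
};

The later sketches below reuse these example_* names.
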
@@ -113,26 +139,11 @@ err_free:
        return NULL;
 }
 
-/**
- * __blkg_lookup - internal version of blkg_lookup()
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- * @update_hint: whether to update lookup hint with the result or not
- *
- * This is internal version and shouldn't be used by policy
- * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
- * @q's bypass state.  If @update_hint is %true, the caller should be
- * holding @q->queue_lock and lookup hint is updated on success.
- */
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-                              bool update_hint)
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+                                     struct request_queue *q, bool update_hint)
 {
        struct blkcg_gq *blkg;
 
-       blkg = rcu_dereference(blkcg->blkg_hint);
-       if (blkg && blkg->q == q)
-               return blkg;
-
        /*
         * Hint didn't match.  Look up from the radix tree.  Note that the
         * hint can only be updated under queue_lock as otherwise @blkg
@@ -150,35 +161,18 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
 
        return NULL;
 }
-
-/**
- * blkg_lookup - lookup blkg for the specified blkcg - q pair
- * @blkcg: blkcg of interest
- * @q: request_queue of interest
- *
- * Lookup blkg for the @blkcg - @q pair.  This function should be called
- * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
- * - see blk_queue_bypass_start() for details.
- */
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q)
-{
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       if (unlikely(blk_queue_bypass(q)))
-               return NULL;
-       return __blkg_lookup(blkcg, q, false);
-}
-EXPORT_SYMBOL_GPL(blkg_lookup);
+EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
 
 /*
  * If @new_blkg is %NULL, this function tries to allocate a new one as
- * necessary using %GFP_ATOMIC.  @new_blkg is always consumed on return.
+ * necessary using %GFP_NOWAIT.  @new_blkg is always consumed on return.
  */
 static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                                    struct request_queue *q,
                                    struct blkcg_gq *new_blkg)
 {
        struct blkcg_gq *blkg;
+       struct bdi_writeback_congested *wb_congested;
        int i, ret;
 
        WARN_ON_ONCE(!rcu_read_lock_held());
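
The lookup fast path is not gone, it just leaves this file: blkg_lookup() and __blkg_lookup() become inlines in <linux/blk-cgroup.h> that try the root blkg and the per-blkcg hint first and only fall back to the blkg_lookup_slowpath() exported above. Roughly, as a reconstructed sketch of the header-side counterpart (see include/linux/blk-cgroup.h for the authoritative version):

static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
                                             struct request_queue *q,
                                             bool update_hint)
{
        struct blkcg_gq *blkg;

        if (blkcg == &blkcg_root)
                return q->root_blkg;

        blkg = rcu_dereference(blkcg->blkg_hint);
        if (blkg && blkg->q == q)
                return blkg;

        return blkg_lookup_slowpath(blkcg, q, update_hint);
}

static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
                                           struct request_queue *q)
{
        WARN_ON_ONCE(!rcu_read_lock_held());

        if (unlikely(blk_queue_bypass(q)))
                return NULL;
        return __blkg_lookup(blkcg, q, false);
}
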
@@ -186,26 +180,34 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
 
        /* blkg holds a reference to blkcg */
        if (!css_tryget_online(&blkcg->css)) {
-               ret = -EINVAL;
+               ret = -ENODEV;
                goto err_free_blkg;
        }
 
+       wb_congested = wb_congested_get_create(&q->backing_dev_info,
+                                              blkcg->css.id, GFP_NOWAIT);
+       if (!wb_congested) {
+               ret = -ENOMEM;
+               goto err_put_css;
+       }
+
        /* allocate */
        if (!new_blkg) {
-               new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
+               new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
-                       goto err_put_css;
+                       goto err_put_congested;
                }
        }
        blkg = new_blkg;
+       blkg->wb_congested = wb_congested;
 
        /* link parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                if (WARN_ON_ONCE(!blkg->parent)) {
-                       ret = -EINVAL;
-                       goto err_put_css;
+                       ret = -ENODEV;
+                       goto err_put_congested;
                }
                blkg_get(blkg->parent);
        }
@@ -215,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                struct blkcg_policy *pol = blkcg_policy[i];
 
                if (blkg->pd[i] && pol->pd_init_fn)
-                       pol->pd_init_fn(blkg);
+                       pol->pd_init_fn(blkg->pd[i]);
        }
 
        /* insert */
@@ -229,24 +231,21 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                        struct blkcg_policy *pol = blkcg_policy[i];
 
                        if (blkg->pd[i] && pol->pd_online_fn)
-                               pol->pd_online_fn(blkg);
+                               pol->pd_online_fn(blkg->pd[i]);
                }
        }
        blkg->online = true;
        spin_unlock(&blkcg->lock);
 
-       if (!ret) {
-               if (blkcg == &blkcg_root) {
-                       q->root_blkg = blkg;
-                       q->root_rl.blkg = blkg;
-               }
+       if (!ret)
                return blkg;
-       }
 
        /* @blkg failed fully initialized, use the usual release path */
        blkg_put(blkg);
        return ERR_PTR(ret);
 
+err_put_congested:
+       wb_congested_put(wb_congested);
 err_put_css:
        css_put(&blkcg->css);
 err_free_blkg:
@@ -281,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
         * we shouldn't allow anything to go through for a bypassing queue.
         */
        if (unlikely(blk_queue_bypass(q)))
-               return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY);
+               return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
 
        blkg = __blkg_lookup(blkcg, q, true);
        if (blkg)
@@ -305,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                        return blkg;
        }
 }
-EXPORT_SYMBOL_GPL(blkg_lookup_create);
 
 static void blkg_destroy(struct blkcg_gq *blkg)
 {
        struct blkcg *blkcg = blkg->blkcg;
+       struct blkcg_gq *parent = blkg->parent;
        int i;
 
        lockdep_assert_held(blkg->q->queue_lock);
@@ -323,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg)
                struct blkcg_policy *pol = blkcg_policy[i];
 
                if (blkg->pd[i] && pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
+                       pol->pd_offline_fn(blkg->pd[i]);
        }
+
+       if (parent) {
+               blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
+               blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
+       }
+
        blkg->online = false;
 
        radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -339,15 +344,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);
 
-       /*
-        * If root blkg is destroyed.  Just clear the pointer since root_rl
-        * does not take reference on root blkg.
-        */
-       if (blkcg == &blkcg_root) {
-               blkg->q->root_blkg = NULL;
-               blkg->q->root_rl.blkg = NULL;
-       }
-
        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
@@ -374,6 +370,9 @@ static void blkg_destroy_all(struct request_queue *q)
                blkg_destroy(blkg);
                spin_unlock(&blkcg->lock);
        }
+
+       q->root_blkg = NULL;
+       q->root_rl.blkg = NULL;
 }
 
 /*
@@ -387,21 +386,14 @@ static void blkg_destroy_all(struct request_queue *q)
 void __blkg_release_rcu(struct rcu_head *rcu_head)
 {
        struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head);
-       int i;
-
-       /* tell policies that this one is being freed */
-       for (i = 0; i < BLKCG_MAX_POLS; i++) {
-               struct blkcg_policy *pol = blkcg_policy[i];
-
-               if (blkg->pd[i] && pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-       }
 
        /* release the blkcg and parent blkg refs this blkg has been holding */
        css_put(&blkg->blkcg->css);
        if (blkg->parent)
                blkg_put(blkg->parent);
 
+       wb_congested_put(blkg->wb_congested);
+
        blkg_free(blkg);
 }
 EXPORT_SYMBOL_GPL(__blkg_release_rcu);
@@ -448,20 +440,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
        struct blkcg_gq *blkg;
        int i;
 
-       /*
-        * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex
-        * which ends up putting cgroup's internal cgroup_tree_mutex under
-        * it; however, cgroup_tree_mutex is nested above cgroup file
-        * active protection and grabbing blkcg_pol_mutex from a cgroup
-        * file operation creates a possible circular dependency.  cgroup
-        * internal locking is planned to go through further simplification
-        * and this issue should go away soon.  For now, let's trylock
-        * blkcg_pol_mutex and restart the write on failure.
-        *
-        * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com
-        */
-       if (!mutex_trylock(&blkcg_pol_mutex))
-               return restart_syscall();
+       mutex_lock(&blkcg_pol_mutex);
        spin_lock_irq(&blkcg->lock);
 
        /*
@@ -470,12 +449,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
         * anyway.  If you get hit by a race, retry.
         */
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+               blkg_rwstat_reset(&blkg->stat_bytes);
+               blkg_rwstat_reset(&blkg->stat_ios);
+
                for (i = 0; i < BLKCG_MAX_POLS; i++) {
                        struct blkcg_policy *pol = blkcg_policy[i];
 
-                       if (blkcg_policy_enabled(blkg->q, pol) &&
-                           pol->pd_reset_stats_fn)
-                               pol->pd_reset_stats_fn(blkg);
+                       if (blkg->pd[i] && pol->pd_reset_stats_fn)
+                               pol->pd_reset_stats_fn(blkg->pd[i]);
                }
        }
 
@@ -484,13 +465,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
        return 0;
 }
 
-static const char *blkg_dev_name(struct blkcg_gq *blkg)
+const char *blkg_dev_name(struct blkcg_gq *blkg)
 {
        /* some drivers (floppy) instantiate a queue w/o disk registered */
        if (blkg->q->backing_dev_info.dev)
                return dev_name(blkg->q->backing_dev_info.dev);
        return NULL;
 }
+EXPORT_SYMBOL_GPL(blkg_dev_name);
 
 /**
  * blkcg_print_blkgs - helper for printing per-blkg data
@@ -579,9 +561,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 
        for (i = 0; i < BLKG_RWSTAT_NR; i++)
                seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
-                          (unsigned long long)rwstat->cnt[i]);
+                          (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
 
-       v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE];
+       v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
+               atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]);
        seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
        return v;
 }
@@ -618,31 +601,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 }
 EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
 
+static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
+                                   struct blkg_policy_data *pd, int off)
+{
+       struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
+
+       return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_bytes.
+ * cftype->private must be set to the blkcg_policy.
+ */
+int blkg_print_stat_bytes(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_bytes), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
+
+/**
+ * blkg_print_stat_ios - seq_show callback for blkg->stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ *
+ * To be used as cftype->seq_show to print blkg->stat_ios.  cftype->private
+ * must be set to the blkcg_policy.
+ */
+int blkg_print_stat_ios(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_ios), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
+
+static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
+                                             struct blkg_policy_data *pd,
+                                             int off)
+{
+       struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
+                                                             NULL, off);
+       return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+
+/**
+ * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field_recursive,
+                         (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_bytes), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
+
+/**
+ * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
+ * @sf: seq_file to print to
+ * @v: unused
+ */
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
+{
+       blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+                         blkg_prfill_rwstat_field_recursive,
+                         (void *)seq_cft(sf)->private,
+                         offsetof(struct blkcg_gq, stat_ios), true);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
+
 /**
  * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_stat
+ * @off: offset to the blkg_stat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_stat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
  *
- * Collect the blkg_stat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
+ * at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off)
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+                           struct blkcg_policy *pol, int off)
 {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        u64 sum = 0;
 
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
 
        rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_stat *stat = (void *)pos_pd + off;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_stat *stat;
+
+               if (!pos_blkg->online)
+                       continue;
 
-               if (pos_blkg->online)
-                       sum += blkg_stat_read(stat);
+               if (pol)
+                       stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       stat = (void *)blkg + off;
+
+               sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
        }
        rcu_read_unlock();
 
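With cftype->private carrying the policy, these helpers let a policy expose the core stat_bytes/stat_ios counters (and their recursive sums) without keeping duplicate counters of its own. Hypothetical legacy-hierarchy entries wired to them, reusing the example_policy sketch from above:

static struct cftype example_legacy_files[] = {
        {
                .name = "example.io_service_bytes",
                .private = (unsigned long)&example_policy,
                .seq_show = blkg_print_stat_bytes,
        },
        {
                .name = "example.io_serviced",
                .private = (unsigned long)&example_policy,
                .seq_show = blkg_print_stat_ios,
        },
        {
                .name = "example.io_service_bytes_recursive",
                .private = (unsigned long)&example_policy,
                .seq_show = blkg_print_stat_bytes_recursive,
        },
        { }     /* terminate */
};

blkcg_print_blkgs() receives that policy pointer and only prints blkgs on queues where the policy is enabled.
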
@@ -652,37 +726,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
 
 /**
  * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts.  The caller must be holding the
+ * queue lock for online tests.
  *
- * Collect the blkg_rwstat specified by @off from @pd and all its online
- * descendants and return the sum.  The caller must be holding the queue
- * lock for online tests.
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
  */
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-                                            int off)
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+                                            struct blkcg_policy *pol, int off)
 {
-       struct blkcg_policy *pol = blkcg_policy[pd->plid];
        struct blkcg_gq *pos_blkg;
        struct cgroup_subsys_state *pos_css;
        struct blkg_rwstat sum = { };
        int i;
 
-       lockdep_assert_held(pd->blkg->q->queue_lock);
+       lockdep_assert_held(blkg->q->queue_lock);
 
        rcu_read_lock();
-       blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) {
-               struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol);
-               struct blkg_rwstat *rwstat = (void *)pos_pd + off;
-               struct blkg_rwstat tmp;
+       blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+               struct blkg_rwstat *rwstat;
 
                if (!pos_blkg->online)
                        continue;
 
-               tmp = blkg_rwstat_read(rwstat);
+               if (pol)
+                       rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+               else
+                       rwstat = (void *)pos_blkg + off;
 
                for (i = 0; i < BLKG_RWSTAT_NR; i++)
-                       sum.cnt[i] += tmp.cnt[i];
+                       atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
+                               percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
+                               &sum.aux_cnt[i]);
        }
        rcu_read_unlock();
 
@@ -698,29 +778,34 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
  * @ctx: blkg_conf_ctx to be filled
  *
  * Parse per-blkg config update from @input and initialize @ctx with the
- * result.  @ctx->blkg points to the blkg to be updated and @ctx->v the new
- * value.  This function returns with RCU read lock and queue lock held and
- * must be paired with blkg_conf_finish().
+ * result.  @ctx->blkg points to the blkg to be updated and @ctx->body the
+ * part of @input following MAJ:MIN.  This function returns with RCU read
+ * lock and queue lock held and must be paired with blkg_conf_finish().
  */
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-                  const char *input, struct blkg_conf_ctx *ctx)
+                  char *input, struct blkg_conf_ctx *ctx)
        __acquires(rcu) __acquires(disk->queue->queue_lock)
 {
        struct gendisk *disk;
        struct blkcg_gq *blkg;
        unsigned int major, minor;
-       unsigned long long v;
-       int part, ret;
+       int key_len, part, ret;
+       char *body;
+
+       if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
+               return -EINVAL;
 
-       if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3)
+       body = input + key_len;
+       if (!isspace(*body))
                return -EINVAL;
+       body = skip_spaces(body);
 
        disk = get_gendisk(MKDEV(major, minor), &part);
        if (!disk)
-               return -EINVAL;
+               return -ENODEV;
        if (part) {
                put_disk(disk);
-               return -EINVAL;
+               return -ENODEV;
        }
 
        rcu_read_lock();
@@ -729,7 +814,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
        if (blkcg_policy_enabled(disk->queue, pol))
                blkg = blkg_lookup_create(blkcg, disk->queue);
        else
-               blkg = ERR_PTR(-EINVAL);
+               blkg = ERR_PTR(-EOPNOTSUPP);
 
        if (IS_ERR(blkg)) {
                ret = PTR_ERR(blkg);
@@ -751,7 +836,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 
        ctx->disk = disk;
        ctx->blkg = blkg;
-       ctx->v = v;
+       ctx->body = body;
        return 0;
 }
 EXPORT_SYMBOL_GPL(blkg_conf_prep);
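
blkg_conf_prep() now does the device parsing itself: it consumes the leading MAJ:MIN token and returns the rest of the input via ctx->body instead of a pre-parsed u64 in ctx->v, so each policy interprets its own value(s). A hypothetical cftype write handler following the new contract (again, the example_* names are illustrative only):

static ssize_t example_set_limit(struct kernfs_open_file *of,
                                 char *buf, size_t nbytes, loff_t off)
{
        struct blkcg *blkcg = css_to_blkcg(of_css(of));
        struct blkg_conf_ctx ctx;
        u64 v;
        int ret;

        ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
        if (ret)
                return ret;

        ret = -EINVAL;
        if (sscanf(ctx.body, "%llu", &v) == 1) {
                /* apply @v to ctx.blkg's per-policy data here */
                ret = 0;
        }

        blkg_conf_finish(&ctx);
        return ret ?: nbytes;
}

Such a handler would be hooked up through a cftype's .write callback.
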
@@ -772,7 +857,55 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
 }
 EXPORT_SYMBOL_GPL(blkg_conf_finish);
 
+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{
+       struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+       struct blkcg_gq *blkg;
+
+       rcu_read_lock();
+
+       hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+               const char *dname;
+               struct blkg_rwstat rwstat;
+               u64 rbytes, wbytes, rios, wios;
+
+               dname = blkg_dev_name(blkg);
+               if (!dname)
+                       continue;
+
+               spin_lock_irq(blkg->q->queue_lock);
+
+               rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_bytes));
+               rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+               wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+               rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+                                       offsetof(struct blkcg_gq, stat_ios));
+               rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+               wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+               spin_unlock_irq(blkg->q->queue_lock);
+
+               if (rbytes || wbytes || rios || wios)
+                       seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
+                                  dname, rbytes, wbytes, rios, wios);
+       }
+
+       rcu_read_unlock();
+       return 0;
+}
+
 struct cftype blkcg_files[] = {
+       {
+               .name = "stat",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = blkcg_print_stat,
+       },
+       { }     /* terminate */
+};
+
+struct cftype blkcg_legacy_files[] = {
        {
                .name = "reset_stats",
                .write_u64 = blkcg_reset_stats,
@@ -813,38 +946,91 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
        }
 
        spin_unlock_irq(&blkcg->lock);
+
+       wb_blkcg_offline(blkcg);
 }
 
 static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
        struct blkcg *blkcg = css_to_blkcg(css);
+       int i;
+
+       mutex_lock(&blkcg_pol_mutex);
+
+       list_del(&blkcg->all_blkcgs_node);
+
+       for (i = 0; i < BLKCG_MAX_POLS; i++)
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
 
-       if (blkcg != &blkcg_root)
-               kfree(blkcg);
+       mutex_unlock(&blkcg_pol_mutex);
+
+       kfree(blkcg);
 }
 
 static struct cgroup_subsys_state *
 blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
 {
        struct blkcg *blkcg;
+       struct cgroup_subsys_state *ret;
+       int i;
+
+       mutex_lock(&blkcg_pol_mutex);
 
        if (!parent_css) {
                blkcg = &blkcg_root;
-               goto done;
+       } else {
+               blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
+               if (!blkcg) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto free_blkcg;
+               }
        }
 
-       blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
-       if (!blkcg)
-               return ERR_PTR(-ENOMEM);
+       for (i = 0; i < BLKCG_MAX_POLS ; i++) {
+               struct blkcg_policy *pol = blkcg_policy[i];
+               struct blkcg_policy_data *cpd;
+
+               /*
+                * If the policy hasn't been attached yet, wait for it
+                * to be attached before doing anything else. Otherwise,
+                * check if the policy requires any specific per-cgroup
+                * data: if it does, allocate and initialize it.
+                */
+               if (!pol || !pol->cpd_alloc_fn)
+                       continue;
+
+               cpd = pol->cpd_alloc_fn(GFP_KERNEL);
+               if (!cpd) {
+                       ret = ERR_PTR(-ENOMEM);
+                       goto free_pd_blkcg;
+               }
+               blkcg->cpd[i] = cpd;
+               cpd->blkcg = blkcg;
+               cpd->plid = i;
+               if (pol->cpd_init_fn)
+                       pol->cpd_init_fn(cpd);
+       }
 
-       blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
-       blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
-done:
        spin_lock_init(&blkcg->lock);
-       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
+       INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
+#ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
+       list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
 
+       mutex_unlock(&blkcg_pol_mutex);
        return &blkcg->css;
+
+free_pd_blkcg:
+       for (i--; i >= 0; i--)
+               if (blkcg->cpd[i])
+                       blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
+free_blkcg:
+       kfree(blkcg);
+       mutex_unlock(&blkcg_pol_mutex);
+       return ret;
 }
 
 /**
@@ -859,9 +1045,45 @@ done:
  */
 int blkcg_init_queue(struct request_queue *q)
 {
-       might_sleep();
+       struct blkcg_gq *new_blkg, *blkg;
+       bool preloaded;
+       int ret;
+
+       new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+       if (!new_blkg)
+               return -ENOMEM;
+
+       preloaded = !radix_tree_preload(GFP_KERNEL);
 
-       return blk_throtl_init(q);
+       /*
+        * Make sure the root blkg exists and count the existing blkgs.  As
+        * @q is bypassing at this point, blkg_lookup_create() can't be
+        * used.  Open code insertion.
+        */
+       rcu_read_lock();
+       spin_lock_irq(q->queue_lock);
+       blkg = blkg_create(&blkcg_root, q, new_blkg);
+       spin_unlock_irq(q->queue_lock);
+       rcu_read_unlock();
+
+       if (preloaded)
+               radix_tree_preload_end();
+
+       if (IS_ERR(blkg)) {
+               blkg_free(new_blkg);
+               return PTR_ERR(blkg);
+       }
+
+       q->root_blkg = blkg;
+       q->root_rl.blkg = blkg;
+
+       ret = blk_throtl_init(q);
+       if (ret) {
+               spin_lock_irq(q->queue_lock);
+               blkg_destroy_all(q);
+               spin_unlock_irq(q->queue_lock);
+       }
+       return ret;
 }
 
 /**
@@ -905,15 +1127,15 @@ void blkcg_exit_queue(struct request_queue *q)
  * of the main cic data structures.  For now we allow a task to change
  * its cgroup only if it's the only owner of its ioc.
  */
-static int blkcg_can_attach(struct cgroup_subsys_state *css,
-                           struct cgroup_taskset *tset)
+static int blkcg_can_attach(struct cgroup_taskset *tset)
 {
        struct task_struct *task;
+       struct cgroup_subsys_state *dst_css;
        struct io_context *ioc;
        int ret = 0;
 
        /* task_lock() is needed to avoid races with exit_io_context() */
-       cgroup_taskset_for_each(task, tset) {
+       cgroup_taskset_for_each(task, dst_css, tset) {
                task_lock(task);
                ioc = task->io_context;
                if (ioc && atomic_read(&ioc->nr_tasks) > 1)
@@ -925,12 +1147,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
        return ret;
 }
 
-struct cgroup_subsys blkio_cgrp_subsys = {
+static void blkcg_bind(struct cgroup_subsys_state *root_css)
+{
+       int i;
+
+       mutex_lock(&blkcg_pol_mutex);
+
+       for (i = 0; i < BLKCG_MAX_POLS; i++) {
+               struct blkcg_policy *pol = blkcg_policy[i];
+               struct blkcg *blkcg;
+
+               if (!pol || !pol->cpd_bind_fn)
+                       continue;
+
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
+                       if (blkcg->cpd[pol->plid])
+                               pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
+       }
+       mutex_unlock(&blkcg_pol_mutex);
+}
+
+struct cgroup_subsys io_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_offline = blkcg_css_offline,
        .css_free = blkcg_css_free,
        .can_attach = blkcg_can_attach,
-       .legacy_cftypes = blkcg_files,
+       .bind = blkcg_bind,
+       .dfl_cftypes = blkcg_files,
+       .legacy_cftypes = blkcg_legacy_files,
+       .legacy_name = "blkio",
 #ifdef CONFIG_MEMCG
        /*
         * This ensures that, if available, memcg is automatically enabled
@@ -940,7 +1185,7 @@ struct cgroup_subsys blkio_cgrp_subsys = {
        .depends_on = 1 << memory_cgrp_id,
 #endif
 };
-EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
+EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
 /**
  * blkcg_activate_policy - activate a blkcg policy on a request_queue
@@ -961,96 +1206,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
 int blkcg_activate_policy(struct request_queue *q,
                          const struct blkcg_policy *pol)
 {
-       LIST_HEAD(pds);
-       struct blkcg_gq *blkg, *new_blkg;
-       struct blkg_policy_data *pd, *n;
-       int cnt = 0, ret;
-       bool preloaded;
+       struct blkg_policy_data *pd_prealloc = NULL;
+       struct blkcg_gq *blkg;
+       int ret;
 
        if (blkcg_policy_enabled(q, pol))
                return 0;
 
-       /* preallocations for root blkg */
-       new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
-       if (!new_blkg)
-               return -ENOMEM;
-
        blk_queue_bypass_start(q);
-
-       preloaded = !radix_tree_preload(GFP_KERNEL);
-
-       /*
-        * Make sure the root blkg exists and count the existing blkgs.  As
-        * @q is bypassing at this point, blkg_lookup_create() can't be
-        * used.  Open code it.
-        */
-       spin_lock_irq(q->queue_lock);
-
-       rcu_read_lock();
-       blkg = __blkg_lookup(&blkcg_root, q, false);
-       if (blkg)
-               blkg_free(new_blkg);
-       else
-               blkg = blkg_create(&blkcg_root, q, new_blkg);
-       rcu_read_unlock();
-
-       if (preloaded)
-               radix_tree_preload_end();
-
-       if (IS_ERR(blkg)) {
-               ret = PTR_ERR(blkg);
-               goto out_unlock;
-       }
-
-       list_for_each_entry(blkg, &q->blkg_list, q_node)
-               cnt++;
-
-       spin_unlock_irq(q->queue_lock);
-
-       /* allocate policy_data for all existing blkgs */
-       while (cnt--) {
-               pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
-               if (!pd) {
+pd_prealloc:
+       if (!pd_prealloc) {
+               pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
+               if (!pd_prealloc) {
                        ret = -ENOMEM;
-                       goto out_free;
+                       goto out_bypass_end;
                }
-               list_add_tail(&pd->alloc_node, &pds);
        }
 
-       /*
-        * Install the allocated pds.  With @q bypassing, no new blkg
-        * should have been created while the queue lock was dropped.
-        */
        spin_lock_irq(q->queue_lock);
 
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
-               if (WARN_ON(list_empty(&pds))) {
-                       /* umm... this shouldn't happen, just abort */
-                       ret = -ENOMEM;
-                       goto out_unlock;
-               }
-               pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
-               list_del_init(&pd->alloc_node);
+               struct blkg_policy_data *pd;
 
-               /* grab blkcg lock too while installing @pd on @blkg */
-               spin_lock(&blkg->blkcg->lock);
+               if (blkg->pd[pol->plid])
+                       continue;
+
+               pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node);
+               if (!pd)
+                       swap(pd, pd_prealloc);
+               if (!pd) {
+                       spin_unlock_irq(q->queue_lock);
+                       goto pd_prealloc;
+               }
 
                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pd->plid = pol->plid;
-               pol->pd_init_fn(blkg);
-
-               spin_unlock(&blkg->blkcg->lock);
+               if (pol->pd_init_fn)
+                       pol->pd_init_fn(pd);
        }
 
        __set_bit(pol->plid, q->blkcg_pols);
        ret = 0;
-out_unlock:
+
        spin_unlock_irq(q->queue_lock);
-out_free:
+out_bypass_end:
        blk_queue_bypass_end(q);
-       list_for_each_entry_safe(pd, n, &pds, alloc_node)
-               kfree(pd);
+       if (pd_prealloc)
+               pol->pd_free_fn(pd_prealloc);
        return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1076,21 +1279,16 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
        __clear_bit(pol->plid, q->blkcg_pols);
 
-       /* if no policy is left, no need for blkgs - shoot them down */
-       if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
-               blkg_destroy_all(q);
-
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);
 
-               if (pol->pd_offline_fn)
-                       pol->pd_offline_fn(blkg);
-               if (pol->pd_exit_fn)
-                       pol->pd_exit_fn(blkg);
-
-               kfree(blkg->pd[pol->plid]);
-               blkg->pd[pol->plid] = NULL;
+               if (blkg->pd[pol->plid]) {
+                       if (pol->pd_offline_fn)
+                               pol->pd_offline_fn(blkg->pd[pol->plid]);
+                       pol->pd_free_fn(blkg->pd[pol->plid]);
+                       blkg->pd[pol->plid] = NULL;
+               }
 
                spin_unlock(&blkg->blkcg->lock);
        }
@@ -1109,11 +1307,10 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
  */
 int blkcg_policy_register(struct blkcg_policy *pol)
 {
+       struct blkcg *blkcg;
        int i, ret;
 
-       if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data)))
-               return -EINVAL;
-
+       mutex_lock(&blkcg_pol_register_mutex);
        mutex_lock(&blkcg_pol_mutex);
 
        /* find an empty slot */
@@ -1122,19 +1319,55 @@ int blkcg_policy_register(struct blkcg_policy *pol)
                if (!blkcg_policy[i])
                        break;
        if (i >= BLKCG_MAX_POLS)
-               goto out_unlock;
+               goto err_unlock;
 
-       /* register and update blkgs */
+       /* register @pol */
        pol->plid = i;
-       blkcg_policy[i] = pol;
+       blkcg_policy[pol->plid] = pol;
+
+       /* allocate and install cpd's */
+       if (pol->cpd_alloc_fn) {
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+                       struct blkcg_policy_data *cpd;
+
+                       cpd = pol->cpd_alloc_fn(GFP_KERNEL);
+                       if (!cpd) {
+                               mutex_unlock(&blkcg_pol_mutex);
+                               goto err_free_cpds;
+                       }
+
+                       blkcg->cpd[pol->plid] = cpd;
+                       cpd->blkcg = blkcg;
+                       cpd->plid = pol->plid;
+                       pol->cpd_init_fn(cpd);
+               }
+       }
+
+       mutex_unlock(&blkcg_pol_mutex);
 
        /* everything is in place, add intf files for the new policy */
-       if (pol->cftypes)
-               WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
-                                                 pol->cftypes));
-       ret = 0;
-out_unlock:
+       if (pol->dfl_cftypes)
+               WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
+                                              pol->dfl_cftypes));
+       if (pol->legacy_cftypes)
+               WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
+                                                 pol->legacy_cftypes));
+       mutex_unlock(&blkcg_pol_register_mutex);
+       return 0;
+
+err_free_cpds:
+       if (pol->cpd_alloc_fn) {
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
+               }
+       }
+       blkcg_policy[pol->plid] = NULL;
+err_unlock:
        mutex_unlock(&blkcg_pol_mutex);
+       mutex_unlock(&blkcg_pol_register_mutex);
        return ret;
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_register);
@@ -1147,18 +1380,34 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
  */
 void blkcg_policy_unregister(struct blkcg_policy *pol)
 {
-       mutex_lock(&blkcg_pol_mutex);
+       struct blkcg *blkcg;
+
+       mutex_lock(&blkcg_pol_register_mutex);
 
        if (WARN_ON(blkcg_policy[pol->plid] != pol))
                goto out_unlock;
 
        /* kill the intf files first */
-       if (pol->cftypes)
-               cgroup_rm_cftypes(pol->cftypes);
+       if (pol->dfl_cftypes)
+               cgroup_rm_cftypes(pol->dfl_cftypes);
+       if (pol->legacy_cftypes)
+               cgroup_rm_cftypes(pol->legacy_cftypes);
+
+       /* remove cpds and unregister */
+       mutex_lock(&blkcg_pol_mutex);
 
-       /* unregister and update blkgs */
+       if (pol->cpd_alloc_fn) {
+               list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+                       if (blkcg->cpd[pol->plid]) {
+                               pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+                               blkcg->cpd[pol->plid] = NULL;
+                       }
+               }
+       }
        blkcg_policy[pol->plid] = NULL;
-out_unlock:
+
        mutex_unlock(&blkcg_pol_mutex);
+out_unlock:
+       mutex_unlock(&blkcg_pol_register_mutex);
 }
 EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
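
Putting the pieces together, a policy written against this interface registers its per-blkcg hooks (cpd_alloc_fn/cpd_init_fn/cpd_free_fn) alongside the per-blkg ones and can expose distinct file sets on the default and legacy hierarchies. A consolidated sketch of the hypothetical example policy used above, folding in the pd hooks from the first sketch (assumes <linux/module.h> in addition to the headers listed there):

/* per-blkcg, not per-blkg, state: e.g. a default weight for the policy */
struct example_blkcg_data {
        struct blkcg_policy_data cpd;
        unsigned int dfl_weight;
};

static struct blkcg_policy_data *example_cpd_alloc(gfp_t gfp)
{
        struct example_blkcg_data *ecd = kzalloc(sizeof(*ecd), gfp);

        return ecd ? &ecd->cpd : NULL;
}

static void example_cpd_init(struct blkcg_policy_data *cpd)
{
        container_of(cpd, struct example_blkcg_data, cpd)->dfl_weight = 100;
}

static void example_cpd_free(struct blkcg_policy_data *cpd)
{
        kfree(container_of(cpd, struct example_blkcg_data, cpd));
}

static struct blkcg_policy example_policy = {
        /* a .dfl_cftypes array would add files on the unified hierarchy */
        .legacy_cftypes = example_legacy_files,
        .cpd_alloc_fn   = example_cpd_alloc,
        .cpd_init_fn    = example_cpd_init,
        .cpd_free_fn    = example_cpd_free,
        .pd_alloc_fn    = example_pd_alloc,
        .pd_free_fn     = example_pd_free,
};

static int __init example_init(void)
{
        return blkcg_policy_register(&example_policy);
}
module_init(example_init);

static void __exit example_exit(void)
{
        blkcg_policy_unregister(&example_policy);
}
module_exit(example_exit);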