These changes are the raw update to linux-4.4.6-rt14. Kernel sources [...] (commit message truncated in this excerpt)

[kvmfornfv.git] / kernel / kernel / cpuset.c
diff --git a/kernel/kernel/cpuset.c b/kernel/kernel/cpuset.c

index f0acff0..2ade632 100644 (file)
--- a/kernel/kernel/cpuset.c
+++ b/kernel/kernel/cpuset.c
@@ -286,6 +286,8 @@ static struct cpuset top_cpuset = {
  static DEFINE_MUTEX(cpuset_mutex);
  static DEFINE_SPINLOCK(callback_lock);
  
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
  /*
   * CPU / memory hotplug is handled asynchronously.
   */
@@ -473,7 +475,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
  
         /* On legacy hiearchy, we must be a subset of our parent cpuset. */
         ret = -EACCES;
-       if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
+       if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+           !is_cpuset_subset(trial, par))
                 goto out;
  
         /*
@@ -497,7 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
          * be changed to have empty cpus_allowed or mems_allowed.
          */
         ret = -ENOSPC;
-       if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
+       if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
                 if (!cpumask_empty(cur->cpus_allowed) &&
                     cpumask_empty(trial->cpus_allowed))
                         goto out;
@@ -879,7 +882,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
                  * If it becomes empty, inherit the effective mask of the
                  * parent, which is guaranteed to have some CPUs.
                  */
-               if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
+               if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+                   cpumask_empty(new_cpus))
                         cpumask_copy(new_cpus, parent->effective_cpus);
  
                 /* Skip the whole subtree if the cpumask remains the same. */
@@ -896,7 +900,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
                 cpumask_copy(cp->effective_cpus, new_cpus);
                 spin_unlock_irq(&callback_lock);
  
-               WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+               WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
                         !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
  
                 update_tasks_cpumask(cp);
@@ -969,31 +973,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
  }
  
  /*
- * cpuset_migrate_mm
- *
- *    Migrate memory region from one set of nodes to another.
- *
- *    Temporarilly set tasks mems_allowed to target nodes of migration,
- *    so that the migration code can allocate pages on these nodes.
- *
- *    While the mm_struct we are migrating is typically from some
- *    other task, the task_struct mems_allowed that we are hacking
- *    is for our current task, which must allocate new pages for that
- *    migrating memory region.
+ * Migrate memory region from one set of nodes to another.  This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management.  All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
   */
  
+struct cpuset_migrate_mm_work {
+       struct work_struct      work;
+       struct mm_struct        *mm;
+       nodemask_t              from;
+       nodemask_t              to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+       struct cpuset_migrate_mm_work *mwork =
+               container_of(work, struct cpuset_migrate_mm_work, work);
+
+       /* on a wq worker, no need to worry about %current's mems_allowed */
+       do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+       mmput(mwork->mm);
+       kfree(mwork);
+}
+
  static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
                                                         const nodemask_t *to)
  {
-       struct task_struct *tsk = current;
+       struct cpuset_migrate_mm_work *mwork;
  
-       tsk->mems_allowed = *to;
-
-       do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+       mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+       if (mwork) {
+               mwork->mm = mm;
+               mwork->from = *from;
+               mwork->to = *to;
+               INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+               queue_work(cpuset_migrate_mm_wq, &mwork->work);
+       } else {
+               mmput(mm);
+       }
+}
  
-       rcu_read_lock();
-       guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
-       rcu_read_unlock();
+void cpuset_post_attach_flush(void)
+{
+       flush_workqueue(cpuset_migrate_mm_wq);
  }
  
  /*
@@ -1094,7 +1118,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
                 mpol_rebind_mm(mm, &cs->mems_allowed);
                 if (migrate)
                         cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
-               mmput(mm);
+               else
+                       mmput(mm);
         }
         css_task_iter_end(&it);
  
@@ -1135,7 +1160,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
                  * If it becomes empty, inherit the effective mask of the
                  * parent, which is guaranteed to have some MEMs.
                  */
-               if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
+               if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+                   nodes_empty(*new_mems))
                         *new_mems = parent->effective_mems;
  
                 /* Skip the whole subtree if the nodemask remains the same. */
@@ -1152,7 +1178,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
                 cp->effective_mems = *new_mems;
                 spin_unlock_irq(&callback_lock);
  
-               WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+               WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
                         !nodes_equal(cp->mems_allowed, cp->effective_mems));
  
                 update_tasks_nodemask(cp);
@@ -1426,25 +1452,26 @@ static int fmeter_getrate(struct fmeter *fmp)
  static struct cpuset *cpuset_attach_old_cs;
  
  /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
-static int cpuset_can_attach(struct cgroup_subsys_state *css,
-                            struct cgroup_taskset *tset)
+static int cpuset_can_attach(struct cgroup_taskset *tset)
  {
-       struct cpuset *cs = css_cs(css);
+       struct cgroup_subsys_state *css;
+       struct cpuset *cs;
         struct task_struct *task;
         int ret;
  
         /* used later by cpuset_attach() */
-       cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset));
+       cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
+       cs = css_cs(css);
  
         mutex_lock(&cpuset_mutex);
  
         /* allow moving tasks into an empty cpuset if on default hierarchy */
         ret = -ENOSPC;
-       if (!cgroup_on_dfl(css->cgroup) &&
+       if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
             (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
                 goto out_unlock;
  
-       cgroup_taskset_for_each(task, tset) {
+       cgroup_taskset_for_each(task, css, tset) {
                 ret = task_can_attach(task, cs->cpus_allowed);
                 if (ret)
                         goto out_unlock;
@@ -1464,9 +1491,14 @@ out_unlock:
         return ret;
  }
  
-static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
-                                struct cgroup_taskset *tset)
+static void cpuset_cancel_attach(struct cgroup_taskset *tset)
  {
+       struct cgroup_subsys_state *css;
+       struct cpuset *cs;
+
+       cgroup_taskset_first(tset, &css);
+       cs = css_cs(css);
+
         mutex_lock(&cpuset_mutex);
         css_cs(css)->attach_in_progress--;
         mutex_unlock(&cpuset_mutex);
@@ -1479,17 +1511,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css,
   */
  static cpumask_var_t cpus_attach;
  
-static void cpuset_attach(struct cgroup_subsys_state *css,
-                         struct cgroup_taskset *tset)
+static void cpuset_attach(struct cgroup_taskset *tset)
  {
         /* static buf protected by cpuset_mutex */
         static nodemask_t cpuset_attach_nodemask_to;
-       struct mm_struct *mm;
         struct task_struct *task;
-       struct task_struct *leader = cgroup_taskset_first(tset);
-       struct cpuset *cs = css_cs(css);
+       struct task_struct *leader;
+       struct cgroup_subsys_state *css;
+       struct cpuset *cs;
         struct cpuset *oldcs = cpuset_attach_old_cs;
  
+       cgroup_taskset_first(tset, &css);
+       cs = css_cs(css);
+
         mutex_lock(&cpuset_mutex);
  
         /* prepare for attach */
@@ -1500,7 +1534,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
  
         guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
  
-       cgroup_taskset_for_each(task, tset) {
+       cgroup_taskset_for_each(task, css, tset) {
                 /*
                  * can_attach beforehand should guarantee that this doesn't
                  * fail.  TODO: have a better way to handle failure here
@@ -1512,26 +1546,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
         }
  
         /*
-        * Change mm, possibly for multiple threads in a threadgroup. This is
-        * expensive and may sleep.
+        * Change mm for all threadgroup leaders. This is expensive and may
+        * sleep and should be moved outside migration path proper.
          */
         cpuset_attach_nodemask_to = cs->effective_mems;
-       mm = get_task_mm(leader);
-       if (mm) {
-               mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
-
-               /*
-                * old_mems_allowed is the same with mems_allowed here, except
-                * if this task is being moved automatically due to hotplug.
-                * In that case @mems_allowed has been updated and is empty,
-                * so @old_mems_allowed is the right nodesets that we migrate
-                * mm from.
-                */
-               if (is_memory_migrate(cs)) {
-                       cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
-                                         &cpuset_attach_nodemask_to);
+       cgroup_taskset_for_each_leader(leader, css, tset) {
+               struct mm_struct *mm = get_task_mm(leader);
+
+               if (mm) {
+                       mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
+
+                       /*
+                        * old_mems_allowed is the same with mems_allowed
+                        * here, except if this task is being moved
+                        * automatically due to hotplug.  In that case
+                        * @mems_allowed has been updated and is empty, so
+                        * @old_mems_allowed is the right nodesets that we
+                        * migrate mm from.
+                        */
+                       if (is_memory_migrate(cs))
+                               cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+                                                 &cpuset_attach_nodemask_to);
+                       else
+                               mmput(mm);
                 }
-               mmput(mm);
         }
  
         cs->old_mems_allowed = cpuset_attach_nodemask_to;
@@ -1594,9 +1632,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
         case FILE_MEMORY_PRESSURE_ENABLED:
                 cpuset_memory_pressure_enabled = !!val;
                 break;
-       case FILE_MEMORY_PRESSURE:
-               retval = -EACCES;
-               break;
         case FILE_SPREAD_PAGE:
                 retval = update_flag(CS_SPREAD_PAGE, cs, val);
                 break;
@@ -1698,6 +1733,7 @@ out_unlock:
         mutex_unlock(&cpuset_mutex);
         kernfs_unbreak_active_protection(of->kn);
         css_put(&cs->css);
+       flush_workqueue(cpuset_migrate_mm_wq);
         return retval ?: nbytes;
  }
  
@@ -1863,9 +1899,6 @@ static struct cftype files[] = {
         {
                 .name = "memory_pressure",
                 .read_u64 = cpuset_read_u64,
-               .write_u64 = cpuset_write_u64,
-               .private = FILE_MEMORY_PRESSURE,
-               .mode = S_IRUGO,
         },
  
         {
@@ -1952,7 +1985,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
         cpuset_inc();
  
         spin_lock_irq(&callback_lock);
-       if (cgroup_on_dfl(cs->css.cgroup)) {
+       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                 cpumask_copy(cs->effective_cpus, parent->effective_cpus);
                 cs->effective_mems = parent->effective_mems;
         }
@@ -2029,7 +2062,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
         mutex_lock(&cpuset_mutex);
         spin_lock_irq(&callback_lock);
  
-       if (cgroup_on_dfl(root_css->cgroup)) {
+       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
                 cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
                 top_cpuset.mems_allowed = node_possible_map;
         } else {
@@ -2210,7 +2243,7 @@ retry:
         cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
         mems_updated = !nodes_equal(new_mems, cs->effective_mems);
  
-       if (cgroup_on_dfl(cs->css.cgroup))
+       if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                 hotplug_update_tasks(cs, &new_cpus, &new_mems,
                                      cpus_updated, mems_updated);
         else
@@ -2241,7 +2274,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
         static cpumask_t new_cpus;
         static nodemask_t new_mems;
         bool cpus_updated, mems_updated;
-       bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
+       bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
  
         mutex_lock(&cpuset_mutex);
  
@@ -2346,6 +2379,9 @@ void __init cpuset_init_smp(void)
         top_cpuset.effective_mems = node_states[N_MEMORY];
  
         register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+       cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+       BUG_ON(!cpuset_migrate_mm_wq);
  }
  
  /**
@@ -2598,22 +2634,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
  }
  
  /**
- * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @tsk: pointer to task_struct of some task.
+ * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
   *
- * Description: Prints @task's name, cpuset name, and cached copy of its
+ * Description: Prints current's name, cpuset name, and cached copy of its
   * mems_allowed to the kernel log.
   */
-void cpuset_print_task_mems_allowed(struct task_struct *tsk)
+void cpuset_print_current_mems_allowed(void)
  {
         struct cgroup *cgrp;
  
         rcu_read_lock();
  
-       cgrp = task_cs(tsk)->css.cgroup;
-       pr_info("%s cpuset=", tsk->comm);
+       cgrp = task_cs(current)->css.cgroup;
+       pr_info("%s cpuset=", current->comm);
         pr_cont_cgroup_name(cgrp);
-       pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
+       pr_cont(" mems_allowed=%*pbl\n",
+               nodemask_pr_args(&current->mems_allowed));
  
         rcu_read_unlock();
  }