Optimizing the code and uploading ftrace logs to artifacts

[kvmfornfv.git] / kernel / kernel / workqueue.c
diff --git a/kernel/kernel/workqueue.c b/kernel/kernel/workqueue.c

index 21daecd..d5b0f4f 100644 (file)
--- a/kernel/kernel/workqueue.c
+++ b/kernel/kernel/workqueue.c
@@ -134,6 +134,11 @@ enum {
   *
   * PR: wq_pool_mutex protected for writes.  RCU protected for reads.
   *
+ * PW: wq_pool_mutex and wq->mutex protected for writes.  Either for reads.
+ *
+ * PWR: wq_pool_mutex and wq->mutex protected for writes.  Either or
+ *      sched-RCU for reads.
+ *
   * WQ: wq->mutex protected.
   *
   * WR: wq->mutex protected for writes.  RCU protected for reads.
@@ -254,8 +259,8 @@ struct workqueue_struct {
         int                     nr_drainers;    /* WQ: drain in progress */
         int                     saved_max_active; /* WQ: saved pwq max_active */
  
-       struct workqueue_attrs  *unbound_attrs; /* WQ: only for unbound wqs */
-       struct pool_workqueue   *dfl_pwq;       /* WQ: only for unbound wqs */
+       struct workqueue_attrs  *unbound_attrs; /* PW: only for unbound wqs */
+       struct pool_workqueue   *dfl_pwq;       /* PW: only for unbound wqs */
  
  #ifdef CONFIG_SYSFS
         struct wq_device        *wq_dev;        /* I: for sysfs interface */
@@ -275,7 +280,7 @@ struct workqueue_struct {
         /* hot fields used during command issue, aligned to cacheline */
         unsigned int            flags ____cacheline_aligned; /* WQ: WQ_* flags */
         struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
-       struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
+       struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
  };
  
  static struct kmem_cache *pwq_cache;
@@ -287,12 +292,7 @@ static bool wq_disable_numa;
  module_param_named(disable_numa, wq_disable_numa, bool, 0444);
  
  /* see the comment above the definition of WQ_POWER_EFFICIENT */
-#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
-static bool wq_power_efficient = true;
-#else
-static bool wq_power_efficient;
-#endif
-
+static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
  module_param_named(power_efficient, wq_power_efficient, bool, 0444);
  
  static bool wq_numa_enabled;           /* unbound NUMA affinity enabled */
@@ -306,6 +306,8 @@ static DEFINE_SPINLOCK(wq_mayday_lock);     /* protects wq->maydays list */
  static LIST_HEAD(workqueues);          /* PR: list of all workqueues */
  static bool workqueue_freezing;                /* PL: have wqs started freezing? */
  
+static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+
  /* the per-cpu worker pools */
  static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
                                      cpu_worker_pools);
@@ -339,22 +341,26 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
  static DEFINE_LOCAL_IRQ_LOCK(pendingb_lock);
  
  static int worker_thread(void *__worker);
-static void copy_workqueue_attrs(struct workqueue_attrs *to,
-                                const struct workqueue_attrs *from);
  static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/workqueue.h>
  
  #define assert_rcu_or_pool_mutex()                                     \
-       rcu_lockdep_assert(rcu_read_lock_held() ||                      \
-                          lockdep_is_held(&wq_pool_mutex),             \
-                          "RCU or wq_pool_mutex should be held")
+       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
+                        !lockdep_is_held(&wq_pool_mutex),              \
+                        "RCU or wq_pool_mutex should be held")
  
  #define assert_rcu_or_wq_mutex(wq)                                     \
-       rcu_lockdep_assert(rcu_read_lock_held() ||                      \
-                          lockdep_is_held(&wq->mutex),                 \
-                          "RCU or wq->mutex should be held")
+       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
+                        !lockdep_is_held(&wq->mutex),                  \
+                        "RCU or wq->mutex should be held")
+
+#define assert_rcu_or_wq_mutex_or_pool_mutex(wq)                       \
+       RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
+                        !lockdep_is_held(&wq->mutex) &&                \
+                        !lockdep_is_held(&wq_pool_mutex),              \
+                        "RCU, wq->mutex or wq_pool_mutex should be held")
  
  #define for_each_cpu_worker_pool(pool, cpu)                            \
         for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0];               \
@@ -585,7 +591,8 @@ static int worker_pool_assign_id(struct worker_pool *pool)
   * @wq: the target workqueue
   * @node: the node ID
   *
- * This must be called either with pwq_lock held or RCU read locked.
+ * This must be called with any of wq_pool_mutex, wq->mutex or RCU
+ * read locked.
   * If the pwq needs to be used beyond the locking in effect, the caller is
   * responsible for guaranteeing that the pwq stays online.
   *
@@ -594,7 +601,17 @@ static int worker_pool_assign_id(struct worker_pool *pool)
  static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
                                                   int node)
  {
-       assert_rcu_or_wq_mutex(wq);
+       assert_rcu_or_wq_mutex_or_pool_mutex(wq);
+
+       /*
+        * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
+        * delayed item is pending.  The plan is to keep CPU -> NODE
+        * mapping valid and stable across CPU on/offlines.  Once that
+        * happens, this workaround can be removed.
+        */
+       if (unlikely(node == NUMA_NO_NODE))
+               return wq->dfl_pwq;
+
         return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
  }
  
@@ -666,6 +683,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
          */
         smp_wmb();
         set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+       /*
+        * The following mb guarantees that previous clear of a PENDING bit
+        * will not be reordered with any speculative LOADS or STORES from
+        * work->current_func, which is executed afterwards.  This possible
+        * reordering can lead to a missed execution on attempt to queue
+        * the same @work.  E.g. consider this case:
+        *
+        *   CPU#0                         CPU#1
+        *   ----------------------------  --------------------------------
+        *
+        * 1  STORE event_indicated
+        * 2  queue_work_on() {
+        * 3    test_and_set_bit(PENDING)
+        * 4 }                             set_..._and_clear_pending() {
+        * 5                                 set_work_data() # clear bit
+        * 6                                 smp_mb()
+        * 7                               work->current_func() {
+        * 8                                  LOAD event_indicated
+        *                                 }
+        *
+        * Without an explicit full barrier speculative LOAD on line 8 can
+        * be executed before CPU#0 does STORE on line 1.  If that happens,
+        * CPU#0 observes the PENDING bit is still set and new execution of
+        * a @work is not queued in the hope that CPU#1 will eventually
+        * finish the queued @work.  Meanwhile CPU#1 does not see
+        * event_indicated is set, because speculative LOAD was executed
+        * before actual STORE.
+        */
+       smp_mb();
  }
  
  static void clear_work_data(struct work_struct *work)
@@ -1000,7 +1046,7 @@ static struct worker *find_worker_executing_work(struct worker_pool *pool,
   * move_linked_works - move linked works to a list
   * @work: start of series of works to be scheduled
   * @head: target list to append @work to
- * @nextp: out paramter for nested worklist walking
+ * @nextp: out parameter for nested worklist walking
   *
   * Schedule linked works starting from @work to @head.  Work series to
   * be scheduled starts at @work and includes any consecutive work with
@@ -1089,9 +1135,11 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq)
                  * As both pwqs and pools are RCU protected, the
                  * following lock operations are safe.
                  */
+               rcu_read_lock();
                 local_spin_lock_irq(pendingb_lock, &pwq->pool->lock);
                 put_pwq(pwq);
                 local_spin_unlock_irq(pendingb_lock, &pwq->pool->lock);
+               rcu_read_unlock();
         }
  }
  
@@ -1740,9 +1788,7 @@ static struct worker *create_worker(struct worker_pool *pool)
                 goto fail;
  
         set_user_nice(worker->task, pool->attrs->nice);
-
-       /* prevent userland from meddling with cpumask of workqueue workers */
-       worker->task->flags |= PF_NO_SETAFFINITY;
+       kthread_bind_mask(worker->task, pool->attrs->cpumask);
  
         /* successful, attach the worker to the pool */
         worker_attach_to_pool(worker, pool);
@@ -2642,7 +2688,7 @@ void flush_workqueue(struct workqueue_struct *wq)
  out_unlock:
         mutex_unlock(&wq->mutex);
  }
-EXPORT_SYMBOL_GPL(flush_workqueue);
+EXPORT_SYMBOL(flush_workqueue);
  
  /**
   * drain_workqueue - drain a workqueue
@@ -2651,7 +2697,7 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
   * Wait until the workqueue becomes empty.  While draining is in progress,
   * only chain queueing is allowed.  IOW, only currently pending or running
   * work items on @wq can queue further work items on it.  @wq is flushed
- * repeatedly until it becomes empty.  The number of flushing is detemined
+ * repeatedly until it becomes empty.  The number of flushing is determined
   * by the depth of chaining and should be relatively short.  Whine if it
   * takes too long.
   */
@@ -2982,36 +3028,6 @@ int schedule_on_each_cpu(work_func_t func)
         return 0;
  }
  
-/**
- * flush_scheduled_work - ensure that any scheduled work has run to completion.
- *
- * Forces execution of the kernel-global workqueue and blocks until its
- * completion.
- *
- * Think twice before calling this function!  It's very easy to get into
- * trouble if you don't take great care.  Either of the following situations
- * will lead to deadlock:
- *
- *     One of the work items currently on the workqueue needs to acquire
- *     a lock held by your code or its caller.
- *
- *     Your code is running in the context of a work routine.
- *
- * They will be detected by lockdep when they occur, but the first might not
- * occur very often.  It depends on what work items are on the workqueue and
- * what locks they need, which you have no control over.
- *
- * In most situations flushing the entire workqueue is overkill; you merely
- * need to know that a particular work item isn't queued and isn't running.
- * In such cases you should use cancel_delayed_work_sync() or
- * cancel_work_sync() instead.
- */
-void flush_scheduled_work(void)
-{
-       flush_workqueue(system_wq);
-}
-EXPORT_SYMBOL(flush_scheduled_work);
-
  /**
   * execute_in_process_context - reliably execute the routine with user context
   * @fn:                the function to execute
@@ -3117,7 +3133,7 @@ static bool wqattrs_equal(const struct workqueue_attrs *a,
   * init_worker_pool - initialize a newly zalloc'd worker_pool
   * @pool: worker_pool to initialize
   *
- * Initiailize a newly zalloc'd @pool.  It also allocates @pool->attrs.
+ * Initialize a newly zalloc'd @pool.  It also allocates @pool->attrs.
   *
   * Return: 0 on success, -errno on failure.  Even on failure, all fields
   * inside @pool proper are initialized and put_unbound_pool() can be called
@@ -3260,6 +3276,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
         u32 hash = wqattrs_hash(attrs);
         struct worker_pool *pool;
         int node;
+       int target_node = NUMA_NO_NODE;
  
         lockdep_assert_held(&wq_pool_mutex);
  
@@ -3271,13 +3288,25 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
                 }
         }
  
+       /* if cpumask is contained inside a NUMA node, we belong to that node */
+       if (wq_numa_enabled) {
+               for_each_node(node) {
+                       if (cpumask_subset(attrs->cpumask,
+                                          wq_numa_possible_cpumask[node])) {
+                               target_node = node;
+                               break;
+                       }
+               }
+       }
+
         /* nope, create a new one */
-       pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+       pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
         if (!pool || init_worker_pool(pool) < 0)
                 goto fail;
  
         lockdep_set_subclass(&pool->lock, 1);   /* see put_pwq() */
         copy_workqueue_attrs(pool->attrs, attrs);
+       pool->node = target_node;
  
         /*
          * no_numa isn't a worker_pool attribute, always clear it.  See
@@ -3285,17 +3314,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
          */
         pool->attrs->no_numa = false;
  
-       /* if cpumask is contained inside a NUMA node, we belong to that node */
-       if (wq_numa_enabled) {
-               for_each_node(node) {
-                       if (cpumask_subset(pool->attrs->cpumask,
-                                          wq_numa_possible_cpumask[node])) {
-                               pool->node = node;
-                               break;
-                       }
-               }
-       }
-
         if (worker_pool_assign_id(pool) < 0)
                 goto fail;
  
@@ -3461,20 +3479,9 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
         return pwq;
  }
  
-/* undo alloc_unbound_pwq(), used only in the error path */
-static void free_unbound_pwq(struct pool_workqueue *pwq)
-{
-       lockdep_assert_held(&wq_pool_mutex);
-
-       if (pwq) {
-               put_unbound_pool(pwq->pool);
-               kmem_cache_free(pwq_cache, pwq);
-       }
-}
-
  /**
- * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
- * @attrs: the wq_attrs of interest
+ * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of the default pwq of the target workqueue
   * @node: the target NUMA node
   * @cpu_going_down: if >= 0, the CPU to consider as offline
   * @cpumask: outarg, the resulting cpumask
@@ -3524,6 +3531,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
  {
         struct pool_workqueue *old_pwq;
  
+       lockdep_assert_held(&wq_pool_mutex);
         lockdep_assert_held(&wq->mutex);
  
         /* link_pwq() can handle duplicate calls */
@@ -3534,46 +3542,59 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
         return old_pwq;
  }
  
-/**
- * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
- * @wq: the target workqueue
- * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
- *
- * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on.  Older pwqs are released as in-flight work
- * items finish.  Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
- *
- * Performs GFP_KERNEL allocations.
- *
- * Return: 0 on success and -errno on failure.
- */
-int apply_workqueue_attrs(struct workqueue_struct *wq,
-                         const struct workqueue_attrs *attrs)
+/* context to store the prepared attrs & pwqs before applying */
+struct apply_wqattrs_ctx {
+       struct workqueue_struct *wq;            /* target workqueue */
+       struct workqueue_attrs  *attrs;         /* attrs to apply */
+       struct list_head        list;           /* queued for batching commit */
+       struct pool_workqueue   *dfl_pwq;
+       struct pool_workqueue   *pwq_tbl[];
+};
+
+/* free the resources after success or abort */
+static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
+{
+       if (ctx) {
+               int node;
+
+               for_each_node(node)
+                       put_pwq_unlocked(ctx->pwq_tbl[node]);
+               put_pwq_unlocked(ctx->dfl_pwq);
+
+               free_workqueue_attrs(ctx->attrs);
+
+               kfree(ctx);
+       }
+}
+
+/* allocate the attrs and pwqs for later installation */
+static struct apply_wqattrs_ctx *
+apply_wqattrs_prepare(struct workqueue_struct *wq,
+                     const struct workqueue_attrs *attrs)
  {
+       struct apply_wqattrs_ctx *ctx;
         struct workqueue_attrs *new_attrs, *tmp_attrs;
-       struct pool_workqueue **pwq_tbl, *dfl_pwq;
-       int node, ret;
+       int node;
  
-       /* only unbound workqueues can change attributes */
-       if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
-               return -EINVAL;
+       lockdep_assert_held(&wq_pool_mutex);
  
-       /* creating multiple pwqs breaks ordering guarantee */
-       if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
-               return -EINVAL;
+       ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
+                     GFP_KERNEL);
  
-       pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
         new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
         tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
-       if (!pwq_tbl || !new_attrs || !tmp_attrs)
-               goto enomem;
+       if (!ctx || !new_attrs || !tmp_attrs)
+               goto out_free;
  
-       /* make a copy of @attrs and sanitize it */
+       /*
+        * Calculate the attrs of the default pwq.
+        * If the user configured cpumask doesn't overlap with the
+        * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
+        */
         copy_workqueue_attrs(new_attrs, attrs);
-       cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+       cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+       if (unlikely(cpumask_empty(new_attrs->cpumask)))
+               cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);
  
         /*
          * We may create multiple pwqs with differing cpumasks.  Make a
@@ -3582,76 +3603,130 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
          */
         copy_workqueue_attrs(tmp_attrs, new_attrs);
  
-       /*
-        * CPUs should stay stable across pwq creations and installations.
-        * Pin CPUs, determine the target cpumask for each node and create
-        * pwqs accordingly.
-        */
-       get_online_cpus();
-
-       mutex_lock(&wq_pool_mutex);
-
         /*
          * If something goes wrong during CPU up/down, we'll fall back to
          * the default pwq covering whole @attrs->cpumask.  Always create
          * it even if we don't use it immediately.
          */
-       dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
-       if (!dfl_pwq)
-               goto enomem_pwq;
+       ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+       if (!ctx->dfl_pwq)
+               goto out_free;
  
         for_each_node(node) {
-               if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
-                       pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
-                       if (!pwq_tbl[node])
-                               goto enomem_pwq;
+               if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
+                       ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+                       if (!ctx->pwq_tbl[node])
+                               goto out_free;
                 } else {
-                       dfl_pwq->refcnt++;
-                       pwq_tbl[node] = dfl_pwq;
+                       ctx->dfl_pwq->refcnt++;
+                       ctx->pwq_tbl[node] = ctx->dfl_pwq;
                 }
         }
  
-       mutex_unlock(&wq_pool_mutex);
+       /* save the user configured attrs and sanitize it. */
+       copy_workqueue_attrs(new_attrs, attrs);
+       cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+       ctx->attrs = new_attrs;
+
+       ctx->wq = wq;
+       free_workqueue_attrs(tmp_attrs);
+       return ctx;
+
+out_free:
+       free_workqueue_attrs(tmp_attrs);
+       free_workqueue_attrs(new_attrs);
+       apply_wqattrs_cleanup(ctx);
+       return NULL;
+}
+
+/* set attrs and install prepared pwqs, @ctx points to old pwqs on return */
+static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
+{
+       int node;
  
         /* all pwqs have been created successfully, let's install'em */
-       mutex_lock(&wq->mutex);
+       mutex_lock(&ctx->wq->mutex);
  
-       copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+       copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
  
         /* save the previous pwq and install the new one */
         for_each_node(node)
-               pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+               ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
+                                                         ctx->pwq_tbl[node]);
  
         /* @dfl_pwq might not have been used, ensure it's linked */
-       link_pwq(dfl_pwq);
-       swap(wq->dfl_pwq, dfl_pwq);
+       link_pwq(ctx->dfl_pwq);
+       swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
  
-       mutex_unlock(&wq->mutex);
+       mutex_unlock(&ctx->wq->mutex);
+}
  
-       /* put the old pwqs */
-       for_each_node(node)
-               put_pwq_unlocked(pwq_tbl[node]);
-       put_pwq_unlocked(dfl_pwq);
+static void apply_wqattrs_lock(void)
+{
+       /* CPUs should stay stable across pwq creations and installations */
+       get_online_cpus();
+       mutex_lock(&wq_pool_mutex);
+}
  
+static void apply_wqattrs_unlock(void)
+{
+       mutex_unlock(&wq_pool_mutex);
         put_online_cpus();
-       ret = 0;
-       /* fall through */
-out_free:
-       free_workqueue_attrs(tmp_attrs);
-       free_workqueue_attrs(new_attrs);
-       kfree(pwq_tbl);
+}
+
+static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
+                                       const struct workqueue_attrs *attrs)
+{
+       struct apply_wqattrs_ctx *ctx;
+       int ret = -ENOMEM;
+
+       /* only unbound workqueues can change attributes */
+       if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+               return -EINVAL;
+
+       /* creating multiple pwqs breaks ordering guarantee */
+       if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+               return -EINVAL;
+
+       ctx = apply_wqattrs_prepare(wq, attrs);
+
+       /* the ctx has been prepared successfully, let's commit it */
+       if (ctx) {
+               apply_wqattrs_commit(ctx);
+               ret = 0;
+       }
+
+       apply_wqattrs_cleanup(ctx);
+
         return ret;
+}
  
-enomem_pwq:
-       free_unbound_pwq(dfl_pwq);
-       for_each_node(node)
-               if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
-                       free_unbound_pwq(pwq_tbl[node]);
-       mutex_unlock(&wq_pool_mutex);
-       put_online_cpus();
-enomem:
-       ret = -ENOMEM;
-       goto out_free;
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possible CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on.  Older pwqs are released as in-flight work
+ * items finish.  Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+                         const struct workqueue_attrs *attrs)
+{
+       int ret;
+
+       apply_wqattrs_lock();
+       ret = apply_workqueue_attrs_locked(wq, attrs);
+       apply_wqattrs_unlock();
+
+       return ret;
  }
  
  /**
@@ -3687,7 +3762,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
  
         lockdep_assert_held(&wq_pool_mutex);
  
-       if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+       if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND) ||
+           wq->unbound_attrs->no_numa)
                 return;
  
         /*
@@ -3698,48 +3774,37 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
         target_attrs = wq_update_unbound_numa_attrs_buf;
         cpumask = target_attrs->cpumask;
  
-       mutex_lock(&wq->mutex);
-       if (wq->unbound_attrs->no_numa)
-               goto out_unlock;
-
         copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
         pwq = unbound_pwq_by_node(wq, node);
  
         /*
          * Let's determine what needs to be done.  If the target cpumask is
-        * different from wq's, we need to compare it to @pwq's and create
-        * a new one if they don't match.  If the target cpumask equals
-        * wq's, the default pwq should be used.
+        * different from the default pwq's, we need to compare it to @pwq's
+        * and create a new one if they don't match.  If the target cpumask
+        * equals the default pwq's, the default pwq should be used.
          */
-       if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+       if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
                 if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
-                       goto out_unlock;
+                       return;
         } else {
                 goto use_dfl_pwq;
         }
  
-       mutex_unlock(&wq->mutex);
-
         /* create a new pwq */
         pwq = alloc_unbound_pwq(wq, target_attrs);
         if (!pwq) {
                 pr_warn("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
                         wq->name);
-               mutex_lock(&wq->mutex);
                 goto use_dfl_pwq;
         }
  
-       /*
-        * Install the new pwq.  As this function is called only from CPU
-        * hotplug callbacks and applying a new attrs is wrapped with
-        * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
-        * inbetween.
-        */
+       /* Install the new pwq. */
         mutex_lock(&wq->mutex);
         old_pwq = numa_pwq_tbl_install(wq, node, pwq);
         goto out_unlock;
  
  use_dfl_pwq:
+       mutex_lock(&wq->mutex);
         spin_lock_irq(&wq->dfl_pwq->pool->lock);
         get_pwq(wq->dfl_pwq);
         spin_unlock_irq(&wq->dfl_pwq->pool->lock);
@@ -3868,7 +3933,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
                 }
  
                 wq->rescuer = rescuer;
-               rescuer->task->flags |= PF_NO_SETAFFINITY;
+               kthread_bind_mask(rescuer->task, cpu_possible_mask);
                 wake_up_process(rescuer->task);
         }
  
@@ -4423,7 +4488,7 @@ static void rebind_workers(struct worker_pool *pool)
         /*
          * Restore CPU affinity of all workers.  As all idle workers should
          * be on the run-queue of the associated CPU before any local
-        * wake-ups for concurrency management happen, restore CPU affinty
+        * wake-ups for concurrency management happen, restore CPU affinity
          * of all workers first and then clear UNBOUND.  As we're called
          * from CPU_ONLINE, the following shouldn't fail.
          */
@@ -4432,6 +4497,17 @@ static void rebind_workers(struct worker_pool *pool)
                                                   pool->attrs->cpumask) < 0);
  
         spin_lock_irq(&pool->lock);
+
+       /*
+        * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
+        * w/o preceding DOWN_PREPARE.  Work around it.  CPU hotplug is
+        * being reworked and this can go away in time.
+        */
+       if (!(pool->flags & POOL_DISASSOCIATED)) {
+               spin_unlock_irq(&pool->lock);
+               return;
+       }
+
         pool->flags &= ~POOL_DISASSOCIATED;
  
         for_each_pool_worker(worker, pool) {
@@ -4736,6 +4812,82 @@ out_unlock:
  }
  #endif /* CONFIG_FREEZER */
  
+static int workqueue_apply_unbound_cpumask(void)
+{
+       LIST_HEAD(ctxs);
+       int ret = 0;
+       struct workqueue_struct *wq;
+       struct apply_wqattrs_ctx *ctx, *n;
+
+       lockdep_assert_held(&wq_pool_mutex);
+
+       list_for_each_entry(wq, &workqueues, list) {
+               if (!(wq->flags & WQ_UNBOUND))
+                       continue;
+               /* creating multiple pwqs breaks ordering guarantee */
+               if (wq->flags & __WQ_ORDERED)
+                       continue;
+
+               ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs);
+               if (!ctx) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               list_add_tail(&ctx->list, &ctxs);
+       }
+
+       list_for_each_entry_safe(ctx, n, &ctxs, list) {
+               if (!ret)
+                       apply_wqattrs_commit(ctx);
+               apply_wqattrs_cleanup(ctx);
+       }
+
+       return ret;
+}
+
+/**
+ *  workqueue_set_unbound_cpumask - Set the low-level unbound cpumask
+ *  @cpumask: the cpumask to set
+ *
+ *  The low-level workqueues cpumask is a global cpumask that limits
+ *  the affinity of all unbound workqueues.  This function checks the @cpumask
+ *  and applies it to all unbound workqueues and updates all of their pwqs.
+ *
+ *  Return:    0       - Success
+ *             -EINVAL - Invalid @cpumask
+ *             -ENOMEM - Failed to allocate memory for attrs or pwqs.
+ */
+int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
+{
+       int ret = -EINVAL;
+       cpumask_var_t saved_cpumask;
+
+       if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
+               return -ENOMEM;
+
+       cpumask_and(cpumask, cpumask, cpu_possible_mask);
+       if (!cpumask_empty(cpumask)) {
+               apply_wqattrs_lock();
+
+               /* save the old wq_unbound_cpumask. */
+               cpumask_copy(saved_cpumask, wq_unbound_cpumask);
+
+               /* update wq_unbound_cpumask at first and apply it to wqs. */
+               cpumask_copy(wq_unbound_cpumask, cpumask);
+               ret = workqueue_apply_unbound_cpumask();
+
+               /* restore the wq_unbound_cpumask when failed. */
+               if (ret < 0)
+                       cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+
+               apply_wqattrs_unlock();
+       }
+
+       free_cpumask_var(saved_cpumask);
+       return ret;
+}
+
  #ifdef CONFIG_SYSFS
  /*
   * Workqueues with WQ_SYSFS flag set is visible to userland via
@@ -4842,13 +4994,13 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
  {
         struct workqueue_attrs *attrs;
  
+       lockdep_assert_held(&wq_pool_mutex);
+
         attrs = alloc_workqueue_attrs(GFP_KERNEL);
         if (!attrs)
                 return NULL;
  
-       mutex_lock(&wq->mutex);
         copy_workqueue_attrs(attrs, wq->unbound_attrs);
-       mutex_unlock(&wq->mutex);
         return attrs;
  }
  
@@ -4857,18 +5009,22 @@ static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
  {
         struct workqueue_struct *wq = dev_to_wq(dev);
         struct workqueue_attrs *attrs;
-       int ret;
+       int ret = -ENOMEM;
+
+       apply_wqattrs_lock();
  
         attrs = wq_sysfs_prep_attrs(wq);
         if (!attrs)
-               return -ENOMEM;
+               goto out_unlock;
  
         if (sscanf(buf, "%d", &attrs->nice) == 1 &&
             attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
-               ret = apply_workqueue_attrs(wq, attrs);
+               ret = apply_workqueue_attrs_locked(wq, attrs);
         else
                 ret = -EINVAL;
  
+out_unlock:
+       apply_wqattrs_unlock();
         free_workqueue_attrs(attrs);
         return ret ?: count;
  }
@@ -4892,16 +5048,20 @@ static ssize_t wq_cpumask_store(struct device *dev,
  {
         struct workqueue_struct *wq = dev_to_wq(dev);
         struct workqueue_attrs *attrs;
-       int ret;
+       int ret = -ENOMEM;
+
+       apply_wqattrs_lock();
  
         attrs = wq_sysfs_prep_attrs(wq);
         if (!attrs)
-               return -ENOMEM;
+               goto out_unlock;
  
         ret = cpumask_parse(buf, attrs->cpumask);
         if (!ret)
-               ret = apply_workqueue_attrs(wq, attrs);
+               ret = apply_workqueue_attrs_locked(wq, attrs);
  
+out_unlock:
+       apply_wqattrs_unlock();
         free_workqueue_attrs(attrs);
         return ret ?: count;
  }
@@ -4925,18 +5085,22 @@ static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
  {
         struct workqueue_struct *wq = dev_to_wq(dev);
         struct workqueue_attrs *attrs;
-       int v, ret;
+       int v, ret = -ENOMEM;
+
+       apply_wqattrs_lock();
  
         attrs = wq_sysfs_prep_attrs(wq);
         if (!attrs)
-               return -ENOMEM;
+               goto out_unlock;
  
         ret = -EINVAL;
         if (sscanf(buf, "%d", &v) == 1) {
                 attrs->no_numa = !v;
-               ret = apply_workqueue_attrs(wq, attrs);
+               ret = apply_workqueue_attrs_locked(wq, attrs);
         }
  
+out_unlock:
+       apply_wqattrs_unlock();
         free_workqueue_attrs(attrs);
         return ret ?: count;
  }
@@ -4954,9 +5118,49 @@ static struct bus_type wq_subsys = {
         .dev_groups                     = wq_sysfs_groups,
  };
  
+static ssize_t wq_unbound_cpumask_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       int written;
+
+       mutex_lock(&wq_pool_mutex);
+       written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+                           cpumask_pr_args(wq_unbound_cpumask));
+       mutex_unlock(&wq_pool_mutex);
+
+       return written;
+}
+
+static ssize_t wq_unbound_cpumask_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t count)
+{
+       cpumask_var_t cpumask;
+       int ret;
+
+       if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+               return -ENOMEM;
+
+       ret = cpumask_parse(buf, cpumask);
+       if (!ret)
+               ret = workqueue_set_unbound_cpumask(cpumask);
+
+       free_cpumask_var(cpumask);
+       return ret ? ret : count;
+}
+
+static struct device_attribute wq_sysfs_cpumask_attr =
+       __ATTR(cpumask, 0644, wq_unbound_cpumask_show,
+              wq_unbound_cpumask_store);
+
  static int __init wq_sysfs_init(void)
  {
-       return subsys_virtual_register(&wq_subsys, NULL);
+       int err;
+
+       err = subsys_virtual_register(&wq_subsys, NULL);
+       if (err)
+               return err;
+
+       return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
  }
  core_initcall(wq_sysfs_init);
  
@@ -4988,7 +5192,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
         int ret;
  
         /*
-        * Adjusting max_active or creating new pwqs by applyting
+        * Adjusting max_active or creating new pwqs by applying
          * attributes breaks ordering guarantee.  Disallow exposing ordered
          * workqueues.
          */
@@ -5104,6 +5308,9 @@ static int __init init_workqueues(void)
  
         WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
  
+       BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+       cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+
         pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
  
         cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);