Upgrade to 4.4.50-rt62

[kvmfornfv.git] / kernel / kernel / sched / fair.c
diff --git a/kernel/kernel/sched/fair.c b/kernel/kernel/sched/fair.c

index 8c71e62..cf0a1ad 100644 (file)
--- a/kernel/kernel/sched/fair.c
+++ b/kernel/kernel/sched/fair.c
@@ -687,8 +687,6 @@ void init_entity_runnable_average(struct sched_entity *se)
         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
  }
  
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
  #else
  void init_entity_runnable_average(struct sched_entity *se)
  {
@@ -1193,8 +1191,6 @@ static void task_numa_assign(struct task_numa_env *env,
  {
         if (env->best_task)
                 put_task_struct(env->best_task);
-       if (p)
-               get_task_struct(p);
  
         env->best_task = p;
         env->best_imp = imp;
@@ -1262,20 +1258,30 @@ static void task_numa_compare(struct task_numa_env *env,
         long imp = env->p->numa_group ? groupimp : taskimp;
         long moveimp = imp;
         int dist = env->dist;
+       bool assigned = false;
  
         rcu_read_lock();
  
         raw_spin_lock_irq(&dst_rq->lock);
         cur = dst_rq->curr;
         /*
-        * No need to move the exiting task, and this ensures that ->curr
-        * wasn't reaped and thus get_task_struct() in task_numa_assign()
-        * is safe under RCU read lock.
-        * Note that rcu_read_lock() itself can't protect from the final
-        * put_task_struct() after the last schedule().
+        * No need to move the exiting task or idle task.
          */
         if ((cur->flags & PF_EXITING) || is_idle_task(cur))
                 cur = NULL;
+       else {
+               /*
+                * The task_struct must be protected here to protect the
+                * p->numa_faults access in the task_weight since the
+                * numa_faults could already be freed in the following path:
+                * finish_task_switch()
+                *     --> put_task_struct()
+                *         --> __put_task_struct()
+                *             --> task_numa_free()
+                */
+               get_task_struct(cur);
+       }
+
         raw_spin_unlock_irq(&dst_rq->lock);
  
         /*
@@ -1359,6 +1365,7 @@ balance:
                  */
                 if (!load_too_imbalanced(src_load, dst_load, env)) {
                         imp = moveimp - 1;
+                       put_task_struct(cur);
                         cur = NULL;
                         goto assign;
                 }
@@ -1384,9 +1391,16 @@ balance:
                 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
  
  assign:
+       assigned = true;
         task_numa_assign(env, cur, imp);
  unlock:
         rcu_read_unlock();
+       /*
+        * The dst_rq->curr isn't assigned. The protection for task_struct is
+        * finished.
+        */
+       if (cur && !assigned)
+               put_task_struct(cur);
  }
  
  static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2682,6 +2696,23 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
  
  static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
  
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {                          \
+       typeof(_ptr) ptr = (_ptr);                              \
+       typeof(*ptr) val = (_val);                              \
+       typeof(*ptr) res, var = READ_ONCE(*ptr);                \
+       res = var - val;                                        \
+       if (res > var)                                          \
+               res = 0;                                        \
+       WRITE_ONCE(*ptr, res);                                  \
+} while (0)
+
  /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
  static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  {
@@ -2690,15 +2721,15 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
  
         if (atomic_long_read(&cfs_rq->removed_load_avg)) {
                 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
-               sa->load_avg = max_t(long, sa->load_avg - r, 0);
-               sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+               sub_positive(&sa->load_avg, r);
+               sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
                 removed = 1;
         }
  
         if (atomic_long_read(&cfs_rq->removed_util_avg)) {
                 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
-               sa->util_avg = max_t(long, sa->util_avg - r, 0);
-               sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+               sub_positive(&sa->util_avg, r);
+               sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
         }
  
         decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2764,10 +2795,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
                           &se->avg, se->on_rq * scale_load_down(se->load.weight),
                           cfs_rq->curr == se, NULL);
  
-       cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
-       cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0);
-       cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
-       cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+       sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
+       sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
+       sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
+       sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
  }
  
  /* Add the load generated by se into cfs_rq's load average */
@@ -4577,19 +4608,24 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
                 return wl;
  
         for_each_sched_entity(se) {
-               long w, W;
+               struct cfs_rq *cfs_rq = se->my_q;
+               long W, w = cfs_rq_load_avg(cfs_rq);
  
-               tg = se->my_q->tg;
+               tg = cfs_rq->tg;
  
                 /*
                  * W = @wg + \Sum rw_j
                  */
-               W = wg + calc_tg_weight(tg, se->my_q);
+               W = wg + atomic_long_read(&tg->load_avg);
+
+               /* Ensure \Sum rw_j >= rw_i */
+               W -= cfs_rq->tg_load_avg_contrib;
+               W += w;
  
                 /*
                  * w = rw_i + @wl
                  */
-               w = cfs_rq_load_avg(se->my_q) + wl;
+               w += wl;
  
                 /*
                  * wl = S * s'_i; see (2)