Upgrade to 4.4.50-rt62
[kvmfornfv.git] / kernel / kernel / sched / cputime.c
index 2da134c..558b98a 100644
@@ -259,21 +259,21 @@ static __always_inline bool steal_account_process_tick(void)
 #ifdef CONFIG_PARAVIRT
        if (static_key_false(&paravirt_steal_enabled)) {
                u64 steal;
-               cputime_t steal_ct;
+               unsigned long steal_jiffies;
 
                steal = paravirt_steal_clock(smp_processor_id());
                steal -= this_rq()->prev_steal_time;
 
                /*
-                * cputime_t may be less precise than nsecs (eg: if it's
-                * based on jiffies). Lets cast the result to cputime
+                * steal is in nsecs but our caller is expecting steal
+                * time in jiffies. Let's cast the result to jiffies
                 * granularity and account the rest on the next rounds.
                 */
-               steal_ct = nsecs_to_cputime(steal);
-               this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
+               steal_jiffies = nsecs_to_jiffies(steal);
+               this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
 
-               account_steal_time(steal_ct);
-               return steal_ct;
+               account_steal_time(jiffies_to_cputime(steal_jiffies));
+               return steal_jiffies;
        }
 #endif
        return false;
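
The hunk above only accounts whole jiffies of steal time and leaves the sub-jiffy remainder pending in prev_steal_time, where it is picked up on a later tick. Below is a minimal user-space sketch of that carry logic, not the kernel API: names are hypothetical and HZ is assumed to be 250 (one jiffy = 4,000,000 ns).

#include <stdio.h>

#define NSEC_PER_JIFFY 4000000ULL		/* assumes HZ == 250 */

static unsigned long long prev_steal_time;	/* nsecs already accounted */

/* steal_clock_ns stands in for paravirt_steal_clock(): total steal in ns */
static unsigned long account_steal(unsigned long long steal_clock_ns)
{
	unsigned long long steal = steal_clock_ns - prev_steal_time;
	unsigned long steal_jiffies = steal / NSEC_PER_JIFFY;

	/* only whole jiffies are accounted; the remainder stays pending */
	prev_steal_time += (unsigned long long)steal_jiffies * NSEC_PER_JIFFY;
	return steal_jiffies;
}

int main(void)
{
	/* 9 ms of steal so far: 2 jiffies accounted, 1 ms carried over */
	printf("%lu\n", account_steal(9000000ULL));
	/* 3 ms more (12 ms total): the carried 1 ms now completes a 3rd jiffy */
	printf("%lu\n", account_steal(12000000ULL));
	return 0;
}
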
@@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
        *ut = p->utime;
        *st = p->stime;
 }
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
@@ -555,48 +556,43 @@ drop_precision:
 }
 
 /*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
  *
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
-       cputime_t old;
-
-       while (new > (old = ACCESS_ONCE(*counter)))
-               cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Tick based cputime accounting depends on whether a task's random
+ * scheduling timeslices happen to be interrupted by the timer or not.
+ * Depending on these circumstances, the number of these interrupts may be
+ * over- or under-estimated, matching the real user and system cputime
+ * with only variable precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ *   stime + utime == rtime
+ *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
  */
 static void cputime_adjust(struct task_cputime *curr,
-                          struct cputime *prev,
+                          struct prev_cputime *prev,
                           cputime_t *ut, cputime_t *st)
 {
        cputime_t rtime, stime, utime;
+       unsigned long flags;
 
-       /*
-        * Tick based cputime accounting depend on random scheduling
-        * timeslices of a task to be interrupted or not by the timer.
-        * Depending on these circumstances, the number of these interrupts
-        * may be over or under-optimistic, matching the real user and system
-        * cputime with a variable precision.
-        *
-        * Fix this by scaling these tick based values against the total
-        * runtime accounted by the CFS scheduler.
-        */
+       /* Serialize concurrent callers such that we can honour our guarantees */
+       raw_spin_lock_irqsave(&prev->lock, flags);
        rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
        /*
-        * Update userspace visible utime/stime values only if actual execution
-        * time is bigger than already exported. Note that can happen, that we
-        * provided bigger values due to scaling inaccuracy on big numbers.
+        * This is possible under two circumstances:
+        *  - rtime isn't monotonic after all (a bug);
+        *  - we got reordered by the lock.
+        *
+        * In both cases this acts as a filter such that the rest of the code
+        * can assume it is monotonic regardless of anything else.
         */
        if (prev->stime + prev->utime >= rtime)
                goto out;
@@ -604,24 +600,53 @@ static void cputime_adjust(struct task_cputime *curr,
        stime = curr->stime;
        utime = curr->utime;
 
-       if (utime == 0) {
-               stime = rtime;
-       } else if (stime == 0) {
+       /*
+        * If stime is 0 (whether or not utime is too), assume all runtime is
+        * userspace. Once a task gets some ticks, the monotonicity code at
+        * 'update' will ensure things converge to the observed ratio.
+        */
+       if (stime == 0) {
                utime = rtime;
-       } else {
-               cputime_t total = stime + utime;
+               goto update;
+       }
 
-               stime = scale_stime((__force u64)stime,
-                                   (__force u64)rtime, (__force u64)total);
-               utime = rtime - stime;
+       if (utime == 0) {
+               stime = rtime;
+               goto update;
        }
 
-       cputime_advance(&prev->stime, stime);
-       cputime_advance(&prev->utime, utime);
+       stime = scale_stime((__force u64)stime, (__force u64)rtime,
+                           (__force u64)(stime + utime));
 
+update:
+       /*
+        * Make sure stime doesn't go backwards; this preserves monotonicity
+        * for utime because rtime is monotonic.
+        *
+        *  utime_i+1 = rtime_i+1 - stime_i
+        *            = rtime_i+1 - (rtime_i - utime_i)
+        *            = (rtime_i+1 - rtime_i) + utime_i
+        *            >= utime_i
+        */
+       if (stime < prev->stime)
+               stime = prev->stime;
+       utime = rtime - stime;
+
+       /*
+        * Make sure utime doesn't go backwards; this still preserves
+        * monotonicity for stime, analogous argument to above.
+        */
+       if (utime < prev->utime) {
+               utime = prev->utime;
+               stime = rtime - utime;
+       }
+
+       prev->stime = stime;
+       prev->utime = utime;
 out:
        *ut = prev->utime;
        *st = prev->stime;
+       raw_spin_unlock_irqrestore(&prev->lock, flags);
 }
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
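
Taken together, the cputime_adjust() changes above amount to: scale the tick-based stime against the precise CFS runtime, then clamp both values against what was previously reported so that neither goes backwards while stime + utime == rtime still holds. The following is a simplified user-space sketch of that algorithm with assumed names; it omits the prev->lock serialization and the 64-bit overflow handling of the real scale_stime().

#include <stdint.h>
#include <stdio.h>

struct prev_times { uint64_t utime, stime; };	/* last values reported */

static void adjust(struct prev_times *prev, uint64_t rtime,
		   uint64_t tick_stime, uint64_t tick_utime,
		   uint64_t *ut, uint64_t *st)
{
	uint64_t stime, utime;

	/* nothing new to report (or rtime went backwards): keep old values */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	if (tick_stime == 0)			/* no system ticks yet */
		stime = 0;
	else if (tick_utime == 0)		/* no user ticks yet */
		stime = rtime;
	else					/* naive scale_stime() */
		stime = tick_stime * rtime / (tick_stime + tick_utime);

	if (stime < prev->stime)		/* keep stime monotonic */
		stime = prev->stime;
	utime = rtime - stime;

	if (utime < prev->utime) {		/* keep utime monotonic */
		utime = prev->utime;
		stime = rtime - utime;
	}

	prev->stime = stime;
	prev->utime = utime;
out:
	*ut = prev->utime;
	*st = prev->stime;
}

int main(void)
{
	struct prev_times prev = { 0, 0 };
	uint64_t ut, st;

	/* 7 ticks of stime, 3 ticks of utime, 1000 units of precise runtime */
	adjust(&prev, 1000, 7, 3, &ut, &st);
	printf("utime=%llu stime=%llu\n",
	       (unsigned long long)ut, (unsigned long long)st);	/* 300, 700 */
	return 0;
}
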
@@ -633,6 +658,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
        task_cputime(p, &cputime.utime, &cputime.stime);
        cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
@@ -659,7 +685,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
        unsigned long long delta = vtime_delta(tsk);
 
-       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+       WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
        tsk->vtime_snap += delta;
 
        /* CHECKME: always safe to convert nsecs to cputime? */
@@ -675,45 +701,37 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
-       raw_spin_lock(&tsk->vtime_lock);
-       write_seqcount_begin(&tsk->vtime_seq);
+       write_seqcount_begin(&tsk->vtime_seqcount);
        __vtime_account_system(tsk);
-       write_seqcount_end(&tsk->vtime_seq);
-       raw_spin_unlock(&tsk->vtime_lock);
+       write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
-       raw_spin_lock(&tsk->vtime_lock);
-       write_seqcount_begin(&tsk->vtime_seq);
+       write_seqcount_begin(&tsk->vtime_seqcount);
        __vtime_account_system(tsk);
        if (context_tracking_in_user())
                tsk->vtime_snap_whence = VTIME_USER;
-       write_seqcount_end(&tsk->vtime_seq);
-       raw_spin_unlock(&tsk->vtime_lock);
+       write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_account_user(struct task_struct *tsk)
 {
        cputime_t delta_cpu;
 
-       raw_spin_lock(&tsk->vtime_lock);
-       write_seqcount_begin(&tsk->vtime_seq);
+       write_seqcount_begin(&tsk->vtime_seqcount);
        delta_cpu = get_vtime_delta(tsk);
        tsk->vtime_snap_whence = VTIME_SYS;
        account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
-       write_seqcount_end(&tsk->vtime_seq);
-       raw_spin_unlock(&tsk->vtime_lock);
+       write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
-       raw_spin_lock(&tsk->vtime_lock);
-       write_seqcount_begin(&tsk->vtime_seq);
+       write_seqcount_begin(&tsk->vtime_seqcount);
        __vtime_account_system(tsk);
        tsk->vtime_snap_whence = VTIME_USER;
-       write_seqcount_end(&tsk->vtime_seq);
-       raw_spin_unlock(&tsk->vtime_lock);
+       write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
@@ -725,23 +743,19 @@ void vtime_guest_enter(struct task_struct *tsk)
         * synchronization against the reader (task_gtime())
         * that can thus safely catch up with a tickless delta.
         */
-       raw_spin_lock(&tsk->vtime_lock);
-       write_seqcount_begin(&tsk->vtime_seq);
+       write_seqcount_begin(&tsk->vtime_seqcount);
        __vtime_account_system(tsk);
        current->flags |= PF_VCPU;
-       write_seqcount_end(&tsk->vtime_seq);
-       raw_spin_unlock(&tsk->vtime_lock);
+       write_seqcount_end(&tsk->vtime_seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
-       raw_spin_lock(&tsk->vtime_lock);
-       write_seqcount_begin(&tsk->vtime_seq);
+       write_seqcount_begin(&tsk->vtime_seqcount);
        __vtime_account_system(tsk);
        current->flags &= ~PF_VCPU;
-       write_seqcount_end(&tsk->vtime_seq);
-       raw_spin_unlock(&tsk->vtime_lock);
+       write_seqcount_end(&tsk->vtime_seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
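
The comment above relies on the writer/reader protocol that these hunks convert from a seqlock (spinlock plus seqcount) to a plain seqcount: the writer makes the sequence odd while it updates, and the lockless readers (task_gtime(), fetch_task_cputime()) retry whenever they observe an odd or changed sequence. Below is a rough user-space sketch of that pattern using C11 atomics, assuming a single serialized writer; the kernel's write_seqcount_begin()/read_seqcount_retry() additionally handle the exact barriers, preemption and lockdep, which this sketch glosses over.

#include <stdatomic.h>
#include <stdint.h>

struct gtime_state {
	atomic_uint seq;			/* odd while an update is in flight */
	atomic_uint_fast64_t gtime;		/* published under seq */
};

static void gtime_add(struct gtime_state *st, uint64_t delta)
{
	unsigned int s = atomic_load_explicit(&st->seq, memory_order_relaxed);
	uint64_t g = atomic_load_explicit(&st->gtime, memory_order_relaxed);

	/* "write_seqcount_begin": make the counter odd before touching data */
	atomic_store_explicit(&st->seq, s + 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);

	atomic_store_explicit(&st->gtime, g + delta, memory_order_relaxed);

	/* "write_seqcount_end": even again, publishing the update */
	atomic_store_explicit(&st->seq, s + 2, memory_order_release);
}

static uint64_t gtime_read(struct gtime_state *st)
{
	unsigned int s1, s2;
	uint64_t val;

	do {
		s1 = atomic_load_explicit(&st->seq, memory_order_acquire);
		val = atomic_load_explicit(&st->gtime, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);
		s2 = atomic_load_explicit(&st->seq, memory_order_relaxed);
	} while ((s1 & 1) || s1 != s2);		/* raced with the writer: retry */

	return val;
}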
 
@@ -754,30 +768,26 @@ void vtime_account_idle(struct task_struct *tsk)
 
 void arch_vtime_task_switch(struct task_struct *prev)
 {
-       raw_spin_lock(&prev->vtime_lock);
-       write_seqcount_begin(&prev->vtime_seq);
-       prev->vtime_snap_whence = VTIME_SLEEPING;
-       write_seqcount_end(&prev->vtime_seq);
-       raw_spin_unlock(&prev->vtime_lock);
+       write_seqcount_begin(&prev->vtime_seqcount);
+       prev->vtime_snap_whence = VTIME_INACTIVE;
+       write_seqcount_end(&prev->vtime_seqcount);
 
-       raw_spin_lock(&current->vtime_lock);
-       write_seqcount_begin(&current->vtime_seq);
+       write_seqcount_begin(&current->vtime_seqcount);
        current->vtime_snap_whence = VTIME_SYS;
        current->vtime_snap = sched_clock_cpu(smp_processor_id());
-       write_seqcount_end(&current->vtime_seq);
-       raw_spin_unlock(&current->vtime_lock);
+       write_seqcount_end(&current->vtime_seqcount);
 }
 
 void vtime_init_idle(struct task_struct *t, int cpu)
 {
        unsigned long flags;
 
-       raw_spin_lock_irqsave(&t->vtime_lock, flags);
-       write_seqcount_begin(&t->vtime_seq);
+       local_irq_save(flags);
+       write_seqcount_begin(&t->vtime_seqcount);
        t->vtime_snap_whence = VTIME_SYS;
        t->vtime_snap = sched_clock_cpu(cpu);
-       write_seqcount_end(&t->vtime_seq);
-       raw_spin_unlock_irqrestore(&t->vtime_lock, flags);
+       write_seqcount_end(&t->vtime_seqcount);
+       local_irq_restore(flags);
 }
 
 cputime_t task_gtime(struct task_struct *t)
@@ -785,14 +795,17 @@ cputime_t task_gtime(struct task_struct *t)
        unsigned int seq;
        cputime_t gtime;
 
+       if (!context_tracking_is_enabled())
+               return t->gtime;
+
        do {
-               seq = read_seqcount_begin(&t->vtime_seq);
+               seq = read_seqcount_begin(&t->vtime_seqcount);
 
                gtime = t->gtime;
                if (t->flags & PF_VCPU)
                        gtime += vtime_delta(t);
 
-       } while (read_seqcount_retry(&t->vtime_seq, seq));
+       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
 
        return gtime;
 }
@@ -815,7 +828,7 @@ fetch_task_cputime(struct task_struct *t,
                *udelta = 0;
                *sdelta = 0;
 
-               seq = read_seqcount_begin(&t->vtime_seq);
+               seq = read_seqcount_begin(&t->vtime_seqcount);
 
                if (u_dst)
                        *u_dst = *u_src;
@@ -823,7 +836,7 @@ fetch_task_cputime(struct task_struct *t,
                        *s_dst = *s_src;
 
                /* Task is sleeping, nothing to add */
-               if (t->vtime_snap_whence == VTIME_SLEEPING ||
+               if (t->vtime_snap_whence == VTIME_INACTIVE ||
                    is_idle_task(t))
                        continue;
 
@@ -839,7 +852,7 @@ fetch_task_cputime(struct task_struct *t,
                        if (t->vtime_snap_whence == VTIME_SYS)
                                *sdelta = delta;
                }
-       } while (read_seqcount_retry(&t->vtime_seq, seq));
+       } while (read_seqcount_retry(&t->vtime_seqcount, seq));
 }