These changes are the raw update of the kernel sources to linux-4.4.6-rt14.
[kvmfornfv.git] kernel/kernel/events/core.c
index 5146610..760f41d 100644
@@ -3,7 +3,7 @@
  *
  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
- *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
  *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  *
  * For licensing details see kernel-base/COPYING
@@ -36,7 +36,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/cgroup.h>
 #include <linux/perf_event.h>
-#include <linux/ftrace_event.h>
+#include <linux/trace_events.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
 #include <linux/module.h>
 
 static struct workqueue_struct *perf_wq;
 
+typedef int (*remote_function_f)(void *);
+
 struct remote_function_call {
        struct task_struct      *p;
-       int                     (*func)(void *info);
+       remote_function_f       func;
        void                    *info;
        int                     ret;
 };
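The new remote_function_f typedef just names the callback signature that was previously spelled out inline; the next two hunks switch task_function_call() and cpu_function_call() over to it, and a later hunk relies on it to cast perf_mux_hrtimer_restart() for cpu_function_call(). As a standalone illustration of the pattern (userspace sketch, not kernel code):

#include <stdio.h>

typedef int (*remote_function_f)(void *);

struct remote_function_call {
	remote_function_f	func;
	void			*info;
	int			ret;
};

static int double_it(void *info)
{
	*(int *)info *= 2;
	return 0;
}

int main(void)
{
	int value = 21;
	struct remote_function_call call = {
		.func	= double_it,
		.info	= &value,
	};

	/* stand-in for remote_function() running the callback on another CPU */
	call.ret = call.func(call.info);
	printf("ret=%d value=%d\n", call.ret, value);
	return 0;
}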
@@ -86,7 +88,7 @@ static void remote_function(void *data)
  *         -EAGAIN - when the process moved away
  */
 static int
-task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+task_function_call(struct task_struct *p, remote_function_f func, void *info)
 {
        struct remote_function_call data = {
                .p      = p,
@@ -110,7 +112,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
  *
  * returns: @func return value or -ENXIO when the cpu is offline
  */
-static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+static int cpu_function_call(int cpu, remote_function_f func, void *info)
 {
        struct remote_function_call data = {
                .p      = NULL,
@@ -161,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
 static atomic_t nr_freq_events __read_mostly;
+static atomic_t nr_switch_events __read_mostly;
 
 static LIST_HEAD(pmus);
 static DEFINE_MUTEX(pmus_lock);
@@ -193,7 +196,7 @@ static int perf_sample_period_ns __read_mostly      = DEFAULT_SAMPLE_PERIOD_NS;
 static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 
-void update_perf_cpu_limits(void)
+static void update_perf_cpu_limits(void)
 {
        u64 tmp = perf_sample_period_ns;
 
@@ -432,7 +435,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event)
        if (!is_cgroup_event(event))
                return;
 
-       cgrp = perf_cgroup_from_task(current);
+       cgrp = perf_cgroup_from_task(current, event->ctx);
        /*
         * Do not update time when cgroup is not active
         */
@@ -455,7 +458,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
        if (!task || !ctx->nr_cgroups)
                return;
 
-       cgrp = perf_cgroup_from_task(task);
+       cgrp = perf_cgroup_from_task(task, ctx);
        info = this_cpu_ptr(cgrp->info);
        info->timestamp = ctx->timestamp;
 }
@@ -469,7 +472,7 @@ perf_cgroup_set_timestamp(struct task_struct *task,
  * mode SWOUT : schedule out everything
  * mode SWIN : schedule in based on cgroup for next
  */
-void perf_cgroup_switch(struct task_struct *task, int mode)
+static void perf_cgroup_switch(struct task_struct *task, int mode)
 {
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
@@ -486,7 +489,6 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
         * we reschedule only in the presence of cgroup
         * constrained events.
         */
-       rcu_read_lock();
 
        list_for_each_entry_rcu(pmu, &pmus, entry) {
                cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
@@ -519,8 +521,10 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
                                 * set cgrp before ctxsw in to allow
                                 * event_filter_match() to not have to pass
                                 * task around
+                                * we pass the cpuctx->ctx to perf_cgroup_from_task()
+                                * because cgroup events are only per-cpu
                                 */
-                               cpuctx->cgrp = perf_cgroup_from_task(task);
+                               cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
                                cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
                        }
                        perf_pmu_enable(cpuctx->ctx.pmu);
@@ -528,8 +532,6 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
                }
        }
 
-       rcu_read_unlock();
-
        local_irq_restore(flags);
 }
 
@@ -539,17 +541,20 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
        struct perf_cgroup *cgrp1;
        struct perf_cgroup *cgrp2 = NULL;
 
+       rcu_read_lock();
        /*
         * we come here when we know perf_cgroup_events > 0
+        * we do not need to pass the ctx here because we know
+        * we are holding the rcu lock
         */
-       cgrp1 = perf_cgroup_from_task(task);
+       cgrp1 = perf_cgroup_from_task(task, NULL);
 
        /*
         * next is NULL when called from perf_event_enable_on_exec()
         * that will systematically cause a cgroup_switch()
         */
        if (next)
-               cgrp2 = perf_cgroup_from_task(next);
+               cgrp2 = perf_cgroup_from_task(next, NULL);
 
        /*
         * only schedule out current cgroup events if we know
@@ -558,6 +563,8 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
         */
        if (cgrp1 != cgrp2)
                perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+
+       rcu_read_unlock();
 }
 
 static inline void perf_cgroup_sched_in(struct task_struct *prev,
@@ -566,13 +573,16 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
        struct perf_cgroup *cgrp1;
        struct perf_cgroup *cgrp2 = NULL;
 
+       rcu_read_lock();
        /*
         * we come here when we know perf_cgroup_events > 0
+        * we do not need to pass the ctx here because we know
+        * we are holding the rcu lock
         */
-       cgrp1 = perf_cgroup_from_task(task);
+       cgrp1 = perf_cgroup_from_task(task, NULL);
 
        /* prev can never be NULL */
-       cgrp2 = perf_cgroup_from_task(prev);
+       cgrp2 = perf_cgroup_from_task(prev, NULL);
 
        /*
         * only need to schedule in cgroup events if we are changing
@@ -581,6 +591,8 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
         */
        if (cgrp1 != cgrp2)
                perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+
+       rcu_read_unlock();
 }
 
 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
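The second argument to perf_cgroup_from_task() feeds a lockdep check: callers that hold rcu_read_lock() (as the two scheduling helpers above now do) pass NULL, while callers that only hold ctx->lock pass the context. The matching header change is not part of this file; roughly, the corresponding include/linux/perf_event.h hunk looks like this (a sketch shown only for context):

static inline struct perf_cgroup *
perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx)
{
	return container_of(task_css_check(task, perf_event_cgrp_id,
					   ctx ? lockdep_is_held(&ctx->lock) : true),
			    struct perf_cgroup, css);
}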
@@ -747,62 +759,31 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 /*
  * function must be called with interrupts disabled
  */
-static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 {
        struct perf_cpu_context *cpuctx;
-       enum hrtimer_restart ret = HRTIMER_NORESTART;
        int rotations = 0;
 
        WARN_ON(!irqs_disabled());
 
        cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-
        rotations = perf_rotate_context(cpuctx);
 
-       /*
-        * arm timer if needed
-        */
-       if (rotations) {
+       raw_spin_lock(&cpuctx->hrtimer_lock);
+       if (rotations)
                hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
-               ret = HRTIMER_RESTART;
-       }
-
-       return ret;
-}
-
-/* CPU is going down */
-void perf_cpu_hrtimer_cancel(int cpu)
-{
-       struct perf_cpu_context *cpuctx;
-       struct pmu *pmu;
-       unsigned long flags;
-
-       if (WARN_ON(cpu != smp_processor_id()))
-               return;
-
-       local_irq_save(flags);
-
-       rcu_read_lock();
-
-       list_for_each_entry_rcu(pmu, &pmus, entry) {
-               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-               if (pmu->task_ctx_nr == perf_sw_context)
-                       continue;
-
-               hrtimer_cancel(&cpuctx->hrtimer);
-       }
-
-       rcu_read_unlock();
+       else
+               cpuctx->hrtimer_active = 0;
+       raw_spin_unlock(&cpuctx->hrtimer_lock);
 
-       local_irq_restore(flags);
+       return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
 }
 
-static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 {
-       struct hrtimer *hr = &cpuctx->hrtimer;
+       struct hrtimer *timer = &cpuctx->hrtimer;
        struct pmu *pmu = cpuctx->ctx.pmu;
-       int timer;
+       u64 interval;
 
        /* no multiplexing needed for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
@@ -812,31 +793,37 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
         * check default is sane, if not set then force to
         * default interval (1/tick)
         */
-       timer = pmu->hrtimer_interval_ms;
-       if (timer < 1)
-               timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+       interval = pmu->hrtimer_interval_ms;
+       if (interval < 1)
+               interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 
-       cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+       cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 
-       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
-       hr->function = perf_cpu_hrtimer_handler;
+       raw_spin_lock_init(&cpuctx->hrtimer_lock);
+       hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
+       timer->function = perf_mux_hrtimer_handler;
+       timer->irqsafe = 1;
 }
 
-static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
 {
-       struct hrtimer *hr = &cpuctx->hrtimer;
+       struct hrtimer *timer = &cpuctx->hrtimer;
        struct pmu *pmu = cpuctx->ctx.pmu;
+       unsigned long flags;
 
        /* not for SW PMU */
        if (pmu->task_ctx_nr == perf_sw_context)
-               return;
+               return 0;
 
-       if (hrtimer_active(hr))
-               return;
+       raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+       if (!cpuctx->hrtimer_active) {
+               cpuctx->hrtimer_active = 1;
+               hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+               hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+       }
+       raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
 
-       if (!hrtimer_callback_running(hr))
-               __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
-                                        0, HRTIMER_MODE_REL_PINNED, 0);
+       return 0;
 }
 
 void perf_pmu_disable(struct pmu *pmu)
@@ -1073,13 +1060,13 @@ retry:
        /*
         * One of the few rules of preemptible RCU is that one cannot do
         * rcu_read_unlock() while holding a scheduler (or nested) lock when
-        * part of the read side critical section was preemptible -- see
+        * part of the read side critical section was irqs-enabled -- see
         * rcu_read_unlock_special().
         *
         * Since ctx->lock nests under rq->lock we must ensure the entire read
-        * side critical section is non-preemptible.
+        * side critical section has interrupts disabled.
         */
-       preempt_disable();
+       local_irq_save(*flags);
        rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
        if (ctx) {
@@ -1093,21 +1080,22 @@ retry:
                 * if so.  If we locked the right context, then it
                 * can't get swapped on us any more.
                 */
-               raw_spin_lock_irqsave(&ctx->lock, *flags);
+               raw_spin_lock(&ctx->lock);
                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
-                       raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+                       raw_spin_unlock(&ctx->lock);
                        rcu_read_unlock();
-                       preempt_enable();
+                       local_irq_restore(*flags);
                        goto retry;
                }
 
                if (!atomic_inc_not_zero(&ctx->refcount)) {
-                       raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+                       raw_spin_unlock(&ctx->lock);
                        ctx = NULL;
                }
        }
        rcu_read_unlock();
-       preempt_enable();
+       if (!ctx)
+               local_irq_restore(*flags);
        return ctx;
 }
 
@@ -1266,11 +1254,7 @@ static inline void perf_event__state_init(struct perf_event *event)
                                              PERF_EVENT_STATE_INACTIVE;
 }
 
-/*
- * Called at perf_event creation and when events are attached/detached from a
- * group.
- */
-static void perf_event__read_size(struct perf_event *event)
+static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
 {
        int entry = sizeof(u64); /* value */
        int size = 0;
@@ -1286,7 +1270,7 @@ static void perf_event__read_size(struct perf_event *event)
                entry += sizeof(u64);
 
        if (event->attr.read_format & PERF_FORMAT_GROUP) {
-               nr += event->group_leader->nr_siblings;
+               nr += nr_siblings;
                size += sizeof(u64);
        }
 
@@ -1294,14 +1278,11 @@ static void perf_event__read_size(struct perf_event *event)
        event->read_size = size;
 }
 
-static void perf_event__header_size(struct perf_event *event)
+static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
 {
        struct perf_sample_data *data;
-       u64 sample_type = event->attr.sample_type;
        u16 size = 0;
 
-       perf_event__read_size(event);
-
        if (sample_type & PERF_SAMPLE_IP)
                size += sizeof(data->ip);
 
@@ -1326,6 +1307,17 @@ static void perf_event__header_size(struct perf_event *event)
        event->header_size = size;
 }
 
+/*
+ * Called at perf_event creation and when events are attached/detached from a
+ * group.
+ */
+static void perf_event__header_size(struct perf_event *event)
+{
+       __perf_event_read_size(event,
+                              event->group_leader->nr_siblings);
+       __perf_event_header_size(event, event->attr.sample_type);
+}
+
 static void perf_event__id_header_size(struct perf_event *event)
 {
        struct perf_sample_data *data;
@@ -1353,6 +1345,27 @@ static void perf_event__id_header_size(struct perf_event *event)
        event->id_header_size = size;
 }
 
+static bool perf_event_validate_size(struct perf_event *event)
+{
+       /*
+        * The values computed here will be over-written when we actually
+        * attach the event.
+        */
+       __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
+       __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
+       perf_event__id_header_size(event);
+
+       /*
+        * Sum the lot; should not exceed the 64k limit we have on records.
+        * Conservative limit to allow for callchains and other variable fields.
+        */
+       if (event->read_size + event->header_size +
+           event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
+               return false;
+
+       return true;
+}
+
 static void perf_group_attach(struct perf_event *event)
 {
        struct perf_event *group_leader = event->group_leader, *pos;
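perf_event_validate_size() works because __perf_event_read_size() is a pure function of the read_format bits and the sibling count, so the size of a would-be group can be computed before the event is attached. The userspace sketch below repeats that arithmetic (assumed equivalent, not the kernel code itself) to show how quickly a group with PERF_FORMAT_GROUP|PERF_FORMAT_ID approaches the 16k record limit:

#include <stdio.h>
#include <stdint.h>
#include <linux/perf_event.h>

/* same arithmetic as __perf_event_read_size() in the hunk above */
static uint64_t read_size(uint64_t read_format, int nr_siblings)
{
	int entry = sizeof(uint64_t);		/* value */
	int size = 0;
	int nr = 1;

	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
		size += sizeof(uint64_t);
	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
		size += sizeof(uint64_t);
	if (read_format & PERF_FORMAT_ID)
		entry += sizeof(uint64_t);
	if (read_format & PERF_FORMAT_GROUP) {
		nr += nr_siblings;
		size += sizeof(uint64_t);	/* the leading nr field */
	}

	return size + (uint64_t)nr * entry;
}

int main(void)
{
	uint64_t fmt = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
		       PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING;

	for (int siblings = 0; siblings <= 1024; siblings += 256)
		printf("%4d siblings -> read_size = %llu bytes\n",
		       siblings, (unsigned long long)read_size(fmt, siblings));
	return 0;
}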
@@ -1526,11 +1539,17 @@ static int __init perf_workqueue_init(void)
 
 core_initcall(perf_workqueue_init);
 
+static inline int pmu_filter_match(struct perf_event *event)
+{
+       struct pmu *pmu = event->pmu;
+       return pmu->filter_match ? pmu->filter_match(event) : 1;
+}
+
 static inline int
 event_filter_match(struct perf_event *event)
 {
        return (event->cpu == -1 || event->cpu == smp_processor_id())
-           && perf_cgroup_match(event);
+           && perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void
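pmu_filter_match() gives the PMU driver a veto over scheduling an event on the current CPU, on top of the existing cpu and cgroup checks; heterogeneous-CPU systems, where one PMU instance only covers a subset of CPUs, are the intended user. A hypothetical callback (example_pmu and supported_cpus are invented for illustration) might look like:

struct example_pmu {
	struct pmu	pmu;
	cpumask_t	supported_cpus;	/* CPUs this PMU instance can count on */
};

static int example_pmu_filter_match(struct perf_event *event)
{
	struct example_pmu *epmu = container_of(event->pmu, struct example_pmu, pmu);

	/* only let the event be scheduled on CPUs this PMU actually covers */
	return cpumask_test_cpu(smp_processor_id(), &epmu->supported_cpus);
}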
@@ -1931,11 +1950,11 @@ group_sched_in(struct perf_event *group_event,
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
 
-       pmu->start_txn(pmu);
+       pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
 
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
-               perf_cpu_hrtimer_restart(cpuctx);
+               perf_mux_hrtimer_restart(cpuctx);
                return -EAGAIN;
        }
 
@@ -1982,7 +2001,7 @@ group_error:
 
        pmu->cancel_txn(pmu);
 
-       perf_cpu_hrtimer_restart(cpuctx);
+       perf_mux_hrtimer_restart(cpuctx);
 
        return -EAGAIN;
 }
@@ -2255,7 +2274,7 @@ static int __perf_event_enable(void *info)
                 */
                if (leader != event) {
                        group_sched_out(leader, cpuctx, ctx);
-                       perf_cpu_hrtimer_restart(cpuctx);
+                       perf_mux_hrtimer_restart(cpuctx);
                }
                if (leader->attr.pinned) {
                        update_group_times(leader);
@@ -2637,6 +2656,9 @@ static void perf_pmu_sched_task(struct task_struct *prev,
        local_irq_restore(flags);
 }
 
+static void perf_event_switch(struct task_struct *task,
+                             struct task_struct *next_prev, bool sched_in);
+
 #define for_each_task_context_nr(ctxn)                                 \
        for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
 
@@ -2659,6 +2681,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(task, next, false);
 
+       if (atomic_read(&nr_switch_events))
+               perf_event_switch(task, next, false);
+
        for_each_task_context_nr(ctxn)
                perf_event_context_sched_out(task, ctxn, next);
 
@@ -2849,6 +2874,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
        if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
                perf_cgroup_sched_in(prev, task);
 
+       if (atomic_read(&nr_switch_events))
+               perf_event_switch(task, prev, true);
+
        if (__this_cpu_read(perf_sched_cb_usages))
                perf_pmu_sched_task(prev, task, true);
 }
@@ -3127,15 +3155,16 @@ static int event_enable_on_exec(struct perf_event *event,
  * Enable all of a task's events that have been marked enable-on-exec.
  * This expects task == current.
  */
-static void perf_event_enable_on_exec(struct perf_event_context *ctx)
+static void perf_event_enable_on_exec(int ctxn)
 {
-       struct perf_event_context *clone_ctx = NULL;
+       struct perf_event_context *ctx, *clone_ctx = NULL;
        struct perf_event *event;
        unsigned long flags;
        int enabled = 0;
        int ret;
 
        local_irq_save(flags);
+       ctx = current->perf_event_ctxp[ctxn];
        if (!ctx || !ctx->nr_events)
                goto out;
 
@@ -3178,28 +3207,30 @@ out:
 
 void perf_event_exec(void)
 {
-       struct perf_event_context *ctx;
        int ctxn;
 
        rcu_read_lock();
-       for_each_task_context_nr(ctxn) {
-               ctx = current->perf_event_ctxp[ctxn];
-               if (!ctx)
-                       continue;
-
-               perf_event_enable_on_exec(ctx);
-       }
+       for_each_task_context_nr(ctxn)
+               perf_event_enable_on_exec(ctxn);
        rcu_read_unlock();
 }
 
+struct perf_read_data {
+       struct perf_event *event;
+       bool group;
+       int ret;
+};
+
 /*
  * Cross CPU call to read the hardware event
  */
 static void __perf_event_read(void *info)
 {
-       struct perf_event *event = info;
+       struct perf_read_data *data = info;
+       struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+       struct pmu *pmu = event->pmu;
 
        /*
         * If this is a task context, we need to check whether it is
@@ -3216,9 +3247,35 @@ static void __perf_event_read(void *info)
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }
+
        update_event_times(event);
-       if (event->state == PERF_EVENT_STATE_ACTIVE)
-               event->pmu->read(event);
+       if (event->state != PERF_EVENT_STATE_ACTIVE)
+               goto unlock;
+
+       if (!data->group) {
+               pmu->read(event);
+               data->ret = 0;
+               goto unlock;
+       }
+
+       pmu->start_txn(pmu, PERF_PMU_TXN_READ);
+
+       pmu->read(event);
+
+       list_for_each_entry(sub, &event->sibling_list, group_entry) {
+               update_event_times(sub);
+               if (sub->state == PERF_EVENT_STATE_ACTIVE) {
+                       /*
+                        * Use sibling's PMU rather than @event's since
+                        * sibling could be on different (eg: software) PMU.
+                        */
+                       sub->pmu->read(sub);
+               }
+       }
+
+       data->ret = pmu->commit_txn(pmu);
+
+unlock:
        raw_spin_unlock(&ctx->lock);
 }
 
@@ -3230,15 +3287,76 @@ static inline u64 perf_event_count(struct perf_event *event)
        return __perf_event_count(event);
 }
 
-static u64 perf_event_read(struct perf_event *event)
+/*
+ * NMI-safe method to read a local event, that is an event that
+ * is:
+ *   - either for the current task, or for this CPU
+ *   - does not have inherit set, for inherited task events
+ *     will not be local and we cannot read them atomically
+ *   - must not have a pmu::count method
+ */
+u64 perf_event_read_local(struct perf_event *event)
 {
+       unsigned long flags;
+       u64 val;
+
+       /*
+        * Disabling interrupts avoids all counter scheduling (context
+        * switches, timer based rotation and IPIs).
+        */
+       local_irq_save(flags);
+
+       /* If this is a per-task event, it must be for current */
+       WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
+                    event->hw.target != current);
+
+       /* If this is a per-CPU event, it must be for this CPU */
+       WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
+                    event->cpu != smp_processor_id());
+
+       /*
+        * It must not be an event with inherit set, we cannot read
+        * all child counters from atomic context.
+        */
+       WARN_ON_ONCE(event->attr.inherit);
+
+       /*
+        * It must not have a pmu::count method, those are not
+        * NMI safe.
+        */
+       WARN_ON_ONCE(event->pmu->count);
+
+       /*
+        * If the event is currently on this CPU, its either a per-task event,
+        * or local to this CPU. Furthermore it means its ACTIVE (otherwise
+        * oncpu == -1).
+        */
+       if (event->oncpu == smp_processor_id())
+               event->pmu->read(event);
+
+       val = local64_read(&event->count);
+       local_irq_restore(flags);
+
+       return val;
+}
+
+static int perf_event_read(struct perf_event *event, bool group)
+{
+       int ret = 0;
+
        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE) {
+               struct perf_read_data data = {
+                       .event = event,
+                       .group = group,
+                       .ret = 0,
+               };
                smp_call_function_single(event->oncpu,
-                                        __perf_event_read, event, 1);
+                                        __perf_event_read, &data, 1);
+               ret = data.ret;
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
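perf_event_read_local() exists so that NMI/tracing-context code (the BPF counter-read helper is the intended first user) can read an event without IPIs, locks or sleeping. A simplified, hypothetical in-kernel caller that honours the constraints listed in the comment above could be:

/* Hypothetical helper: read a counter that is expected to be local.
 * The checks mirror the WARN_ON_ONCE()s in perf_event_read_local(). */
static u64 example_read_local_counter(struct perf_event *event)
{
	/* must be counting for the current task or on this CPU */
	if (event->oncpu != smp_processor_id() &&
	    event->ctx->task != current)
		return (u64)-EINVAL;

	/* events with a pmu::count method are not NMI safe */
	if (event->pmu->count)
		return (u64)-EINVAL;

	return perf_event_read_local(event);
}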
@@ -3253,11 +3371,14 @@ static u64 perf_event_read(struct perf_event *event)
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }
-               update_event_times(event);
+               if (group)
+                       update_group_times(event);
+               else
+                       update_event_times(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
 
-       return perf_event_count(event);
+       return ret;
 }
 
 /*
@@ -3314,7 +3435,7 @@ find_lively_task_by_vpid(pid_t vpid)
 
        /* Reuse ptrace permission checks for now. */
        err = -EACCES;
-       if (!ptrace_may_access(task, PTRACE_MODE_READ))
+       if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
                goto errout;
 
        return task;
@@ -3472,6 +3593,10 @@ static void unaccount_event(struct perf_event *event)
                atomic_dec(&nr_task_events);
        if (event->attr.freq)
                atomic_dec(&nr_freq_events);
+       if (event->attr.context_switch) {
+               static_key_slow_dec_deferred(&perf_sched_events);
+               atomic_dec(&nr_switch_events);
+       }
        if (is_cgroup_event(event))
                static_key_slow_dec_deferred(&perf_sched_events);
        if (has_branch_stack(event))
@@ -3695,7 +3820,7 @@ static void put_event(struct perf_event *event)
         *     see the comment there.
         *
         *  2) there is a lock-inversion with mmap_sem through
-        *     perf_event_read_group(), which takes faults while
+        *     perf_read_group(), which takes faults while
         *     holding ctx->mutex, however this is called after
         *     the last filedesc died, so there is no possibility
         *     to trigger the AB-BA case.
@@ -3769,14 +3894,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
        *running = 0;
 
        mutex_lock(&event->child_mutex);
-       total += perf_event_read(event);
+
+       (void)perf_event_read(event, false);
+       total += perf_event_count(event);
+
        *enabled += event->total_time_enabled +
                        atomic64_read(&event->child_total_time_enabled);
        *running += event->total_time_running +
                        atomic64_read(&event->child_total_time_running);
 
        list_for_each_entry(child, &event->child_list, child_list) {
-               total += perf_event_read(child);
+               (void)perf_event_read(child, false);
+               total += perf_event_count(child);
                *enabled += child->total_time_enabled;
                *running += child->total_time_running;
        }
@@ -3786,55 +3915,95 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
 }
 EXPORT_SYMBOL_GPL(perf_event_read_value);
 
-static int perf_event_read_group(struct perf_event *event,
-                                  u64 read_format, char __user *buf)
+static int __perf_read_group_add(struct perf_event *leader,
+                                       u64 read_format, u64 *values)
 {
-       struct perf_event *leader = event->group_leader, *sub;
-       struct perf_event_context *ctx = leader->ctx;
-       int n = 0, size = 0, ret;
-       u64 count, enabled, running;
-       u64 values[5];
+       struct perf_event *sub;
+       int n = 1; /* skip @nr */
+       int ret;
 
-       lockdep_assert_held(&ctx->mutex);
+       ret = perf_event_read(leader, true);
+       if (ret)
+               return ret;
 
-       count = perf_event_read_value(leader, &enabled, &running);
+       /*
+        * Since we co-schedule groups, {enabled,running} times of siblings
+        * will be identical to those of the leader, so we only publish one
+        * set.
+        */
+       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
+               values[n++] += leader->total_time_enabled +
+                       atomic64_read(&leader->child_total_time_enabled);
+       }
 
-       values[n++] = 1 + leader->nr_siblings;
-       if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
-               values[n++] = enabled;
-       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
-               values[n++] = running;
-       values[n++] = count;
+       if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
+               values[n++] += leader->total_time_running +
+                       atomic64_read(&leader->child_total_time_running);
+       }
+
+       /*
+        * Write {count,id} tuples for every sibling.
+        */
+       values[n++] += perf_event_count(leader);
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
 
-       size = n * sizeof(u64);
+       list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+               values[n++] += perf_event_count(sub);
+               if (read_format & PERF_FORMAT_ID)
+                       values[n++] = primary_event_id(sub);
+       }
 
-       if (copy_to_user(buf, values, size))
-               return -EFAULT;
+       return 0;
+}
 
-       ret = size;
+static int perf_read_group(struct perf_event *event,
+                                  u64 read_format, char __user *buf)
+{
+       struct perf_event *leader = event->group_leader, *child;
+       struct perf_event_context *ctx = leader->ctx;
+       int ret;
+       u64 *values;
 
-       list_for_each_entry(sub, &leader->sibling_list, group_entry) {
-               n = 0;
+       lockdep_assert_held(&ctx->mutex);
 
-               values[n++] = perf_event_read_value(sub, &enabled, &running);
-               if (read_format & PERF_FORMAT_ID)
-                       values[n++] = primary_event_id(sub);
+       values = kzalloc(event->read_size, GFP_KERNEL);
+       if (!values)
+               return -ENOMEM;
 
-               size = n * sizeof(u64);
+       values[0] = 1 + leader->nr_siblings;
 
-               if (copy_to_user(buf + ret, values, size)) {
-                       return -EFAULT;
-               }
+       /*
+        * By locking the child_mutex of the leader we effectively
+        * lock the child list of all siblings.. XXX explain how.
+        */
+       mutex_lock(&leader->child_mutex);
+
+       ret = __perf_read_group_add(leader, read_format, values);
+       if (ret)
+               goto unlock;
 
-               ret += size;
+       list_for_each_entry(child, &leader->child_list, child_list) {
+               ret = __perf_read_group_add(child, read_format, values);
+               if (ret)
+                       goto unlock;
        }
 
+       mutex_unlock(&leader->child_mutex);
+
+       ret = event->read_size;
+       if (copy_to_user(buf, values, event->read_size))
+               ret = -EFAULT;
+       goto out;
+
+unlock:
+       mutex_unlock(&leader->child_mutex);
+out:
+       kfree(values);
        return ret;
 }
 
-static int perf_event_read_one(struct perf_event *event,
+static int perf_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
 {
        u64 enabled, running;
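perf_read_group() now fills one flat u64 buffer for the whole group (leader plus siblings, with child counts folded in) instead of copying out one event at a time; the user-visible layout is unchanged. A minimal reader using PERF_FORMAT_GROUP, sketched with the usual perf_event_open() boilerplate, looks like:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
	struct perf_event_attr attr;
	uint64_t buf[1 + 2 + 2 * 2];	/* nr, enabled, running, 2 x {value, id} */
	int leader, member;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID |
			   PERF_FORMAT_TOTAL_TIME_ENABLED |
			   PERF_FORMAT_TOTAL_TIME_RUNNING;
	attr.disabled = 1;
	leader = perf_event_open(&attr, 0, -1, -1, 0);

	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
	attr.disabled = 0;
	member = perf_event_open(&attr, 0, -1, leader, 0);
	if (leader < 0 || member < 0) {
		perror("perf_event_open");
		return 1;
	}

	ioctl(leader, PERF_EVENT_IOC_ENABLE, 0);
	/* ... workload under measurement ... */
	ioctl(leader, PERF_EVENT_IOC_DISABLE, 0);

	if (read(leader, buf, sizeof(buf)) < 0) {
		perror("read");
		return 1;
	}
	/* buf[0] = nr, buf[1] = time_enabled, buf[2] = time_running, then pairs */
	for (uint64_t i = 0; i < buf[0]; i++)
		printf("event %llu: value=%llu id=%llu\n",
		       (unsigned long long)i,
		       (unsigned long long)buf[3 + 2 * i],
		       (unsigned long long)buf[4 + 2 * i]);
	return 0;
}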
@@ -3872,7 +4041,7 @@ static bool is_event_hup(struct perf_event *event)
  * Read the performance event - simple non blocking version for now
  */
 static ssize_t
-perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
+__perf_read(struct perf_event *event, char __user *buf, size_t count)
 {
        u64 read_format = event->attr.read_format;
        int ret;
@@ -3890,9 +4059,9 @@ perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
 
        WARN_ON_ONCE(event->ctx->parent_ctx);
        if (read_format & PERF_FORMAT_GROUP)
-               ret = perf_event_read_group(event, read_format, buf);
+               ret = perf_read_group(event, read_format, buf);
        else
-               ret = perf_event_read_one(event, read_format, buf);
+               ret = perf_read_one(event, read_format, buf);
 
        return ret;
 }
@@ -3905,7 +4074,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
        int ret;
 
        ctx = perf_event_ctx_lock(event);
-       ret = perf_read_hw(event, buf, count);
+       ret = __perf_read(event, buf, count);
        perf_event_ctx_unlock(event, ctx);
 
        return ret;
@@ -3936,7 +4105,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 
 static void _perf_event_reset(struct perf_event *event)
 {
-       (void)perf_event_read(event);
+       (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
 }
@@ -4052,7 +4221,14 @@ retry:
                goto retry;
        }
 
-       __perf_event_period(&pe);
+       if (event->attr.freq) {
+               event->attr.sample_freq = value;
+       } else {
+               event->attr.sample_period = value;
+               event->hw.sample_period = value;
+       }
+
+       local64_set(&event->hw.period_left, 0);
        raw_spin_unlock_irq(&ctx->lock);
 
        return 0;
@@ -4411,14 +4587,6 @@ static void ring_buffer_wakeup(struct perf_event *event)
        rcu_read_unlock();
 }
 
-static void rb_free_rcu(struct rcu_head *rcu_head)
-{
-       struct ring_buffer *rb;
-
-       rb = container_of(rcu_head, struct ring_buffer, rcu_head);
-       rb_free(rb);
-}
-
 struct ring_buffer *ring_buffer_get(struct perf_event *event)
 {
        struct ring_buffer *rb;
@@ -5220,9 +5388,15 @@ void perf_output_sample(struct perf_output_handle *handle,
 
        if (sample_type & PERF_SAMPLE_RAW) {
                if (data->raw) {
-                       perf_output_put(handle, data->raw->size);
-                       __output_copy(handle, data->raw->data,
-                                          data->raw->size);
+                       u32 raw_size = data->raw->size;
+                       u32 real_size = round_up(raw_size + sizeof(u32),
+                                                sizeof(u64)) - sizeof(u32);
+                       u64 zero = 0;
+
+                       perf_output_put(handle, real_size);
+                       __output_copy(handle, data->raw->data, raw_size);
+                       if (real_size - raw_size)
+                               __output_copy(handle, &zero, real_size - raw_size);
                } else {
                        struct {
                                u32     size;
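The effect of the rounding is easiest to see numerically: the u32 size written into the record is grown so that the size field plus payload plus zero padding ends on a u64 boundary. A small userspace sketch of that arithmetic (mirroring the expression above, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* real_size = round_up(raw_size + sizeof(u32), sizeof(u64)) - sizeof(u32) */
static uint32_t raw_real_size(uint32_t raw_size)
{
	uint32_t total = raw_size + sizeof(uint32_t);

	return ((total + 7) & ~7u) - sizeof(uint32_t);
}

int main(void)
{
	for (uint32_t sz = 1; sz <= 16; sz++)
		printf("raw_size=%2u -> real_size=%2u, zero padding=%u\n",
		       sz, raw_real_size(sz), raw_real_size(sz) - sz);
	return 0;
}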
@@ -5354,8 +5528,7 @@ void perf_prepare_sample(struct perf_event_header *header,
                else
                        size += sizeof(u32);
 
-               WARN_ON_ONCE(size & (sizeof(u64)-1));
-               header->size += size;
+               header->size += round_up(size, sizeof(u64));
        }
 
        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -5424,9 +5597,9 @@ void perf_prepare_sample(struct perf_event_header *header,
        }
 }
 
-static void perf_event_output(struct perf_event *event,
-                               struct perf_sample_data *data,
-                               struct pt_regs *regs)
+void perf_event_output(struct perf_event *event,
+                       struct perf_sample_data *data,
+                       struct pt_regs *regs)
 {
        struct perf_output_handle handle;
        struct perf_event_header header;
@@ -5505,6 +5678,17 @@ perf_event_aux_ctx(struct perf_event_context *ctx,
        }
 }
 
+static void
+perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data,
+                       struct perf_event_context *task_ctx)
+{
+       rcu_read_lock();
+       preempt_disable();
+       perf_event_aux_ctx(task_ctx, output, data);
+       preempt_enable();
+       rcu_read_unlock();
+}
+
 static void
 perf_event_aux(perf_event_aux_output_cb output, void *data,
               struct perf_event_context *task_ctx)
@@ -5514,14 +5698,23 @@ perf_event_aux(perf_event_aux_output_cb output, void *data,
        struct pmu *pmu;
        int ctxn;
 
+       /*
+        * If we have task_ctx != NULL we only notify
+        * the task context itself. The task_ctx is set
+        * only for EXIT events before releasing task
+        * context.
+        */
+       if (task_ctx) {
+               perf_event_aux_task_ctx(output, data, task_ctx);
+               return;
+       }
+
        rcu_read_lock();
        list_for_each_entry_rcu(pmu, &pmus, entry) {
                cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
                if (cpuctx->unique_pmu != pmu)
                        goto next;
                perf_event_aux_ctx(&cpuctx->ctx, output, data);
-               if (task_ctx)
-                       goto next;
                ctxn = pmu->task_ctx_nr;
                if (ctxn < 0)
                        goto next;
@@ -5531,12 +5724,6 @@ perf_event_aux(perf_event_aux_output_cb output, void *data,
 next:
                put_cpu_ptr(pmu->pmu_cpu_context);
        }
-
-       if (task_ctx) {
-               preempt_disable();
-               perf_event_aux_ctx(task_ctx, output, data);
-               preempt_enable();
-       }
        rcu_read_unlock();
 }
 
@@ -5855,7 +6042,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
                 * need to add enough zero bytes after the string to handle
                 * the 64bit alignment we do later.
                 */
-               name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
+               name = file_path(file, buf, PATH_MAX - sizeof(u64));
                if (IS_ERR(name)) {
                        name = "//toolong";
                        goto cpy_name;
@@ -6017,6 +6204,124 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head,
        perf_output_end(&handle);
 }
 
+/*
+ * Lost/dropped samples logging
+ */
+void perf_log_lost_samples(struct perf_event *event, u64 lost)
+{
+       struct perf_output_handle handle;
+       struct perf_sample_data sample;
+       int ret;
+
+       struct {
+               struct perf_event_header        header;
+               u64                             lost;
+       } lost_samples_event = {
+               .header = {
+                       .type = PERF_RECORD_LOST_SAMPLES,
+                       .misc = 0,
+                       .size = sizeof(lost_samples_event),
+               },
+               .lost           = lost,
+       };
+
+       perf_event_header__init_id(&lost_samples_event.header, &sample, event);
+
+       ret = perf_output_begin(&handle, event,
+                               lost_samples_event.header.size);
+       if (ret)
+               return;
+
+       perf_output_put(&handle, lost_samples_event);
+       perf_event__output_id_sample(event, &handle, &sample);
+       perf_output_end(&handle);
+}
+
+/*
+ * context_switch tracking
+ */
+
+struct perf_switch_event {
+       struct task_struct      *task;
+       struct task_struct      *next_prev;
+
+       struct {
+               struct perf_event_header        header;
+               u32                             next_prev_pid;
+               u32                             next_prev_tid;
+       } event_id;
+};
+
+static int perf_event_switch_match(struct perf_event *event)
+{
+       return event->attr.context_switch;
+}
+
+static void perf_event_switch_output(struct perf_event *event, void *data)
+{
+       struct perf_switch_event *se = data;
+       struct perf_output_handle handle;
+       struct perf_sample_data sample;
+       int ret;
+
+       if (!perf_event_switch_match(event))
+               return;
+
+       /* Only CPU-wide events are allowed to see next/prev pid/tid */
+       if (event->ctx->task) {
+               se->event_id.header.type = PERF_RECORD_SWITCH;
+               se->event_id.header.size = sizeof(se->event_id.header);
+       } else {
+               se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
+               se->event_id.header.size = sizeof(se->event_id);
+               se->event_id.next_prev_pid =
+                                       perf_event_pid(event, se->next_prev);
+               se->event_id.next_prev_tid =
+                                       perf_event_tid(event, se->next_prev);
+       }
+
+       perf_event_header__init_id(&se->event_id.header, &sample, event);
+
+       ret = perf_output_begin(&handle, event, se->event_id.header.size);
+       if (ret)
+               return;
+
+       if (event->ctx->task)
+               perf_output_put(&handle, se->event_id.header);
+       else
+               perf_output_put(&handle, se->event_id);
+
+       perf_event__output_id_sample(event, &handle, &sample);
+
+       perf_output_end(&handle);
+}
+
+static void perf_event_switch(struct task_struct *task,
+                             struct task_struct *next_prev, bool sched_in)
+{
+       struct perf_switch_event switch_event;
+
+       /* N.B. caller checks nr_switch_events != 0 */
+
+       switch_event = (struct perf_switch_event){
+               .task           = task,
+               .next_prev      = next_prev,
+               .event_id       = {
+                       .header = {
+                               /* .type */
+                               .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
+                               /* .size */
+                       },
+                       /* .next_prev_pid */
+                       /* .next_prev_tid */
+               },
+       };
+
+       perf_event_aux(perf_event_switch_output,
+                      &switch_event,
+                      NULL);
+}
+
 /*
  * IRQ throttle logging
  */
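On the userspace side these records appear in the mmap ring buffer once attr.context_switch is set. A minimal parser for the two record types, sketched against the uapi definitions assumed to match the hunk above (per-task events get the bare header, CPU-wide events additionally carry next/prev pid and tid, and sched-out is flagged in header.misc):

#include <stdio.h>
#include <linux/perf_event.h>

struct switch_cpu_wide_record {
	struct perf_event_header	header;
	__u32				next_prev_pid;
	__u32				next_prev_tid;
	/* sample_id fields follow here if attr.sample_id_all is set */
};

static void handle_switch_record(struct perf_event_header *hdr)
{
	int out = hdr->misc & PERF_RECORD_MISC_SWITCH_OUT;

	if (hdr->type == PERF_RECORD_SWITCH) {
		printf("context switch %s\n", out ? "out" : "in");
	} else if (hdr->type == PERF_RECORD_SWITCH_CPU_WIDE) {
		struct switch_cpu_wide_record *rec = (void *)hdr;

		printf("context switch %s %s pid=%u tid=%u\n",
		       out ? "out" : "in", out ? "to" : "from",
		       rec->next_prev_pid, rec->next_prev_tid);
	}
}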
@@ -6076,8 +6381,6 @@ static void perf_log_itrace_start(struct perf_event *event)
            event->hw.itrace_started)
                return;
 
-       event->hw.itrace_started = 1;
-
        rec.header.type = PERF_RECORD_ITRACE_START;
        rec.header.misc = 0;
        rec.header.size = sizeof(rec);
@@ -6186,9 +6489,6 @@ struct swevent_htable {
 
        /* Recursion avoidance in each contexts */
        int                             recursion[PERF_NR_CONTEXTS];
-
-       /* Keeps track of cpu being initialized/exited */
-       bool                            online;
 };
 
 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -6446,14 +6746,8 @@ static int perf_swevent_add(struct perf_event *event, int flags)
        hwc->state = !(flags & PERF_EF_START);
 
        head = find_swevent_head(swhash, event);
-       if (!head) {
-               /*
-                * We can race with cpu hotplug code. Do not
-                * WARN if the cpu just got unplugged.
-                */
-               WARN_ON_ONCE(swhash->online);
+       if (WARN_ON_ONCE(!head))
                return -EINVAL;
-       }
 
        hlist_add_head_rcu(&event->hlist_entry, head);
        perf_event_update_userpage(event);
@@ -6521,7 +6815,6 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
        int err = 0;
 
        mutex_lock(&swhash->hlist_mutex);
-
        if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
                struct swevent_hlist *hlist;
 
@@ -6637,6 +6930,10 @@ static int perf_tp_filter_match(struct perf_event *event,
 {
        void *record = data->raw->data;
 
+       /* only top level events have filters set */
+       if (event->parent)
+               event = event->parent;
+
        if (likely(!event->filter) || filter_match_preds(event->filter, record))
                return 1;
        return 0;
@@ -6785,8 +7082,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
        if (event->tp_event->prog)
                return -EEXIST;
 
-       if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
-               /* bpf programs can only be attached to kprobes */
+       if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
+               /* bpf programs can only be attached to u/kprobes */
                return -EINVAL;
 
        prog = bpf_prog_get(prog_fd);
@@ -6907,9 +7204,8 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
        } else {
                period = max_t(u64, 10000, hwc->sample_period);
        }
-       __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL_PINNED, 0);
+       hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
+                     HRTIMER_MODE_REL_PINNED);
 }
 
 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
@@ -7112,24 +7408,49 @@ static void perf_pmu_nop_void(struct pmu *pmu)
 {
 }
 
+static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
+{
+}
+
 static int perf_pmu_nop_int(struct pmu *pmu)
 {
        return 0;
 }
 
-static void perf_pmu_start_txn(struct pmu *pmu)
+static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
+
+static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
 {
+       __this_cpu_write(nop_txn_flags, flags);
+
+       if (flags & ~PERF_PMU_TXN_ADD)
+               return;
+
        perf_pmu_disable(pmu);
 }
 
 static int perf_pmu_commit_txn(struct pmu *pmu)
 {
+       unsigned int flags = __this_cpu_read(nop_txn_flags);
+
+       __this_cpu_write(nop_txn_flags, 0);
+
+       if (flags & ~PERF_PMU_TXN_ADD)
+               return 0;
+
        perf_pmu_enable(pmu);
        return 0;
 }
 
 static void perf_pmu_cancel_txn(struct pmu *pmu)
 {
+       unsigned int flags =  __this_cpu_read(nop_txn_flags);
+
+       __this_cpu_write(nop_txn_flags, 0);
+
+       if (flags & ~PERF_PMU_TXN_ADD)
+               return;
+
        perf_pmu_enable(pmu);
 }
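The nop handlers above define the contract for the new transaction types: only PERF_PMU_TXN_ADD transactions imply the disable/enable bracketing, and a PMU with nothing to batch for PERF_PMU_TXN_READ should simply remember the type and fall through. A hypothetical driver-side start_txn following that contract (example_cpuc and example_cpu_hw_events are invented names, not taken from any real driver):

static void example_pmu_start_txn(struct pmu *pmu, unsigned int txn_flags)
{
	struct example_cpuc *cpuc = this_cpu_ptr(&example_cpu_hw_events);

	WARN_ON_ONCE(cpuc->txn_flags);		/* transactions do not nest */
	cpuc->txn_flags = txn_flags;
	if (txn_flags & ~PERF_PMU_TXN_ADD)
		return;				/* nothing to batch for TXN_READ */

	perf_pmu_disable(pmu);
	cpuc->n_txn = 0;			/* events added in this transaction */
}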
 
@@ -7211,6 +7532,8 @@ perf_event_mux_interval_ms_show(struct device *dev,
        return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
 }
 
+static DEFINE_MUTEX(mux_interval_mutex);
+
 static ssize_t
 perf_event_mux_interval_ms_store(struct device *dev,
                                 struct device_attribute *attr,
@@ -7230,17 +7553,21 @@ perf_event_mux_interval_ms_store(struct device *dev,
        if (timer == pmu->hrtimer_interval_ms)
                return count;
 
+       mutex_lock(&mux_interval_mutex);
        pmu->hrtimer_interval_ms = timer;
 
        /* update all cpuctx for this PMU */
-       for_each_possible_cpu(cpu) {
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
                struct perf_cpu_context *cpuctx;
                cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
                cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 
-               if (hrtimer_active(&cpuctx->hrtimer))
-                       hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+               cpu_function_call(cpu,
+                       (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
        }
+       put_online_cpus();
+       mutex_unlock(&mux_interval_mutex);
 
        return count;
 }
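Taking mux_interval_mutex and restarting each online CPU's timer via cpu_function_call() means a sysfs write now takes effect immediately and is safe against CPU hotplug. The knob itself is the usual per-PMU attribute; a trivial userspace writer (the path below is for the core "cpu" PMU and may differ for other PMUs):

#include <stdio.h>

#define MUX_ATTR "/sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms"

int main(void)
{
	FILE *f = fopen(MUX_ATTR, "w");

	if (!f) {
		perror(MUX_ATTR);
		return 1;
	}
	fprintf(f, "4\n");	/* rotate multiplexed events every 4 ms */
	return fclose(f) ? 1 : 0;
}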
@@ -7345,7 +7672,7 @@ skip_type:
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->ctx.pmu = pmu;
 
-               __perf_cpu_hrtimer_init(cpuctx, cpu);
+               __perf_mux_hrtimer_init(cpuctx, cpu);
 
                cpuctx->unique_pmu = pmu;
        }
@@ -7362,7 +7689,7 @@ got_cpu_context:
                        pmu->commit_txn = perf_pmu_commit_txn;
                        pmu->cancel_txn = perf_pmu_cancel_txn;
                } else {
-                       pmu->start_txn  = perf_pmu_nop_void;
+                       pmu->start_txn  = perf_pmu_nop_txn;
                        pmu->commit_txn = perf_pmu_nop_int;
                        pmu->cancel_txn = perf_pmu_nop_void;
                }
@@ -7450,7 +7777,7 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
        return ret;
 }
 
-struct pmu *perf_init_event(struct perf_event *event)
+static struct pmu *perf_init_event(struct perf_event *event)
 {
        struct pmu *pmu = NULL;
        int idx;
@@ -7511,6 +7838,10 @@ static void account_event(struct perf_event *event)
                if (atomic_inc_return(&nr_freq_events) == 1)
                        tick_nohz_full_kick_all();
        }
+       if (event->attr.context_switch) {
+               atomic_inc(&nr_switch_events);
+               static_key_slow_inc(&perf_sched_events.key);
+       }
        if (has_branch_stack(event))
                static_key_slow_inc(&perf_sched_events.key);
        if (is_cgroup_event(event))
@@ -8132,13 +8463,35 @@ SYSCALL_DEFINE5(perf_event_open,
 
        if (move_group) {
                gctx = group_leader->ctx;
+               mutex_lock_double(&gctx->mutex, &ctx->mutex);
+       } else {
+               mutex_lock(&ctx->mutex);
+       }
 
+       if (!perf_event_validate_size(event)) {
+               err = -E2BIG;
+               goto err_locked;
+       }
+
+       /*
+        * Must be under the same ctx::mutex as perf_install_in_context(),
+        * because we need to serialize with concurrent event creation.
+        */
+       if (!exclusive_event_installable(event, ctx)) {
+               /* exclusive and group stuff are assumed mutually exclusive */
+               WARN_ON_ONCE(move_group);
+
+               err = -EBUSY;
+               goto err_locked;
+       }
+
+       WARN_ON_ONCE(ctx->parent_ctx);
+
+       if (move_group) {
                /*
                 * See perf_event_ctx_lock() for comments on the details
                 * of swizzling perf_event::ctx.
                 */
-               mutex_lock_double(&gctx->mutex, &ctx->mutex);
-
                perf_remove_from_context(group_leader, false);
 
                list_for_each_entry(sibling, &group_leader->sibling_list,
@@ -8146,13 +8499,7 @@ SYSCALL_DEFINE5(perf_event_open,
                        perf_remove_from_context(sibling, false);
                        put_ctx(gctx);
                }
-       } else {
-               mutex_lock(&ctx->mutex);
-       }
 
-       WARN_ON_ONCE(ctx->parent_ctx);
-
-       if (move_group) {
                /*
                 * Wait for everybody to stop referencing the events through
                 * the old lists, before installing it on new lists.
@@ -8184,22 +8531,29 @@ SYSCALL_DEFINE5(perf_event_open,
                perf_event__state_init(group_leader);
                perf_install_in_context(ctx, group_leader, group_leader->cpu);
                get_ctx(ctx);
-       }
 
-       if (!exclusive_event_installable(event, ctx)) {
-               err = -EBUSY;
-               mutex_unlock(&ctx->mutex);
-               fput(event_file);
-               goto err_context;
+               /*
+                * Now that all events are installed in @ctx, nothing
+                * references @gctx anymore, so drop the last reference we have
+                * on it.
+                */
+               put_ctx(gctx);
        }
 
+       /*
+        * Precalculate sample_data sizes; do while holding ctx::mutex such
+        * that we're serialized against further additions and before
+        * perf_install_in_context() which is the point the event is active and
+        * can use these values.
+        */
+       perf_event__header_size(event);
+       perf_event__id_header_size(event);
+
        perf_install_in_context(ctx, event, event->cpu);
        perf_unpin_context(ctx);
 
-       if (move_group) {
+       if (move_group)
                mutex_unlock(&gctx->mutex);
-               put_ctx(gctx);
-       }
        mutex_unlock(&ctx->mutex);
 
        put_online_cpus();
@@ -8210,12 +8564,6 @@ SYSCALL_DEFINE5(perf_event_open,
        list_add_tail(&event->owner_entry, &current->perf_event_list);
        mutex_unlock(&current->perf_event_mutex);
 
-       /*
-        * Precalculate sample_data sizes
-        */
-       perf_event__header_size(event);
-       perf_event__id_header_size(event);
-
        /*
         * Drop the reference on the group_event after placing the
         * new event on the sibling_list. This ensures destruction
@@ -8226,6 +8574,12 @@ SYSCALL_DEFINE5(perf_event_open,
        fd_install(event_fd, event_file);
        return event_fd;
 
+err_locked:
+       if (move_group)
+               mutex_unlock(&gctx->mutex);
+       mutex_unlock(&ctx->mutex);
+/* err_file: */
+       fput(event_file);
 err_context:
        perf_unpin_context(ctx);
        put_ctx(ctx);
@@ -8450,10 +8804,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
        struct perf_event_context *child_ctx, *clone_ctx = NULL;
        unsigned long flags;
 
-       if (likely(!child->perf_event_ctxp[ctxn])) {
-               perf_event_task(child, NULL, 0);
+       if (likely(!child->perf_event_ctxp[ctxn]))
                return;
-       }
 
        local_irq_save(flags);
        /*
@@ -8537,6 +8889,14 @@ void perf_event_exit_task(struct task_struct *child)
 
        for_each_task_context_nr(ctxn)
                perf_event_exit_task_context(child, ctxn);
+
+       /*
+        * The perf_event_exit_task_context calls perf_event_task
+        * with child's task_ctx, which generates EXIT events for
+        * child contexts and sets child->perf_event_ctxp[] to NULL.
+        * At this point we need to send EXIT events to cpu contexts.
+        */
+       perf_event_task(child, NULL, 0);
 }
 
 static void perf_free_event(struct perf_event *event,
@@ -8606,6 +8966,31 @@ void perf_event_delayed_put(struct task_struct *task)
                WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
 }
 
+struct perf_event *perf_event_get(unsigned int fd)
+{
+       int err;
+       struct fd f;
+       struct perf_event *event;
+
+       err = perf_fget_light(fd, &f);
+       if (err)
+               return ERR_PTR(err);
+
+       event = f.file->private_data;
+       atomic_long_inc(&event->refcount);
+       fdput(f);
+
+       return event;
+}
+
+const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
+{
+       if (!event)
+               return ERR_PTR(-EINVAL);
+
+       return &event->attr;
+}
+
 /*
  * inherit a event from parent task to child task:
  */
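perf_event_get() and perf_event_attrs() are exported so that other kernel code (the BPF perf-event array is the intended user) can take a reference on an event it only knows by file descriptor and sanity-check its attributes. A hypothetical consumer, with the release call assumed rather than taken from this patch:

/* Hypothetical: check whether the event behind @fd is a plain hardware counter. */
static bool example_fd_is_hw_counter(int fd)
{
	const struct perf_event_attr *attr;
	struct perf_event *event;
	bool ok;

	event = perf_event_get(fd);		/* takes a reference on the event */
	if (IS_ERR(event))
		return false;

	attr = perf_event_attrs(event);
	ok = attr->type == PERF_TYPE_RAW || attr->type == PERF_TYPE_HARDWARE;

	perf_event_release_kernel(event);	/* assumed way to drop that reference */
	return ok;
}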
@@ -8893,7 +9278,6 @@ static void perf_event_init_cpu(int cpu)
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
        mutex_lock(&swhash->hlist_mutex);
-       swhash->online = true;
        if (swhash->hlist_refcount > 0) {
                struct swevent_hlist *hlist;
 
@@ -8904,7 +9288,7 @@ static void perf_event_init_cpu(int cpu)
        mutex_unlock(&swhash->hlist_mutex);
 }
 
-#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
+#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
 static void __perf_event_exit_context(void *__info)
 {
        struct remove_event re = { .detach_group = true };
@@ -8935,14 +9319,7 @@ static void perf_event_exit_cpu_context(int cpu)
 
 static void perf_event_exit_cpu(int cpu)
 {
-       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-
        perf_event_exit_cpu_context(cpu);
-
-       mutex_lock(&swhash->hlist_mutex);
-       swhash->online = false;
-       swevent_hlist_release(swhash);
-       mutex_unlock(&swhash->hlist_mutex);
 }
 #else
 static inline void perf_event_exit_cpu(int cpu) { }
@@ -9090,38 +9467,24 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
 static int __perf_cgroup_move(void *info)
 {
        struct task_struct *task = info;
+       rcu_read_lock();
        perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+       rcu_read_unlock();
        return 0;
 }
 
-static void perf_cgroup_attach(struct cgroup_subsys_state *css,
-                              struct cgroup_taskset *tset)
+static void perf_cgroup_attach(struct cgroup_taskset *tset)
 {
        struct task_struct *task;
+       struct cgroup_subsys_state *css;
 
-       cgroup_taskset_for_each(task, tset)
+       cgroup_taskset_for_each(task, css, tset)
                task_function_call(task, __perf_cgroup_move, task);
 }
 
-static void perf_cgroup_exit(struct cgroup_subsys_state *css,
-                            struct cgroup_subsys_state *old_css,
-                            struct task_struct *task)
-{
-       /*
-        * cgroup_exit() is called in the copy_process() failure path.
-        * Ignore this case since the task hasn't ran yet, this avoids
-        * trying to poke a half freed task state from generic code.
-        */
-       if (!(task->flags & PF_EXITING))
-               return;
-
-       task_function_call(task, __perf_cgroup_move, task);
-}
-
 struct cgroup_subsys perf_event_cgrp_subsys = {
        .css_alloc      = perf_cgroup_css_alloc,
        .css_free       = perf_cgroup_css_free,
-       .exit           = perf_cgroup_exit,
        .attach         = perf_cgroup_attach,
 };
 #endif /* CONFIG_CGROUP_PERF */