These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] kernel/mm/memcontrol.c
index 8bd68b5..095d20f 100644
@@ -62,6 +62,7 @@
 #include <linux/oom.h>
 #include <linux/lockdep.h>
 #include <linux/file.h>
+#include <linux/tracehook.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
@@ -79,6 +80,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 
 #define MEM_CGROUP_RECLAIM_RETRIES     5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
+struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
@@ -93,6 +95,7 @@ static const char * const mem_cgroup_stat_names[] = {
        "rss",
        "rss_huge",
        "mapped_file",
+       "dirty",
        "writeback",
        "swap",
 };
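
With "dirty" added to mem_cgroup_stat_names, per-cgroup dirty page accounting shows up as a "dirty" line in the controller's memory.stat file, reported in bytes because memcg_stat_show() further down multiplies the per-page counts by PAGE_SIZE. A minimal user-space sketch for reading it; the legacy mount point /sys/fs/cgroup/memory and the group name "demo" are assumptions that depend on how the controller is mounted on a given system.

/* Minimal sketch: read the new "dirty" counter out of memory.stat.
 * Assumes the legacy (v1) memory controller is mounted at
 * /sys/fs/cgroup/memory and that a child group "demo" exists; adjust
 * the path for your setup.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/demo/memory.stat";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned long long bytes;

		/* Each line is "<name> <value>"; match the new entry. */
		if (sscanf(line, "dirty %llu", &bytes) == 1)
			printf("dirty: %llu bytes\n", bytes);
	}
	fclose(f);
	return 0;
}
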
@@ -112,56 +115,10 @@ static const char * const mem_cgroup_lru_names[] = {
        "unevictable",
 };
 
-/*
- * Per memcg event counter is incremented at every pagein/pageout. With THP,
- * it will be incremated by the number of pages. This counter is used for
- * for trigger some periodic events. This is straightforward and better
- * than using jiffies etc. to handle periodic memcg event.
- */
-enum mem_cgroup_events_target {
-       MEM_CGROUP_TARGET_THRESH,
-       MEM_CGROUP_TARGET_SOFTLIMIT,
-       MEM_CGROUP_TARGET_NUMAINFO,
-       MEM_CGROUP_NTARGETS,
-};
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 #define NUMAINFO_EVENTS_TARGET 1024
 
-struct mem_cgroup_stat_cpu {
-       long count[MEM_CGROUP_STAT_NSTATS];
-       unsigned long events[MEMCG_NR_EVENTS];
-       unsigned long nr_page_events;
-       unsigned long targets[MEM_CGROUP_NTARGETS];
-};
-
-struct reclaim_iter {
-       struct mem_cgroup *position;
-       /* scan generation, increased every round-trip */
-       unsigned int generation;
-};
-
-/*
- * per-zone information in memory controller.
- */
-struct mem_cgroup_per_zone {
-       struct lruvec           lruvec;
-       unsigned long           lru_size[NR_LRU_LISTS];
-
-       struct reclaim_iter     iter[DEF_PRIORITY + 1];
-
-       struct rb_node          tree_node;      /* RB tree node */
-       unsigned long           usage_in_excess;/* Set to the value by which */
-                                               /* the soft limit is exceeded*/
-       bool                    on_tree;
-       struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
-                                               /* use container_of        */
-};
-
-struct mem_cgroup_per_node {
-       struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
-};
-
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
@@ -182,32 +139,6 @@ struct mem_cgroup_tree {
 
 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 
-struct mem_cgroup_threshold {
-       struct eventfd_ctx *eventfd;
-       unsigned long threshold;
-};
-
-/* For threshold */
-struct mem_cgroup_threshold_ary {
-       /* An array index points to threshold just below or equal to usage. */
-       int current_threshold;
-       /* Size of entries[] */
-       unsigned int size;
-       /* Array of thresholds */
-       struct mem_cgroup_threshold entries[0];
-};
-
-struct mem_cgroup_thresholds {
-       /* Primary thresholds array */
-       struct mem_cgroup_threshold_ary *primary;
-       /*
-        * Spare threshold array.
-        * This is needed to make mem_cgroup_unregister_event() "never fail".
-        * It must be able to store at least primary->size - 1 entries.
-        */
-       struct mem_cgroup_threshold_ary *spare;
-};
-
 /* for OOM */
 struct mem_cgroup_eventfd_list {
        struct list_head list;
@@ -257,113 +188,6 @@ struct mem_cgroup_event {
 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 
-/*
- * The memory controller data structure. The memory controller controls both
- * page cache and RSS per cgroup. We would eventually like to provide
- * statistics based on the statistics developed by Rik Van Riel for clock-pro,
- * to help the administrator determine what knobs to tune.
- */
-struct mem_cgroup {
-       struct cgroup_subsys_state css;
-
-       /* Accounted resources */
-       struct page_counter memory;
-       struct page_counter memsw;
-       struct page_counter kmem;
-
-       /* Normal memory consumption range */
-       unsigned long low;
-       unsigned long high;
-
-       unsigned long soft_limit;
-
-       /* vmpressure notifications */
-       struct vmpressure vmpressure;
-
-       /* css_online() has been completed */
-       int initialized;
-
-       /*
-        * Should the accounting and control be hierarchical, per subtree?
-        */
-       bool use_hierarchy;
-
-       bool            oom_lock;
-       atomic_t        under_oom;
-       atomic_t        oom_wakeups;
-
-       int     swappiness;
-       /* OOM-Killer disable */
-       int             oom_kill_disable;
-
-       /* protect arrays of thresholds */
-       struct mutex thresholds_lock;
-
-       /* thresholds for memory usage. RCU-protected */
-       struct mem_cgroup_thresholds thresholds;
-
-       /* thresholds for mem+swap usage. RCU-protected */
-       struct mem_cgroup_thresholds memsw_thresholds;
-
-       /* For oom notifier event fd */
-       struct list_head oom_notify;
-
-       /*
-        * Should we move charges of a task when a task is moved into this
-        * mem_cgroup ? And what type of charges should we move ?
-        */
-       unsigned long move_charge_at_immigrate;
-       /*
-        * set > 0 if pages under this cgroup are moving to other cgroup.
-        */
-       atomic_t                moving_account;
-       /* taken only while moving_account > 0 */
-       spinlock_t              move_lock;
-       struct task_struct      *move_lock_task;
-       unsigned long           move_lock_flags;
-       /*
-        * percpu counter.
-        */
-       struct mem_cgroup_stat_cpu __percpu *stat;
-       /*
-        * used when a cpu is offlined or other synchronizations
-        * See mem_cgroup_read_stat().
-        */
-       struct mem_cgroup_stat_cpu nocpu_base;
-       spinlock_t pcp_counter_lock;
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
-       struct cg_proto tcp_mem;
-#endif
-#if defined(CONFIG_MEMCG_KMEM)
-        /* Index in the kmem_cache->memcg_params.memcg_caches array */
-       int kmemcg_id;
-       bool kmem_acct_activated;
-       bool kmem_acct_active;
-#endif
-
-       int last_scanned_node;
-#if MAX_NUMNODES > 1
-       nodemask_t      scan_nodes;
-       atomic_t        numainfo_events;
-       atomic_t        numainfo_updating;
-#endif
-
-       /* List of events which userspace want to receive */
-       struct list_head event_list;
-       spinlock_t event_list_lock;
-
-       struct mem_cgroup_per_node *nodeinfo[0];
-       /* WARNING: nodeinfo must be the last member here */
-};
-
-#ifdef CONFIG_MEMCG_KMEM
-bool memcg_kmem_is_active(struct mem_cgroup *memcg)
-{
-       return memcg->kmem_acct_active;
-}
-#endif
-
 /* Stuffs for move charges at task migration. */
 /*
  * Types of charges to be moved.
@@ -424,11 +248,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
-{
-       return s ? container_of(s, struct mem_cgroup, css) : NULL;
-}
-
 /* Some nice accessors for the vmpressure. */
 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 {
@@ -500,8 +319,7 @@ void sock_update_memcg(struct sock *sk)
                rcu_read_lock();
                memcg = mem_cgroup_from_task(current);
                cg_proto = sk->sk_prot->proto_cgroup(memcg);
-               if (!mem_cgroup_is_root(memcg) &&
-                   memcg_proto_active(cg_proto) &&
+               if (cg_proto && test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags) &&
                    css_tryget_online(&memcg->css)) {
                        sk->sk_cgrp = cg_proto;
                }
@@ -594,11 +412,67 @@ mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
        return &memcg->nodeinfo[nid]->zoneinfo[zid];
 }
 
-struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
+/**
+ * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * @page: page of interest
+ *
+ * If memcg is bound to the default hierarchy, css of the memcg associated
+ * with @page is returned.  The returned css remains associated with @page
+ * until it is released.
+ *
+ * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+ * is returned.
+ *
+ * XXX: The above description of behavior on the default hierarchy isn't
+ * strictly true yet as replace_page_cache_page() can modify the
+ * association before @page is released even on the default hierarchy;
+ * however, the current and planned usages don't mix the two functions
+ * and replace_page_cache_page() will soon be updated to make the invariant
+ * actually true.
+ */
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 {
+       struct mem_cgroup *memcg;
+
+       rcu_read_lock();
+
+       memcg = page->mem_cgroup;
+
+       if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+               memcg = root_mem_cgroup;
+
+       rcu_read_unlock();
        return &memcg->css;
 }
 
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the closest online ancestor of the memory cgroup @page is charged to
+ * and return its inode number or 0 if @page is not charged to any cgroup. It
+ * is safe to call this function without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it only should be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+       struct mem_cgroup *memcg;
+       unsigned long ino = 0;
+
+       rcu_read_lock();
+       memcg = READ_ONCE(page->mem_cgroup);
+       while (memcg && !(memcg->css.flags & CSS_ONLINE))
+               memcg = parent_mem_cgroup(memcg);
+       if (memcg)
+               ino = cgroup_ino(memcg->css.cgroup);
+       rcu_read_unlock();
+       return ino;
+}
+
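
page_cgroup_ino() above resolves a page to the inode number of the closest online ancestor memcg; in mainline it was introduced for procfs consumers, most visibly /proc/kpagecgroup, which exposes one 64-bit cgroup inode per page frame. A hedged sketch of querying it for a given PFN, assuming CONFIG_MEMCG and that the companion kpagecgroup interface from the same series is present; reading it requires root, and 0 means the page is not charged to any cgroup.

/* Sketch: map a page frame number to the inode of the memcg it is
 * charged to via /proc/kpagecgroup (one u64 per PFN).  Availability of
 * the file is an assumption about the rest of the patch series.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>

int main(int argc, char **argv)
{
	uint64_t pfn, ino;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
		return 1;
	}
	pfn = strtoull(argv[1], NULL, 0);

	fd = open("/proc/kpagecgroup", O_RDONLY);
	if (fd < 0) {
		perror("/proc/kpagecgroup");
		return 1;
	}
	/* The file is indexed by PFN, 8 bytes per entry. */
	if (pread(fd, &ino, sizeof(ino), pfn * sizeof(ino)) !=
	    (ssize_t)sizeof(ino)) {
		perror("pread");
		close(fd);
		return 1;
	}
	printf("pfn %llu -> memcg inode %llu\n",
	       (unsigned long long)pfn, (unsigned long long)ino);
	close(fd);
	return 0;
}
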
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
@@ -774,12 +648,14 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 }
 
 /*
+ * Return page count for single (non recursive) @memcg.
+ *
  * Implementation Note: reading percpu statistics for memcg.
  *
  * Both of vmstat[] and percpu_counter has threshold and do periodic
  * synchronization to implement "quick" read. There are trade-off between
  * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronizion of counter in memcg's counter.
+ * a periodic synchronization of counter in memcg's counter.
  *
  * But this _read() function is used for user interface now. The user accounts
  * memory usage by memory cgroup and he _always_ requires exact value because
@@ -789,24 +665,24 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  *
  * If there are kernel internal actions which can make use of some not-exact
  * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threashold and synchonization as vmstat[] should be
+ * common workload, threshold and synchronization as vmstat[] should be
  * implemented.
  */
-static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
-                                enum mem_cgroup_stat_index idx)
+static unsigned long
+mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
 {
        long val = 0;
        int cpu;
 
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       /* Per-cpu values can be negative, use a signed accumulator */
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.count[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
-#endif
-       put_online_cpus();
+       /*
+        * Summing races with updates, so val may be negative.  Avoid exposing
+        * transient negative values.
+        */
+       if (val < 0)
+               val = 0;
        return val;
 }
 
@@ -816,15 +692,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        unsigned long val = 0;
        int cpu;
 
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.events[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
-#endif
-       put_online_cpus();
        return val;
 }
 
@@ -858,14 +727,6 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
        __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 }
 
-unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
-{
-       struct mem_cgroup_per_zone *mz;
-
-       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
-       return mz->lru_size[lru];
-}
-
 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
                                                  int nid,
                                                  unsigned int lru_mask)
@@ -968,6 +829,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 
        return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 }
+EXPORT_SYMBOL(mem_cgroup_from_task);
 
 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
@@ -1013,7 +875,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                   struct mem_cgroup *prev,
                                   struct mem_cgroup_reclaim_cookie *reclaim)
 {
-       struct reclaim_iter *uninitialized_var(iter);
+       struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
        struct cgroup_subsys_state *css = NULL;
        struct mem_cgroup *memcg = NULL;
        struct mem_cgroup *pos = NULL;
@@ -1044,14 +906,20 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                if (prev && reclaim->generation != iter->generation)
                        goto out_unlock;
 
-               do {
+               while (1) {
                        pos = READ_ONCE(iter->position);
+                       if (!pos || css_tryget(&pos->css))
+                               break;
                        /*
-                        * A racing update may change the position and
-                        * put the last reference, hence css_tryget(),
-                        * or retry to see the updated position.
+                        * css reference reached zero, so iter->position will
+                        * be cleared by ->css_released. However, we should not
+                        * rely on this happening soon, because ->css_released
+                        * is called from a work queue, and by busy-waiting we
+                        * might block it. So we clear iter->position right
+                        * away.
                         */
-               } while (pos && !css_tryget(&pos->css));
+                       (void)cmpxchg(&iter->position, pos, NULL);
+               }
        }
 
        if (pos)
@@ -1097,17 +965,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
        }
 
        if (reclaim) {
-               if (cmpxchg(&iter->position, pos, memcg) == pos) {
-                       if (memcg)
-                               css_get(&memcg->css);
-                       if (pos)
-                               css_put(&pos->css);
-               }
-
                /*
-                * pairs with css_tryget when dereferencing iter->position
-                * above.
+                * The position could have already been updated by a competing
+                * thread, so check that the value hasn't changed since we read
+                * it to avoid reclaiming from the same cgroup twice.
                 */
+               (void)cmpxchg(&iter->position, pos, memcg);
+
                if (pos)
                        css_put(&pos->css);
 
@@ -1140,6 +1004,28 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
                css_put(&prev->css);
 }
 
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+       struct mem_cgroup *memcg = dead_memcg;
+       struct mem_cgroup_reclaim_iter *iter;
+       struct mem_cgroup_per_zone *mz;
+       int nid, zid;
+       int i;
+
+       while ((memcg = parent_mem_cgroup(memcg))) {
+               for_each_node(nid) {
+                       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+                               mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
+                               for (i = 0; i <= DEF_PRIORITY; i++) {
+                                       iter = &mz->iter[i];
+                                       cmpxchg(&iter->position,
+                                               dead_memcg, NULL);
+                               }
+                       }
+               }
+       }
+}
+
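
invalidate_reclaim_iterators() and the retry loop in mem_cgroup_iter() both treat iter->position as a weak cache: whoever notices that the cached cgroup is dying clears the pointer with cmpxchg(), and a racing update that has already replaced it is left alone. A small stand-alone sketch of that pattern, using GCC/Clang __atomic builtins in place of the kernel's cmpxchg(); the struct and function names are illustrative, not from the patch.

/* Clear a cached pointer only if it still refers to the object being
 * torn down, leaving concurrent updates that already replaced it
 * untouched.
 */
#include <stdio.h>

struct obj {
	int id;
};

struct cache {
	struct obj *position;	/* plays the role of iter->position */
};

static void cache_invalidate(struct cache *c, struct obj *dead)
{
	struct obj *expected = dead;

	/* Equivalent of cmpxchg(&c->position, dead, NULL). */
	__atomic_compare_exchange_n(&c->position, &expected,
				    (struct obj *)NULL, 0,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	struct obj a = { .id = 1 }, b = { .id = 2 };
	struct cache c = { .position = &a };

	cache_invalidate(&c, &a);		/* matches: cleared */
	printf("after invalidating a: %p\n", (void *)c.position);

	c.position = &b;			/* someone moved on to b */
	cache_invalidate(&c, &a);		/* stale: left alone */
	printf("still cached: id %d\n", c.position->id);
	return 0;
}
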
 /*
  * Iteration constructs for visiting all cgroups (under a tree).  If
  * loops are exited prematurely (break), mem_cgroup_iter_break() must
@@ -1155,30 +1041,6 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
             iter != NULL;                              \
             iter = mem_cgroup_iter(NULL, iter, NULL))
 
-void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
-{
-       struct mem_cgroup *memcg;
-
-       rcu_read_lock();
-       memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-       if (unlikely(!memcg))
-               goto out;
-
-       switch (idx) {
-       case PGFAULT:
-               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
-               break;
-       case PGMAJFAULT:
-               this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
-               break;
-       default:
-               BUG();
-       }
-out:
-       rcu_read_unlock();
-}
-EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
-
 /**
  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
  * @zone: zone of the wanted lruvec
@@ -1277,15 +1139,6 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
        VM_BUG_ON((long)(*lru_size) < 0);
 }
 
-bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
-{
-       if (root == memcg)
-               return true;
-       if (!root->use_hierarchy)
-               return false;
-       return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
-}
-
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
 {
        struct mem_cgroup *task_memcg;
@@ -1312,39 +1165,6 @@ bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
        return ret;
 }
 
-int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
-       unsigned long inactive_ratio;
-       unsigned long inactive;
-       unsigned long active;
-       unsigned long gb;
-
-       inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
-       active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
-       gb = (inactive + active) >> (30 - PAGE_SHIFT);
-       if (gb)
-               inactive_ratio = int_sqrt(10 * gb);
-       else
-               inactive_ratio = 1;
-
-       return inactive * inactive_ratio < active;
-}
-
-bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
-{
-       struct mem_cgroup_per_zone *mz;
-       struct mem_cgroup *memcg;
-
-       if (mem_cgroup_disabled())
-               return true;
-
-       mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
-       memcg = mz->memcg;
-
-       return !!(memcg->css.flags & CSS_ONLINE);
-}
-
 #define mem_cgroup_from_counter(counter, member)       \
        container_of(counter, struct mem_cgroup, member)
 
@@ -1376,15 +1196,6 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
        return margin;
 }
 
-int mem_cgroup_swappiness(struct mem_cgroup *memcg)
-{
-       /* root ? */
-       if (mem_cgroup_disabled() || !memcg->css.parent)
-               return vm_swappiness;
-
-       return memcg->swappiness;
-}
-
 /*
  * A routine for checking "mem" is under move_account() or not.
  *
@@ -1480,7 +1291,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
                for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                        if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                                continue;
-                       pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
+                       pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
                                K(mem_cgroup_read_stat(iter, i)));
                }
 
@@ -1527,23 +1338,31 @@ static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                     int order)
 {
+       struct oom_control oc = {
+               .zonelist = NULL,
+               .nodemask = NULL,
+               .gfp_mask = gfp_mask,
+               .order = order,
+       };
        struct mem_cgroup *iter;
        unsigned long chosen_points = 0;
        unsigned long totalpages;
        unsigned int points = 0;
        struct task_struct *chosen = NULL;
 
+       mutex_lock(&oom_lock);
+
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
        if (fatal_signal_pending(current) || task_will_free_mem(current)) {
-               mark_tsk_oom_victim(current);
-               return;
+               mark_oom_victim(current);
+               goto unlock;
        }
 
-       check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
+       check_panic_on_oom(&oc, CONSTRAINT_MEMCG, memcg);
        totalpages = mem_cgroup_get_limit(memcg) ? : 1;
        for_each_mem_cgroup_tree(iter, memcg) {
                struct css_task_iter it;
@@ -1551,8 +1370,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
                css_task_iter_start(&iter->css, &it);
                while ((task = css_task_iter_next(&it))) {
-                       switch (oom_scan_process_thread(task, totalpages, NULL,
-                                                       false)) {
+                       switch (oom_scan_process_thread(&oc, task, totalpages)) {
                        case OOM_SCAN_SELECT:
                                if (chosen)
                                        put_task_struct(chosen);
@@ -1567,7 +1385,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                mem_cgroup_iter_break(memcg, iter);
                                if (chosen)
                                        put_task_struct(chosen);
-                               return;
+                               goto unlock;
                        case OOM_SCAN_OK:
                                break;
                        };
@@ -1588,11 +1406,13 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                css_task_iter_end(&it);
        }
 
-       if (!chosen)
-               return;
-       points = chosen_points * 1000 / totalpages;
-       oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
-                        NULL, "Memory cgroup out of memory");
+       if (chosen) {
+               points = chosen_points * 1000 / totalpages;
+               oom_kill_process(&oc, chosen, points, totalpages, memcg,
+                                "Memory cgroup out of memory");
+       }
+unlock:
+       mutex_unlock(&oom_lock);
 }
 
 #if MAX_NUMNODES > 1
@@ -1809,8 +1629,10 @@ static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
 {
        struct mem_cgroup *iter;
 
+       spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
-               atomic_inc(&iter->under_oom);
+               iter->under_oom++;
+       spin_unlock(&memcg_oom_lock);
 }
 
 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
@@ -1819,11 +1641,13 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
 
        /*
         * When a new child is created while the hierarchy is under oom,
-        * mem_cgroup_oom_lock() may not be called. We have to use
-        * atomic_add_unless() here.
+        * mem_cgroup_oom_lock() may not be called. Watch for underflow.
         */
+       spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
-               atomic_add_unless(&iter->under_oom, -1, 0);
+               if (iter->under_oom > 0)
+                       iter->under_oom--;
+       spin_unlock(&memcg_oom_lock);
 }
 
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1849,22 +1673,23 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
        return autoremove_wake_function(wait, mode, sync, arg);
 }
 
-static void memcg_wakeup_oom(struct mem_cgroup *memcg)
-{
-       atomic_inc(&memcg->oom_wakeups);
-       /* for filtering, pass "memcg" as argument. */
-       __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
-}
-
 static void memcg_oom_recover(struct mem_cgroup *memcg)
 {
-       if (memcg && atomic_read(&memcg->under_oom))
-               memcg_wakeup_oom(memcg);
+       /*
+        * For the following lockless ->under_oom test, the only required
+        * guarantee is that it must see the state asserted by an OOM when
+        * this function is called as a result of userland actions
+        * triggered by the notification of the OOM.  This is trivially
+        * achieved by invoking mem_cgroup_mark_under_oom() before
+        * triggering notification.
+        */
+       if (memcg && memcg->under_oom)
+               __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
 
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-       if (!current->memcg_oom.may_oom)
+       if (!current->memcg_may_oom)
                return;
        /*
         * We are in the middle of the charge context here, so we
@@ -1881,9 +1706,9 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
         * and when we know whether the fault was overall successful.
         */
        css_get(&memcg->css);
-       current->memcg_oom.memcg = memcg;
-       current->memcg_oom.gfp_mask = mask;
-       current->memcg_oom.order = order;
+       current->memcg_in_oom = memcg;
+       current->memcg_oom_gfp_mask = mask;
+       current->memcg_oom_order = order;
 }
 
 /**
@@ -1905,7 +1730,7 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
  */
 bool mem_cgroup_oom_synchronize(bool handle)
 {
-       struct mem_cgroup *memcg = current->memcg_oom.memcg;
+       struct mem_cgroup *memcg = current->memcg_in_oom;
        struct oom_wait_info owait;
        bool locked;
 
@@ -1933,8 +1758,8 @@ bool mem_cgroup_oom_synchronize(bool handle)
        if (locked && !memcg->oom_kill_disable) {
                mem_cgroup_unmark_under_oom(memcg);
                finish_wait(&memcg_oom_waitq, &owait.wait);
-               mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
-                                        current->memcg_oom.order);
+               mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
+                                        current->memcg_oom_order);
        } else {
                schedule();
                mem_cgroup_unmark_under_oom(memcg);
@@ -1951,7 +1776,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
                memcg_oom_recover(memcg);
        }
 cleanup:
-       current->memcg_oom.memcg = NULL;
+       current->memcg_in_oom = NULL;
        css_put(&memcg->css);
        return true;
 }
@@ -2014,6 +1839,7 @@ again:
 
        return memcg;
 }
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
 
 /**
  * mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2032,23 +1858,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
        rcu_read_unlock();
 }
-
-/**
- * mem_cgroup_update_page_stat - update page state statistics
- * @memcg: memcg to account against
- * @idx: page state item to account
- * @val: number of pages (positive or negative)
- *
- * See mem_cgroup_begin_page_stat() for locking requirements.
- */
-void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
-                                enum mem_cgroup_stat_index idx, int val)
-{
-       VM_BUG_ON(!rcu_read_lock_held());
-
-       if (memcg)
-               this_cpu_add(memcg->stat->count[idx], val);
-}
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
 
 /*
  * size of first charge trial. "32" comes from vmscan.c's magic value.
@@ -2175,37 +1985,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
        mutex_unlock(&percpu_charge_mutex);
 }
 
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
-       int i;
-
-       spin_lock(&memcg->pcp_counter_lock);
-       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-               long x = per_cpu(memcg->stat->count[i], cpu);
-
-               per_cpu(memcg->stat->count[i], cpu) = 0;
-               memcg->nocpu_base.count[i] += x;
-       }
-       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-               unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
-               per_cpu(memcg->stat->events[i], cpu) = 0;
-               memcg->nocpu_base.events[i] += x;
-       }
-       spin_unlock(&memcg->pcp_counter_lock);
-}
-
 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
                                        unsigned long action,
                                        void *hcpu)
 {
        int cpu = (unsigned long)hcpu;
        struct memcg_stock_pcp *stock;
-       struct mem_cgroup *iter;
 
        if (action == CPU_ONLINE)
                return NOTIFY_OK;
@@ -2213,14 +1998,36 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;
 
-       for_each_mem_cgroup(iter)
-               mem_cgroup_drain_pcp_counter(iter, cpu);
-
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
        return NOTIFY_OK;
 }
 
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+       unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       struct mem_cgroup *memcg, *pos;
+
+       if (likely(!nr_pages))
+               return;
+
+       pos = memcg = get_mem_cgroup_from_mm(current->mm);
+
+       do {
+               if (page_counter_read(&pos->memory) <= pos->high)
+                       continue;
+               mem_cgroup_events(pos, MEMCG_HIGH, 1);
+               try_to_free_mem_cgroup_pages(pos, nr_pages, GFP_KERNEL, true);
+       } while ((pos = parent_mem_cgroup(pos)));
+
+       css_put(&memcg->css);
+       current->memcg_nr_pages_over_high = 0;
+}
+
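
mem_cgroup_handle_over_high() is the deferred half of the high-limit enforcement: try_charge() further down only records the overage in current->memcg_nr_pages_over_high and calls set_notify_resume(), which is why <linux/tracehook.h> is now included at the top of the file, and the task then reclaims on its way back to user space. Below is a user-space analogy of that "record the debt, do the work at a safe checkpoint" pattern; all names are illustrative, not from the kernel.

/* Hot paths only note how much work is owed and raise a flag; the work
 * itself runs at a well-defined checkpoint where blocking is safe.
 */
#include <stdio.h>

static __thread unsigned int pages_over_high;	/* cf. memcg_nr_pages_over_high */
static __thread int notify_pending;		/* cf. TIF_NOTIFY_RESUME */

static void charge_fast_path(unsigned int nr_pages)
{
	/* Deep in a hot path: don't reclaim here, just record the debt. */
	pages_over_high += nr_pages;
	notify_pending = 1;
}

static void reclaim(unsigned int nr_pages)
{
	printf("reclaiming %u pages at a safe point\n", nr_pages);
}

static void return_to_user_checkpoint(void)
{
	/* Analogue of the resume-notification path invoking
	 * mem_cgroup_handle_over_high(). */
	if (!notify_pending)
		return;
	notify_pending = 0;
	if (pages_over_high) {
		reclaim(pages_over_high);
		pages_over_high = 0;
	}
}

int main(void)
{
	charge_fast_path(32);
	charge_fast_path(32);
	return_to_user_checkpoint();	/* all deferred work runs once */
	return 0;
}
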
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                      unsigned int nr_pages)
 {
@@ -2231,17 +2038,16 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
        unsigned long nr_reclaimed;
        bool may_swap = true;
        bool drained = false;
-       int ret = 0;
 
        if (mem_cgroup_is_root(memcg))
-               goto done;
+               return 0;
 retry:
        if (consume_stock(memcg, nr_pages))
-               goto done;
+               return 0;
 
        if (!do_swap_account ||
-           !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
-               if (!page_counter_try_charge(&memcg->memory, batch, &counter))
+           page_counter_try_charge(&memcg->memsw, batch, &counter)) {
+               if (page_counter_try_charge(&memcg->memory, batch, &counter))
                        goto done_restock;
                if (do_swap_account)
                        page_counter_uncharge(&memcg->memsw, batch);
@@ -2265,12 +2071,12 @@ retry:
        if (unlikely(test_thread_flag(TIF_MEMDIE) ||
                     fatal_signal_pending(current) ||
                     current->flags & PF_EXITING))
-               goto bypass;
+               goto force;
 
        if (unlikely(task_in_memcg_oom(current)))
                goto nomem;
 
-       if (!(gfp_mask & __GFP_WAIT))
+       if (!gfpflags_allow_blocking(gfp_mask))
                goto nomem;
 
        mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
@@ -2311,38 +2117,54 @@ retry:
                goto retry;
 
        if (gfp_mask & __GFP_NOFAIL)
-               goto bypass;
+               goto force;
 
        if (fatal_signal_pending(current))
-               goto bypass;
+               goto force;
 
        mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
 
-       mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
+       mem_cgroup_oom(mem_over_limit, gfp_mask,
+                      get_order(nr_pages * PAGE_SIZE));
 nomem:
        if (!(gfp_mask & __GFP_NOFAIL))
                return -ENOMEM;
-bypass:
-       return -EINTR;
+force:
+       /*
+        * The allocation either can't fail or will lead to more memory
+        * being freed very soon.  Allow memory usage to go over the limit
+        * temporarily by force charging it.
+        */
+       page_counter_charge(&memcg->memory, nr_pages);
+       if (do_swap_account)
+               page_counter_charge(&memcg->memsw, nr_pages);
+       css_get_many(&memcg->css, nr_pages);
+
+       return 0;
 
 done_restock:
        css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
-       if (!(gfp_mask & __GFP_WAIT))
-               goto done;
+
        /*
-        * If the hierarchy is above the normal consumption range,
-        * make the charging task trim their excess contribution.
+        * If the hierarchy is above the normal consumption range, schedule
+        * reclaim on returning to userland.  We can perform reclaim here
+        * if __GFP_RECLAIM but let's always punt for simplicity and so that
+        * GFP_KERNEL can consistently be used during reclaim.  @memcg is
+        * not recorded as it most likely matches current's and won't
+        * change in the meantime.  As high limit is checked again before
+        * reclaim, the cost of mismatch is negligible.
         */
        do {
-               if (page_counter_read(&memcg->memory) <= memcg->high)
-                       continue;
-               mem_cgroup_events(memcg, MEMCG_HIGH, 1);
-               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+               if (page_counter_read(&memcg->memory) > memcg->high) {
+                       current->memcg_nr_pages_over_high += batch;
+                       set_notify_resume(current);
+                       break;
+               }
        } while ((memcg = parent_mem_cgroup(memcg)));
-done:
-       return ret;
+
+       return 0;
 }
 
 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
@@ -2357,40 +2179,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
        css_put_many(&memcg->css, nr_pages);
 }
 
-/*
- * try_get_mem_cgroup_from_page - look up page's memcg association
- * @page: the page
- *
- * Look up, get a css reference, and return the memcg that owns @page.
- *
- * The page must be locked to prevent racing with swap-in and page
- * cache charges.  If coming from an unlocked page table, the caller
- * must ensure the page is on the LRU or this can race with charging.
- */
-struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
-{
-       struct mem_cgroup *memcg;
-       unsigned short id;
-       swp_entry_t ent;
-
-       VM_BUG_ON_PAGE(!PageLocked(page), page);
-
-       memcg = page->mem_cgroup;
-       if (memcg) {
-               if (!css_tryget_online(&memcg->css))
-                       memcg = NULL;
-       } else if (PageSwapCache(page)) {
-               ent.val = page_private(page);
-               id = lookup_swap_cgroup_id(ent);
-               rcu_read_lock();
-               memcg = mem_cgroup_from_id(id);
-               if (memcg && !css_tryget_online(&memcg->css))
-                       memcg = NULL;
-               rcu_read_unlock();
-       }
-       return memcg;
-}
-
 static void lock_page_lru(struct page *page, int *isolated)
 {
        struct zone *zone = page_zone(page);
@@ -2457,65 +2245,6 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
-                     unsigned long nr_pages)
-{
-       struct page_counter *counter;
-       int ret = 0;
-
-       ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
-       if (ret < 0)
-               return ret;
-
-       ret = try_charge(memcg, gfp, nr_pages);
-       if (ret == -EINTR)  {
-               /*
-                * try_charge() chose to bypass to root due to OOM kill or
-                * fatal signal.  Since our only options are to either fail
-                * the allocation or charge it to this cgroup, do it as a
-                * temporary condition. But we can't fail. From a kmem/slab
-                * perspective, the cache has already been selected, by
-                * mem_cgroup_kmem_get_cache(), so it is too late to change
-                * our minds.
-                *
-                * This condition will only trigger if the task entered
-                * memcg_charge_kmem in a sane state, but was OOM-killed
-                * during try_charge() above. Tasks that were already dying
-                * when the allocation triggers should have been already
-                * directed to the root cgroup in memcontrol.h
-                */
-               page_counter_charge(&memcg->memory, nr_pages);
-               if (do_swap_account)
-                       page_counter_charge(&memcg->memsw, nr_pages);
-               css_get_many(&memcg->css, nr_pages);
-               ret = 0;
-       } else if (ret)
-               page_counter_uncharge(&memcg->kmem, nr_pages);
-
-       return ret;
-}
-
-void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
-{
-       page_counter_uncharge(&memcg->memory, nr_pages);
-       if (do_swap_account)
-               page_counter_uncharge(&memcg->memsw, nr_pages);
-
-       page_counter_uncharge(&memcg->kmem, nr_pages);
-
-       css_put_many(&memcg->css, nr_pages);
-}
-
-/*
- * helper for acessing a memcg's index. It will be used as an index in the
- * child cache array in kmem_cache, and also to derive its name. This function
- * will return -1 when this is not a kmem-limited memcg.
- */
-int memcg_cache_id(struct mem_cgroup *memcg)
-{
-       return memcg ? memcg->kmemcg_id : -1;
-}
-
 static int memcg_alloc_cache_id(void)
 {
        int id, size;
@@ -2677,85 +2406,58 @@ void __memcg_kmem_put_cache(struct kmem_cache *cachep)
                css_put(&cachep->memcg_params.memcg->css);
 }
 
-/*
- * We need to verify if the allocation against current->mm->owner's memcg is
- * possible for the given order. But the page is not allocated yet, so we'll
- * need a further commit step to do the final arrangements.
- *
- * It is possible for the task to switch cgroups in this mean time, so at
- * commit time, we can't rely on task conversion any longer.  We'll then use
- * the handle argument to return to the caller which cgroup we should commit
- * against. We could also return the memcg directly and avoid the pointer
- * passing, but a boolean return value gives better semantics considering
- * the compiled-out case as well.
- *
- * Returning true means the allocation is possible.
- */
-bool
-__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+                             struct mem_cgroup *memcg)
 {
-       struct mem_cgroup *memcg;
+       unsigned int nr_pages = 1 << order;
+       struct page_counter *counter;
        int ret;
 
-       *_memcg = NULL;
+       if (!memcg_kmem_is_active(memcg))
+               return 0;
 
-       memcg = get_mem_cgroup_from_mm(current->mm);
+       if (!page_counter_try_charge(&memcg->kmem, nr_pages, &counter))
+               return -ENOMEM;
 
-       if (!memcg_kmem_is_active(memcg)) {
-               css_put(&memcg->css);
-               return true;
+       ret = try_charge(memcg, gfp, nr_pages);
+       if (ret) {
+               page_counter_uncharge(&memcg->kmem, nr_pages);
+               return ret;
        }
 
-       ret = memcg_charge_kmem(memcg, gfp, 1 << order);
-       if (!ret)
-               *_memcg = memcg;
+       page->mem_cgroup = memcg;
 
-       css_put(&memcg->css);
-       return (ret == 0);
+       return 0;
 }
 
-void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
-                             int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 {
-       VM_BUG_ON(mem_cgroup_is_root(memcg));
+       struct mem_cgroup *memcg;
+       int ret;
 
-       /* The page allocation failed. Revert */
-       if (!page) {
-               memcg_uncharge_kmem(memcg, 1 << order);
-               return;
-       }
-       page->mem_cgroup = memcg;
+       memcg = get_mem_cgroup_from_mm(current->mm);
+       ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+       css_put(&memcg->css);
+       return ret;
 }
 
-void __memcg_kmem_uncharge_pages(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
 {
        struct mem_cgroup *memcg = page->mem_cgroup;
+       unsigned int nr_pages = 1 << order;
 
        if (!memcg)
                return;
 
        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
 
-       memcg_uncharge_kmem(memcg, 1 << order);
-       page->mem_cgroup = NULL;
-}
-
-struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
-{
-       struct mem_cgroup *memcg = NULL;
-       struct kmem_cache *cachep;
-       struct page *page;
-
-       page = virt_to_head_page(ptr);
-       if (PageSlab(page)) {
-               cachep = page->slab_cache;
-               if (!is_root_cache(cachep))
-                       memcg = cachep->memcg_params.memcg;
-       } else
-               /* page allocated by alloc_kmem_pages */
-               memcg = page->mem_cgroup;
+       page_counter_uncharge(&memcg->kmem, nr_pages);
+       page_counter_uncharge(&memcg->memory, nr_pages);
+       if (do_swap_account)
+               page_counter_uncharge(&memcg->memsw, nr_pages);
 
-       return memcg;
+       page->mem_cgroup = NULL;
+       css_put_many(&memcg->css, nr_pages);
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
@@ -3121,20 +2823,17 @@ static unsigned long tree_stat(struct mem_cgroup *memcg,
                               enum mem_cgroup_stat_index idx)
 {
        struct mem_cgroup *iter;
-       long val = 0;
+       unsigned long val = 0;
 
-       /* Per-cpu values can be negative, use a signed accumulator */
        for_each_mem_cgroup_tree(iter, memcg)
                val += mem_cgroup_read_stat(iter, idx);
 
-       if (val < 0) /* race ? */
-               val = 0;
        return val;
 }
 
-static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
+static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
-       u64 val;
+       unsigned long val;
 
        if (mem_cgroup_is_root(memcg)) {
                val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
@@ -3147,7 +2846,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
                else
                        val = page_counter_read(&memcg->memsw);
        }
-       return val << PAGE_SHIFT;
+       return val;
 }
 
 enum {
@@ -3181,9 +2880,9 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
        switch (MEMFILE_ATTR(cft->private)) {
        case RES_USAGE:
                if (counter == &memcg->memory)
-                       return mem_cgroup_usage(memcg, false);
+                       return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
                if (counter == &memcg->memsw)
-                       return mem_cgroup_usage(memcg, true);
+                       return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
                return (u64)page_counter_read(counter) * PAGE_SIZE;
        case RES_LIMIT:
                return (u64)counter->limit * PAGE_SIZE;
@@ -3222,7 +2921,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
         * of course permitted.
         */
        mutex_lock(&memcg_create_mutex);
-       if (cgroup_has_tasks(memcg->css.cgroup) ||
+       if (cgroup_is_populated(memcg->css.cgroup) ||
            (memcg->use_hierarchy && memcg_has_children(memcg)))
                err = -EBUSY;
        mutex_unlock(&memcg_create_mutex);
@@ -3471,7 +3170,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                        continue;
-               seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
+               seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
                           mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
        }
 
@@ -3496,13 +3195,13 @@ static int memcg_stat_show(struct seq_file *m, void *v)
                           (u64)memsw * PAGE_SIZE);
 
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-               long long val = 0;
+               unsigned long long val = 0;
 
                if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                        continue;
                for_each_mem_cgroup_tree(mi, memcg)
                        val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
-               seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
+               seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
        }
 
        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
@@ -3829,16 +3528,17 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
 swap_buffers:
        /* Swap primary and spare array */
        thresholds->spare = thresholds->primary;
-       /* If all events are unregistered, free the spare array */
-       if (!new) {
-               kfree(thresholds->spare);
-               thresholds->spare = NULL;
-       }
 
        rcu_assign_pointer(thresholds->primary, new);
 
        /* To be sure that nobody uses thresholds */
        synchronize_rcu();
+
+       /* If all events are unregistered, free the spare array */
+       if (!new) {
+               kfree(thresholds->spare);
+               thresholds->spare = NULL;
+       }
 unlock:
        mutex_unlock(&memcg->thresholds_lock);
 }
@@ -3870,7 +3570,7 @@ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
        list_add(&event->list, &memcg->oom_notify);
 
        /* already in OOM ? */
-       if (atomic_read(&memcg->under_oom))
+       if (memcg->under_oom)
                eventfd_signal(eventfd, 1);
        spin_unlock(&memcg_oom_lock);
 
@@ -3899,7 +3599,7 @@ static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
 
        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
-       seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
+       seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
        return 0;
 }
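
The OOM hunks here change memcg->under_oom from an atomic counter to a plain integer protected by memcg_oom_lock; user space still observes the state through memory.oom_control and the eventfd notifications registered via cgroup.event_control (the same file whose mode is switched to CFTYPE_WORLD_WRITABLE further down). A hedged sketch of the legacy (cgroup v1) registration flow; the mount point and the group name "demo" are assumptions.

/* Register for memcg OOM notifications on the legacy hierarchy.  The
 * "<event_fd> <oom_control_fd>" string written to cgroup.event_control
 * is the documented v1 interface.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

int main(void)
{
	const char *oom = "/sys/fs/cgroup/memory/demo/memory.oom_control";
	const char *ctl = "/sys/fs/cgroup/memory/demo/cgroup.event_control";
	char buf[64];
	uint64_t count;
	int efd, oom_fd, ctl_fd;

	efd = eventfd(0, 0);
	oom_fd = open(oom, O_RDONLY);
	ctl_fd = open(ctl, O_WRONLY);
	if (efd < 0 || oom_fd < 0 || ctl_fd < 0) {
		perror("setup");
		return 1;
	}

	/* Tie the eventfd to this group's OOM events. */
	snprintf(buf, sizeof(buf), "%d %d", efd, oom_fd);
	if (write(ctl_fd, buf, strlen(buf)) < 0) {
		perror("cgroup.event_control");
		return 1;
	}

	/* Blocks until the group enters its OOM path and
	 * mem_cgroup_oom_notify() signals the eventfd. */
	if (read(efd, &count, sizeof(count)) == (ssize_t)sizeof(count))
		printf("memcg OOM notifications: %llu\n",
		       (unsigned long long)count);
	return 0;
}
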
 
@@ -4001,6 +3701,97 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 }
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+{
+       return &memcg->cgwb_list;
+}
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+       return wb_domain_init(&memcg->cgwb_domain, gfp);
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+       wb_domain_exit(&memcg->cgwb_domain);
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+       wb_domain_size_changed(&memcg->cgwb_domain);
+}
+
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+       if (!memcg->css.parent)
+               return NULL;
+
+       return &memcg->cgwb_domain;
+}
+
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pfilepages: out parameter for number of file pages
+ * @pheadroom: out parameter for number of allocatable pages according to memcg
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of file, headroom, dirty, and writeback pages in
+ * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
+ * is a bit more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
+ * headroom is calculated as the lowest headroom of itself and the
+ * ancestors.  Note that this doesn't consider the actual amount of
+ * available memory in the system.  The caller should further cap
+ * *@pheadroom accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
+                        unsigned long *pheadroom, unsigned long *pdirty,
+                        unsigned long *pwriteback)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       struct mem_cgroup *parent;
+
+       *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+
+       /* this should eventually include NR_UNSTABLE_NFS */
+       *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+       *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+                                                    (1 << LRU_ACTIVE_FILE));
+       *pheadroom = PAGE_COUNTER_MAX;
+
+       while ((parent = parent_mem_cgroup(memcg))) {
+               unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+               unsigned long used = page_counter_read(&memcg->memory);
+
+               *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
+               memcg = parent;
+       }
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+       return 0;
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
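
mem_cgroup_wb_stats() above reduces the writeback headroom to one rule: for the memcg and each non-root ancestor, take min(limit, high) minus current usage, clamped at zero, and keep the smallest value seen. A stand-alone sketch of just that arithmetic; the fixed array stands in for walking parent_mem_cgroup(), and all numbers are made up for illustration.

/* headroom = min over {memcg, non-root ancestors} of
 *            min(limit, high) - used, never going below zero.
 */
#include <stdio.h>

struct level {
	unsigned long limit;	/* memcg->memory.limit, in pages */
	unsigned long high;	/* memcg->high, in pages */
	unsigned long used;	/* page_counter_read(&memcg->memory) */
};

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long wb_headroom(const struct level *chain, int levels)
{
	unsigned long headroom = ~0UL;	/* stand-in for PAGE_COUNTER_MAX */
	int i;

	for (i = 0; i < levels; i++) {
		unsigned long ceiling = min_ul(chain[i].limit, chain[i].high);
		unsigned long used = chain[i].used;

		/* "ceiling - min(ceiling, used)" clamps at zero on overrun. */
		headroom = min_ul(headroom, ceiling - min_ul(ceiling, used));
	}
	return headroom;
}

int main(void)
{
	struct level chain[] = {
		{ .limit = 131072, .high =  98304, .used =  90112 }, /* leaf */
		{ .limit = 262144, .high =   ~0UL, .used = 255000 }, /* parent */
	};

	printf("headroom: %lu pages\n",
	       wb_headroom(chain, (int)(sizeof(chain) / sizeof(chain[0]))));
	return 0;
}
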
 /*
  * DO NOT USE IN NEW FILES.
  *
@@ -4270,8 +4061,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
        {
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
-               .flags = CFTYPE_NO_PREFIX,
-               .mode = S_IWUGO,
+               .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
        },
        {
                .name = "swappiness",
@@ -4385,9 +4175,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
        if (!memcg->stat)
                goto out_free;
-       spin_lock_init(&memcg->pcp_counter_lock);
+
+       if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+               goto out_free_stat;
+
        return memcg;
 
+out_free_stat:
+       free_percpu(memcg->stat);
 out_free:
        kfree(memcg);
        return NULL;
@@ -4414,6 +4209,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
                free_mem_cgroup_per_zone_info(memcg, node);
 
        free_percpu(memcg->stat);
+       memcg_wb_domain_exit(memcg);
        kfree(memcg);
 }
 
@@ -4446,6 +4242,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        /* root ? */
        if (parent_css == NULL) {
                root_mem_cgroup = memcg;
+               mem_cgroup_root_css = &memcg->css;
                page_counter_init(&memcg->memory, NULL);
                memcg->high = PAGE_COUNTER_MAX;
                memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4464,7 +4261,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #ifdef CONFIG_MEMCG_KMEM
        memcg->kmemcg_id = -1;
 #endif
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&memcg->cgwb_list);
+#endif
        return &memcg->css;
 
 free_out:
@@ -4552,6 +4351,15 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        vmpressure_cleanup(&memcg->vmpressure);
 
        memcg_deactivate_kmem(memcg);
+
+       wb_memcg_offline(memcg);
+}
+
+static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+       invalidate_reclaim_iterators(memcg);
 }
 
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4585,6 +4393,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        memcg->low = 0;
        memcg->high = PAGE_COUNTER_MAX;
        memcg->soft_limit = PAGE_COUNTER_MAX;
+       memcg_wb_domain_size_changed(memcg);
 }
 
 #ifdef CONFIG_MMU
@@ -4593,28 +4402,16 @@ static int mem_cgroup_do_precharge(unsigned long count)
 {
        int ret;
 
-       /* Try a single bulk charge without reclaim first */
-       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+       /* Try a single bulk charge without reclaim first, kswapd may wake */
+       ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
        if (!ret) {
                mc.precharge += count;
                return ret;
        }
-       if (ret == -EINTR) {
-               cancel_charge(root_mem_cgroup, count);
-               return ret;
-       }
 
        /* Try charges one by one with reclaim */
        while (count--) {
                ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
-               /*
-                * In case of failure, any residual charges against
-                * mc.to will be dropped by mem_cgroup_clear_mc()
-                * later on.  However, cancel any charges that are
-                * bypassed to root right away or they'll be lost.
-                */
-               if (ret == -EINTR)
-                       cancel_charge(root_mem_cgroup, 1);
                if (ret)
                        return ret;
                mc.precharge++;
@@ -4754,6 +4551,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
        unsigned long flags;
        int ret;
+       bool anon;
 
        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -4768,9 +4566,8 @@ static int mem_cgroup_move_account(struct page *page,
                goto out;
 
        /*
-        * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
-        * of its source page while we change it: page migration takes
-        * both pages off the LRU, but page cache replacement doesn't.
+        * Prevent mem_cgroup_replace_page() from looking at
+        * page->mem_cgroup of its source page while we change it.
         */
        if (!trylock_page(page))
                goto out;
@@ -4779,15 +4576,33 @@ static int mem_cgroup_move_account(struct page *page,
        if (page->mem_cgroup != from)
                goto out_unlock;
 
+       anon = PageAnon(page);
+
        spin_lock_irqsave(&from->move_lock, flags);
 
-       if (!PageAnon(page) && page_mapped(page)) {
+       if (!anon && page_mapped(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
                __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
        }
 
+       /*
+        * move_lock grabbed above and caller set from->moving_account, so
+        * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+        * So mapping should be stable for dirty pages.
+        */
+       if (!anon && PageDirty(page)) {
+               struct address_space *mapping = page_mapping(page);
+
+               if (mapping_cap_account_dirty(mapping)) {
+                       __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+                       __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+               }
+       }
+
        if (PageWriteback(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
                               nr_pages);
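
The hunk above adds MEM_CGROUP_STAT_DIRTY to the counters that follow a page when its charge moves between groups: for each page state that applies, the pages are subtracted from the source and added to the destination while the source's move_lock is held. The following stand-alone sketch shows just that bookkeeping shape; fake_memcg and the pthread mutex standing in for move_lock are assumptions made for the sake of a runnable example.

/* Hypothetical stand-alone illustration -- not kernel code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_memcg {
        pthread_mutex_t move_lock;      /* stands in for memcg->move_lock */
        long nr_file_mapped;
        long nr_dirty;
        long nr_writeback;
};

/* Mirror of the move_account bookkeeping above: for every state the
 * page is in, subtract its pages from @from and add them to @to while
 * holding the source's lock, so concurrent updaters see one owner. */
static void move_stats(struct fake_memcg *from, struct fake_memcg *to,
                       long nr_pages, bool mapped, bool dirty, bool writeback)
{
        pthread_mutex_lock(&from->move_lock);
        if (mapped) {
                from->nr_file_mapped -= nr_pages;
                to->nr_file_mapped += nr_pages;
        }
        if (dirty) {
                from->nr_dirty -= nr_pages;
                to->nr_dirty += nr_pages;
        }
        if (writeback) {
                from->nr_writeback -= nr_pages;
                to->nr_writeback += nr_pages;
        }
        pthread_mutex_unlock(&from->move_lock);
}

int main(void)
{
        struct fake_memcg a = { PTHREAD_MUTEX_INITIALIZER, 4, 2, 1 };
        struct fake_memcg b = { PTHREAD_MUTEX_INITIALIZER, 0, 0, 0 };

        move_stats(&a, &b, 1, true, true, false);
        printf("a: dirty=%ld  b: dirty=%ld\n", a.nr_dirty, b.nr_dirty);
        return 0;
}
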
@@ -5002,13 +4817,34 @@ static void mem_cgroup_clear_mc(void)
        spin_unlock(&mc.lock);
 }
 
-static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
-                                struct cgroup_taskset *tset)
+static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
 {
-       struct task_struct *p = cgroup_taskset_first(tset);
-       int ret = 0;
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct cgroup_subsys_state *css;
+       struct mem_cgroup *memcg;
+       struct mem_cgroup *from;
+       struct task_struct *leader, *p;
+       struct mm_struct *mm;
        unsigned long move_flags;
+       int ret = 0;
+
+       /* charge immigration isn't supported on the default hierarchy */
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+               return 0;
+
+       /*
+        * Multi-process migrations only happen on the default hierarchy
+        * where charge immigration is not used.  Perform charge
+        * immigration if @tset contains a leader and whine if there are
+        * multiple.
+        */
+       p = NULL;
+       cgroup_taskset_for_each_leader(leader, css, tset) {
+               WARN_ON_ONCE(p);
+               p = leader;
+               memcg = mem_cgroup_from_css(css);
+       }
+       if (!p)
+               return 0;
 
        /*
         * We are now committed to this value whatever it is. Changes in this
@@ -5016,41 +4852,40 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
         * So we need to save it, and keep it going.
         */
        move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
-       if (move_flags) {
-               struct mm_struct *mm;
-               struct mem_cgroup *from = mem_cgroup_from_task(p);
+       if (!move_flags)
+               return 0;
 
-               VM_BUG_ON(from == memcg);
+       from = mem_cgroup_from_task(p);
 
-               mm = get_task_mm(p);
-               if (!mm)
-                       return 0;
-               /* We move charges only when we move a owner of the mm */
-               if (mm->owner == p) {
-                       VM_BUG_ON(mc.from);
-                       VM_BUG_ON(mc.to);
-                       VM_BUG_ON(mc.precharge);
-                       VM_BUG_ON(mc.moved_charge);
-                       VM_BUG_ON(mc.moved_swap);
-
-                       spin_lock(&mc.lock);
-                       mc.from = from;
-                       mc.to = memcg;
-                       mc.flags = move_flags;
-                       spin_unlock(&mc.lock);
-                       /* We set mc.moving_task later */
-
-                       ret = mem_cgroup_precharge_mc(mm);
-                       if (ret)
-                               mem_cgroup_clear_mc();
-               }
-               mmput(mm);
+       VM_BUG_ON(from == memcg);
+
+       mm = get_task_mm(p);
+       if (!mm)
+               return 0;
+       /* We move charges only when we move an owner of the mm */
+       if (mm->owner == p) {
+               VM_BUG_ON(mc.from);
+               VM_BUG_ON(mc.to);
+               VM_BUG_ON(mc.precharge);
+               VM_BUG_ON(mc.moved_charge);
+               VM_BUG_ON(mc.moved_swap);
+
+               spin_lock(&mc.lock);
+               mc.from = from;
+               mc.to = memcg;
+               mc.flags = move_flags;
+               spin_unlock(&mc.lock);
+               /* We set mc.moving_task later */
+
+               ret = mem_cgroup_precharge_mc(mm);
+               if (ret)
+                       mem_cgroup_clear_mc();
        }
+       mmput(mm);
        return ret;
 }
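
mem_cgroup_can_attach() now receives only the taskset, picks out the thread-group leader it contains, and warns if more than one shows up. A hedged user-space sketch of that selection follows; fake_task and pick_leader() are invented for illustration and merely imitate the cgroup_taskset_for_each_leader() loop above.

/* Hypothetical stand-alone illustration -- not kernel code. */
#include <stddef.h>
#include <stdio.h>

struct fake_task {
        int pid;
        int tgid;       /* thread-group id; the leader has pid == tgid */
};

/* Pick the thread-group leader out of a migrating set, mirroring the
 * loop above: warn if a second leader is seen, keep the last one. */
static struct fake_task *pick_leader(struct fake_task *set, size_t n)
{
        struct fake_task *leader = NULL;
        size_t i;

        for (i = 0; i < n; i++) {
                if (set[i].pid != set[i].tgid)
                        continue;               /* not a leader */
                if (leader)
                        fprintf(stderr, "WARN: multiple leaders in set\n");
                leader = &set[i];
        }
        return leader;
}

int main(void)
{
        struct fake_task set[] = { {101, 100}, {100, 100}, {102, 100} };
        struct fake_task *leader = pick_leader(set, 3);

        printf("leader pid=%d\n", leader ? leader->pid : -1);
        return 0;
}
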
 
-static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
-                                    struct cgroup_taskset *tset)
+static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
 {
        if (mc.to)
                mem_cgroup_clear_mc();
@@ -5192,10 +5027,10 @@ retry:
        atomic_dec(&mc.from->moving_account);
 }
 
-static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
-                                struct cgroup_taskset *tset)
+static void mem_cgroup_move_task(struct cgroup_taskset *tset)
 {
-       struct task_struct *p = cgroup_taskset_first(tset);
+       struct cgroup_subsys_state *css;
+       struct task_struct *p = cgroup_taskset_first(tset, &css);
        struct mm_struct *mm = get_task_mm(p);
 
        if (mm) {
@@ -5207,17 +5042,14 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
                mem_cgroup_clear_mc();
 }
 #else  /* !CONFIG_MMU */
-static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
-                                struct cgroup_taskset *tset)
+static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
 {
        return 0;
 }
-static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
-                                    struct cgroup_taskset *tset)
+static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
 {
 }
-static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
-                                struct cgroup_taskset *tset)
+static void mem_cgroup_move_task(struct cgroup_taskset *tset)
 {
 }
 #endif
@@ -5234,7 +5066,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
         * guarantees that @root doesn't have any children, so turning it
         * on for the root memcg is enough.
         */
-       if (cgroup_on_dfl(root_css->cgroup))
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                root_mem_cgroup->use_hierarchy = true;
        else
                root_mem_cgroup->use_hierarchy = false;
@@ -5243,7 +5075,9 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
 static u64 memory_current_read(struct cgroup_subsys_state *css,
                               struct cftype *cft)
 {
-       return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+       return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
 }
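
memory_current_read() now reports the raw page counter scaled to bytes, and the (u64) cast keeps the multiplication from wrapping when the page count times the page size no longer fits in 32 bits. A tiny illustrative program with a made-up FAKE_PAGE_SIZE shows the same conversion:

/* Hypothetical stand-alone illustration -- not kernel code. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define FAKE_PAGE_SIZE 4096UL   /* made-up stand-in for PAGE_SIZE */

int main(void)
{
        /* e.g. two million 4K pages charged: 8 GiB */
        unsigned long pages = 2 * 1024 * 1024;

        /* Widen before multiplying, as the (u64) cast above does, so
         * the byte count cannot wrap on a 32-bit unsigned long. */
        uint64_t bytes = (uint64_t)pages * FAKE_PAGE_SIZE;

        printf("memory.current would read %" PRIu64 " bytes\n", bytes);
        return 0;
}
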
 
 static int memory_low_show(struct seq_file *m, void *v)
@@ -5303,6 +5137,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
        memcg->high = high;
 
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
 }
 
@@ -5335,6 +5170,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
        if (err)
                return err;
 
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
 }
 
@@ -5353,6 +5189,7 @@ static int memory_events_show(struct seq_file *m, void *v)
 static struct cftype memory_files[] = {
        {
                .name = "current",
+               .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = memory_current_read,
        },
        {
@@ -5376,6 +5213,7 @@ static struct cftype memory_files[] = {
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
+               .file_offset = offsetof(struct mem_cgroup, events_file),
                .seq_show = memory_events_show,
        },
        { }     /* terminate */
@@ -5385,6 +5223,7 @@ struct cgroup_subsys memory_cgrp_subsys = {
        .css_alloc = mem_cgroup_css_alloc,
        .css_online = mem_cgroup_css_online,
        .css_offline = mem_cgroup_css_offline,
+       .css_released = mem_cgroup_css_released,
        .css_free = mem_cgroup_css_free,
        .css_reset = mem_cgroup_css_reset,
        .can_attach = mem_cgroup_can_attach,
@@ -5396,19 +5235,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
        .early_init = 0,
 };
 
-/**
- * mem_cgroup_events - count memory events against a cgroup
- * @memcg: the memory cgroup
- * @idx: the event index
- * @nr: the number of events to account for
- */
-void mem_cgroup_events(struct mem_cgroup *memcg,
-                      enum mem_cgroup_events_index idx,
-                      unsigned int nr)
-{
-       this_cpu_add(memcg->stat->events[idx], nr);
-}
-
 /**
  * mem_cgroup_low - check if memory consumption is below the normal range
  * @root: the highest ancestor to consider
@@ -5481,8 +5307,20 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                 * the page lock, which serializes swap cache removal, which
                 * in turn serializes uncharging.
                 */
+               VM_BUG_ON_PAGE(!PageLocked(page), page);
                if (page->mem_cgroup)
                        goto out;
+
+               if (do_swap_account) {
+                       swp_entry_t ent = { .val = page_private(page), };
+                       unsigned short id = lookup_swap_cgroup_id(ent);
+
+                       rcu_read_lock();
+                       memcg = mem_cgroup_from_id(id);
+                       if (memcg && !css_tryget_online(&memcg->css))
+                               memcg = NULL;
+                       rcu_read_unlock();
+               }
        }
 
        if (PageTransHuge(page)) {
@@ -5490,19 +5328,12 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
        }
 
-       if (do_swap_account && PageSwapCache(page))
-               memcg = try_get_mem_cgroup_from_page(page);
        if (!memcg)
                memcg = get_mem_cgroup_from_mm(mm);
 
        ret = try_charge(memcg, gfp_mask, nr_pages);
 
        css_put(&memcg->css);
-
-       if (ret == -EINTR) {
-               memcg = root_mem_cgroup;
-               ret = 0;
-       }
 out:
        *memcgp = memcg;
        return ret;
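
For swap-cache pages the charge path now looks up the owning group by swap cgroup id and keeps it only if css_tryget_online() succeeds, i.e. only if a reference can still be taken without resurrecting a dead group; otherwise it falls back to the mm's group. The C11 sketch below shows just that try-get idea with a hypothetical fake_css and an atomic reference count; it is not the kernel's percpu-ref implementation.

/* Hypothetical stand-alone illustration -- not kernel code. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_css {
        atomic_int refcnt;      /* 0 means the group is already gone */
};

/* Try-get in the spirit of css_tryget_online(): raise the count only
 * if it is still non-zero; never bring a dead group back to life. */
static bool tryget(struct fake_css *css)
{
        int old = atomic_load(&css->refcnt);

        while (old > 0) {
                if (atomic_compare_exchange_weak(&css->refcnt, &old, old + 1))
                        return true;
                /* old was reloaded by the failed exchange; retry */
        }
        return false;
}

int main(void)
{
        struct fake_css live = { 1 };
        struct fake_css dead = { 0 };

        printf("live: %d  dead: %d\n", tryget(&live), tryget(&dead));
        return 0;
}
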
@@ -5717,25 +5548,22 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
 }
 
 /**
- * mem_cgroup_migrate - migrate a charge to another page
+ * mem_cgroup_replace_page - migrate a charge to another page
  * @oldpage: currently charged page
  * @newpage: page to transfer the charge to
- * @lrucare: either or both pages might be on the LRU already
  *
  * Migrate the charge from @oldpage to @newpage.
  *
  * Both pages must be locked, @newpage->mapping must be set up.
+ * Either or both pages might be on the LRU already.
  */
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
-                       bool lrucare)
+void mem_cgroup_replace_page(struct page *oldpage, struct page *newpage)
 {
        struct mem_cgroup *memcg;
        int isolated;
 
        VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
-       VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
-       VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
        VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
        VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
                       newpage);
@@ -5747,25 +5575,16 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
        if (newpage->mem_cgroup)
                return;
 
-       /*
-        * Swapcache readahead pages can get migrated before being
-        * charged, and migration from compaction can happen to an
-        * uncharged page when the PFN walker finds a page that
-        * reclaim just put back on the LRU but has not released yet.
-        */
+       /* Swapcache readahead pages can get replaced before being charged */
        memcg = oldpage->mem_cgroup;
        if (!memcg)
                return;
 
-       if (lrucare)
-               lock_page_lru(oldpage, &isolated);
-
+       lock_page_lru(oldpage, &isolated);
        oldpage->mem_cgroup = NULL;
+       unlock_page_lru(oldpage, isolated);
 
-       if (lrucare)
-               unlock_page_lru(oldpage, isolated);
-
-       commit_charge(newpage, memcg, lrucare);
+       commit_charge(newpage, memcg, true);
 }
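
mem_cgroup_replace_page() now always treats the old page as if it could be on the LRU: it returns early if the new page is already charged or the old one never was, detaches the charge from the old page under the LRU lock, and only then commits it to the new page. The stand-alone sketch below mirrors that hand-off; fake_page, fake_memcg and the pthread mutex in place of the zone LRU lock are assumptions made for a runnable example.

/* Hypothetical stand-alone illustration -- not kernel code. */
#include <pthread.h>
#include <stdio.h>

struct fake_memcg { const char *name; };

struct fake_page {
        pthread_mutex_t lru_lock;       /* stands in for the zone LRU lock */
        struct fake_memcg *memcg;       /* owner, like page->mem_cgroup */
};

/* Same shape as mem_cgroup_replace_page(): bail out if @newpage is
 * already charged or @oldpage was never charged, detach the charge
 * from @oldpage under its LRU lock, then hand it to @newpage. */
static void replace_page(struct fake_page *oldpage, struct fake_page *newpage)
{
        struct fake_memcg *memcg;

        if (newpage->memcg)             /* already charged, nothing to do */
                return;

        memcg = oldpage->memcg;
        if (!memcg)                     /* uncharged source, nothing to move */
                return;

        pthread_mutex_lock(&oldpage->lru_lock);
        oldpage->memcg = NULL;
        pthread_mutex_unlock(&oldpage->lru_lock);

        newpage->memcg = memcg;         /* commit_charge() equivalent */
}

int main(void)
{
        struct fake_memcg grp = { "test-group" };
        struct fake_page oldp = { PTHREAD_MUTEX_INITIALIZER, &grp };
        struct fake_page newp = { PTHREAD_MUTEX_INITIALIZER, NULL };

        replace_page(&oldp, &newp);
        printf("new page owner: %s\n", newp.memcg ? newp.memcg->name : "none");
        return 0;
}
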
 
 /*
@@ -5842,8 +5661,16 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
        if (!mem_cgroup_is_root(memcg))
                page_counter_uncharge(&memcg->memory, 1);
 
+       /*
+        * Interrupts should be disabled here because the caller holds the
+        * mapping->tree_lock, which is taken with interrupts off. It is
+        * important here to have the interrupts disabled because it is the
+        * only synchronisation we have for updating the per-CPU variables.
+        */
        local_lock_irqsave(event_lock, flags);
-       /* Caller disabled preemption with mapping->tree_lock */
+#ifndef CONFIG_PREEMPT_RT_BASE
+       VM_BUG_ON(!irqs_disabled());
+#endif
        mem_cgroup_charge_statistics(memcg, page, -1);
        memcg_check_events(memcg, page);
        local_unlock_irqrestore(event_lock, flags);