kernel/mm/memcontrol.c

   1 /* memcontrol.c - Memory Controller
   2  *
   3  * Copyright IBM Corporation, 2007
   4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5  *
   6  * Copyright 2007 OpenVZ SWsoft Inc
   7  * Author: Pavel Emelianov <xemul@openvz.org>
   8  *
   9  * Memory thresholds
  10  * Copyright (C) 2009 Nokia Corporation
  11  * Author: Kirill A. Shutemov
  12  *
  13  * Kernel Memory Controller
  14  * Copyright (C) 2012 Parallels Inc. and Google Inc.
  15  * Authors: Glauber Costa and Suleiman Souhlal
  16  *
  17  * Native page reclaim
  18  * Charge lifetime sanitation
  19  * Lockless page tracking & accounting
  20  * Unified hierarchy configuration model
  21  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  22  *
  23  * This program is free software; you can redistribute it and/or modify
  24  * it under the terms of the GNU General Public License as published by
  25  * the Free Software Foundation; either version 2 of the License, or
  26  * (at your option) any later version.
  27  *
  28  * This program is distributed in the hope that it will be useful,
  29  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  30  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  31  * GNU General Public License for more details.
  32  */
  33
  34 #include <linux/page_counter.h>
  35 #include <linux/memcontrol.h>
  36 #include <linux/cgroup.h>
  37 #include <linux/mm.h>
  38 #include <linux/hugetlb.h>
  39 #include <linux/pagemap.h>
  40 #include <linux/smp.h>
  41 #include <linux/page-flags.h>
  42 #include <linux/backing-dev.h>
  43 #include <linux/bit_spinlock.h>
  44 #include <linux/rcupdate.h>
  45 #include <linux/limits.h>
  46 #include <linux/export.h>
  47 #include <linux/mutex.h>
  48 #include <linux/rbtree.h>
  49 #include <linux/slab.h>
  50 #include <linux/swap.h>
  51 #include <linux/swapops.h>
  52 #include <linux/spinlock.h>
  53 #include <linux/eventfd.h>
  54 #include <linux/poll.h>
  55 #include <linux/sort.h>
  56 #include <linux/fs.h>
  57 #include <linux/seq_file.h>
  58 #include <linux/vmpressure.h>
  59 #include <linux/mm_inline.h>
  60 #include <linux/swap_cgroup.h>
  61 #include <linux/cpu.h>
  62 #include <linux/oom.h>
  63 #include <linux/lockdep.h>
  64 #include <linux/file.h>
  65 #include "internal.h"
  66 #include <net/sock.h>
  67 #include <net/ip.h>
  68 #include <net/tcp_memcontrol.h>
  69 #include <linux/locallock.h>
  70
  71 #include "slab.h"
  72
  73 #include <asm/uaccess.h>
  74
  75 #include <trace/events/vmscan.h>
  76
/* The memory cgroup subsystem object; exported for use outside this file. */
struct cgroup_subsys memory_cgrp_subsys __read_mostly;
EXPORT_SYMBOL(memory_cgrp_subsys);

/*
 * NOTE(review): presumably the number of reclaim retries on a failed
 * charge - the user is outside this part of the file; confirm there.
 */
#define MEM_CGROUP_RECLAIM_RETRIES      5
/*
 * The root memcg. mem_cgroup_is_root() compares against it and
 * memcg_to_vmpressure() falls back to it when handed a NULL memcg.
 */
static struct mem_cgroup *root_mem_cgroup __read_mostly;

/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
/* Without CONFIG_MEMCG_SWAP the flag folds to the constant 0. */
#define do_swap_account         0
#endif

/*
 * Local IRQ lock named event_lock; its users are outside this part of the
 * file. NOTE(review): presumably provided by <linux/locallock.h> - confirm.
 */
static DEFINE_LOCAL_IRQ_LOCK(event_lock);
/*
 * Human-readable names for the per-memcg statistics counters.
 * NOTE(review): presumably indexed by enum mem_cgroup_stat_index, so the
 * order must match that enum - confirm against its definition.
 */
static const char * const mem_cgroup_stat_names[] = {
        "cache",
        "rss",
        "rss_huge",
        "mapped_file",
        "writeback",
        "swap",
};

/*
 * Human-readable names for the per-memcg event counters.
 * NOTE(review): presumably indexed by enum mem_cgroup_events_index - confirm.
 */
static const char * const mem_cgroup_events_names[] = {
        "pgpgin",
        "pgpgout",
        "pgfault",
        "pgmajfault",
};

/*
 * Human-readable names for the LRU lists.
 * NOTE(review): presumably indexed by enum lru_list - confirm.
 */
static const char * const mem_cgroup_lru_names[] = {
        "inactive_anon",
        "active_anon",
        "inactive_file",
        "active_file",
        "unevictable",
};
 114
/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used to
 * trigger some periodic events. This is straightforward and better
 * than using jiffies etc. to handle periodic memcg events.
 */
/*
 * Kinds of periodic work driven by the per-cpu page event counter. Each
 * value indexes mem_cgroup_stat_cpu.targets[]; MEM_CGROUP_NTARGETS is the
 * array size, not a real target.
 */
enum mem_cgroup_events_target {
        MEM_CGROUP_TARGET_THRESH,
        MEM_CGROUP_TARGET_SOFTLIMIT,
        MEM_CGROUP_TARGET_NUMAINFO,
        MEM_CGROUP_NTARGETS,
};
/*
 * Distance, in page events, that mem_cgroup_event_ratelimit() adds to the
 * current event count when it re-arms the corresponding target.
 */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET  1024
 130
/*
 * Statistics block kept once per cpu (mem_cgroup.stat) plus once as a
 * plain member (mem_cgroup.nocpu_base).
 */
struct mem_cgroup_stat_cpu {
        /* Signed counters, summed over cpus by mem_cgroup_read_stat(). */
        long count[MEM_CGROUP_STAT_NSTATS];
        /* Event counters, summed over cpus by mem_cgroup_read_events(). */
        unsigned long events[MEMCG_NR_EVENTS];
        /* Running count of charged/uncharged pages on this cpu. */
        unsigned long nr_page_events;
        /* Next nr_page_events value at which each target fires. */
        unsigned long targets[MEM_CGROUP_NTARGETS];
};
 137
/*
 * Iteration cursor; one instance exists per reclaim priority in
 * mem_cgroup_per_zone.iter[].
 * NOTE(review): presumably remembers where a hierarchy walk stopped so the
 * next walk can resume - the users are outside this part of the file.
 */
struct reclaim_iter {
        struct mem_cgroup *position;
        /* scan generation, increased every round-trip */
        unsigned int generation;
};
 143
/*
 * per-zone information in memory controller.
 *
 * One instance exists per (memcg, node, zone); it is reached through
 * memcg->nodeinfo[nid]->zoneinfo[zid]. It doubles as the node type stored
 * in the per-zone soft limit rb-tree (via tree_node).
 */
struct mem_cgroup_per_zone {
        /* Embedded so that container_of() can map a lruvec back to us. */
        struct lruvec           lruvec;
        /* Per-LRU-list sizes, returned by mem_cgroup_get_lru_size(). */
        unsigned long           lru_size[NR_LRU_LISTS];

        /* One iterator per reclaim priority level. */
        struct reclaim_iter     iter[DEF_PRIORITY + 1];

        struct rb_node          tree_node;      /* RB tree node */
        unsigned long           usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded*/
        /* True while tree_node is linked into the soft limit tree. */
        bool                    on_tree;
        struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
                                                /* use container_of        */
};
 160
/*
 * Per-node container of per-zone memcg state; mem_cgroup.nodeinfo[] holds
 * one pointer to this per node id.
 */
struct mem_cgroup_per_node {
        struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
 164
 165 /*
 166  * Cgroups above their limits are maintained in a RB-Tree, independent of
 167  * their hierarchy representation
 168  */
 169
/*
 * One soft limit rb-tree for a single (node, zone) pair. Entries are
 * mem_cgroup_per_zone objects ordered by usage_in_excess.
 */
struct mem_cgroup_tree_per_zone {
        struct rb_root rb_root;
        /* Taken with interrupts disabled around every tree update here. */
        spinlock_t lock;
};
 174
/* The soft limit trees of one node, indexed by zone index. */
struct mem_cgroup_tree_per_node {
        struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};
 178
/* Top level of the soft limit tree: one per-node pointer per node id. */
struct mem_cgroup_tree {
        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};
 182
/*
 * The single global soft limit tree, looked up through
 * soft_limit_tree_node_zone() and soft_limit_tree_from_page().
 */
static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 184
/* One registered usage threshold: an eventfd paired with a threshold value. */
struct mem_cgroup_threshold {
        struct eventfd_ctx *eventfd;
        /* NOTE(review): unit not visible here (pages vs bytes) - confirm. */
        unsigned long threshold;
};
 189
/*
 * For threshold: a variable-length array of thresholds, with the entries
 * stored directly behind the header.
 */
struct mem_cgroup_threshold_ary {
        /* An array index points to threshold just below or equal to usage. */
        int current_threshold;
        /* Size of entries[] */
        unsigned int size;
        /* Array of thresholds */
        struct mem_cgroup_threshold entries[0];
};
 199
/*
 * A pair of threshold arrays: the one in use plus a spare. struct
 * mem_cgroup embeds one pair for memory and one for mem+swap usage.
 */
struct mem_cgroup_thresholds {
        /* Primary thresholds array */
        struct mem_cgroup_threshold_ary *primary;
        /*
         * Spare threshold array.
         * This is needed to make mem_cgroup_unregister_event() "never fail".
         * It must be able to store at least primary->size - 1 entries.
         */
        struct mem_cgroup_threshold_ary *spare;
};
 210
/*
 * for OOM: list element pairing a list link with an eventfd.
 * NOTE(review): presumably linked on mem_cgroup.oom_notify - confirm at
 * the registration site.
 */
struct mem_cgroup_eventfd_list {
        struct list_head list;
        struct eventfd_ctx *eventfd;
};
 216
/*
 * A mem_cgroup_event represents one event that userspace wants to receive.
 */
struct mem_cgroup_event {
        /*
         * memcg which the event belongs to.
         */
        struct mem_cgroup *memcg;
        /*
         * eventfd to signal userspace about the event.
         */
        struct eventfd_ctx *eventfd;
        /*
         * Each of these stored in a list by the cgroup.
         */
        struct list_head list;
        /*
         * register_event() callback will be used to add new userspace
         * waiter for changes related to this event.  Use eventfd_signal()
         * on eventfd to send notification to userspace.
         */
        int (*register_event)(struct mem_cgroup *memcg,
                              struct eventfd_ctx *eventfd, const char *args);
        /*
         * unregister_event() callback will be called when userspace closes
         * the eventfd or on cgroup removing.  This callback must be set,
         * if you want provide notification functionality.
         */
        void (*unregister_event)(struct mem_cgroup *memcg,
                                 struct eventfd_ctx *eventfd);
        /*
         * All fields below needed to unregister event when
         * userspace closes eventfd.
         */
        poll_table pt;
        wait_queue_head_t *wqh;
        wait_queue_t wait;
        struct work_struct remove;
};
 256
/* Forward declarations; both functions are defined later in this file. */
static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 259
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * The structure ends in a variable-length nodeinfo[] array, so it must be
 * allocated with room for the per-node pointers and nothing may be added
 * after that member.
 */
struct mem_cgroup {
        /* Embedded css; mem_cgroup_from_css() relies on container_of(). */
        struct cgroup_subsys_state css;

        /* Accounted resources */
        struct page_counter memory;
        struct page_counter memsw;
        struct page_counter kmem;

        /* Normal memory consumption range */
        unsigned long low;
        unsigned long high;

        /* Compared against the memory counter by soft_limit_excess(). */
        unsigned long soft_limit;

        /* vmpressure notifications */
        struct vmpressure vmpressure;

        /* css_online() has been completed */
        int initialized;

        /*
         * Should the accounting and control be hierarchical, per subtree?
         */
        bool use_hierarchy;

        bool            oom_lock;
        atomic_t        under_oom;
        atomic_t        oom_wakeups;

        int     swappiness;
        /* OOM-Killer disable */
        int             oom_kill_disable;

        /* protect arrays of thresholds */
        struct mutex thresholds_lock;

        /* thresholds for memory usage. RCU-protected */
        struct mem_cgroup_thresholds thresholds;

        /* thresholds for mem+swap usage. RCU-protected */
        struct mem_cgroup_thresholds memsw_thresholds;

        /* For oom notifier event fd */
        struct list_head oom_notify;

        /*
         * Should we move charges of a task when a task is moved into this
         * mem_cgroup ? And what type of charges should we move ?
         */
        unsigned long move_charge_at_immigrate;
        /*
         * set > 0 if pages under this cgroup are moving to other cgroup.
         */
        atomic_t                moving_account;
        /* taken only while moving_account > 0 */
        spinlock_t              move_lock;
        struct task_struct      *move_lock_task;
        unsigned long           move_lock_flags;
        /*
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
        /*
         * used when a cpu is offlined or other synchronizations
         * See mem_cgroup_read_stat().
         */
        struct mem_cgroup_stat_cpu nocpu_base;
        /* Guards nocpu_base in the read paths of this file. */
        spinlock_t pcp_counter_lock;

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
        /* Returned by tcp_proto_cgroup() for non-root memcgs. */
        struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
        /* Index in the kmem_cache->memcg_params.memcg_caches array */
        int kmemcg_id;
        bool kmem_acct_activated;
        /* Reported by memcg_kmem_is_active(). */
        bool kmem_acct_active;
#endif

        int last_scanned_node;
#if MAX_NUMNODES > 1
        nodemask_t      scan_nodes;
        atomic_t        numainfo_events;
        atomic_t        numainfo_updating;
#endif

        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;

        /* Per-node state, indexed by node id. */
        struct mem_cgroup_per_node *nodeinfo[0];
        /* WARNING: nodeinfo must be the last member here */
};
 359
#ifdef CONFIG_MEMCG_KMEM
/*
 * Report @memcg's kmem_acct_active flag. @memcg is dereferenced without a
 * NULL check, so callers must pass a valid pointer.
 */
bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
        return memcg->kmem_acct_active;
}
#endif
 366
/* Definitions for moving charges at task migration. */
/*
 * Types of charges to be moved. These are bit flags; MOVE_MASK is the
 * union of all valid flags.
 */
#define MOVE_ANON       0x1U
#define MOVE_FILE       0x2U
#define MOVE_MASK       (MOVE_ANON | MOVE_FILE)
 374
 375 /* "mc" and its members are protected by cgroup_mutex */
/*
 * State of the (single) in-flight charge move. Only the spinlock and the
 * wait queue need static initializers; every other member starts zeroed.
 */
static struct move_charge_struct {
        spinlock_t        lock; /* for from, to */
        struct mem_cgroup *from;
        struct mem_cgroup *to;
        /* NOTE(review): presumably a MOVE_* bit mask - confirm at use site. */
        unsigned long flags;
        unsigned long precharge;
        unsigned long moved_charge;
        unsigned long moved_swap;
        struct task_struct *moving_task;        /* a task moving charges */
        wait_queue_head_t waitq;                /* a waitq for other context */
} mc = {
        .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
 390
/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 * The users of both limits are outside this part of the file.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 397
/*
 * Kinds of charge. NR_CHARGE_TYPE is the number of real values, not a
 * charge type of its own.
 */
enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
        MEM_CGROUP_CHARGE_TYPE_ANON,
        MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
        MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
        NR_CHARGE_TYPE,
};
 405
/*
 * for encoding cft->private value on file: the resource type occupies the
 * upper 16 bits of the value built by MEMFILE_PRIVATE().
 */
enum res_type {
        _MEM,
        _MEMSWAP,
        _OOM_TYPE,
        _KMEM,
};
 413
/*
 * Pack a type (x) and an attribute (val) into one integer: the type goes
 * into bits 16 and up, the attribute into the low 16 bits. MEMFILE_TYPE()
 * and MEMFILE_ATTR() extract the two halves again.
 */
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL             (0)
 419
/*
 * The memcg_create_mutex will be held whenever a new cgroup is created.
 * As a consequence, any change that needs to protect against new child cgroups
 * appearing has to hold it as well.
 * (Its lock/unlock sites are outside this part of the file.)
 */
static DEFINE_MUTEX(memcg_create_mutex);
 426
 427 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 428 {
 429         return s ? container_of(s, struct mem_cgroup, css) : NULL;
 430 }
 431
 432 /* Some nice accessors for the vmpressure. */
 433 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 434 {
 435         if (!memcg)
 436                 memcg = root_mem_cgroup;
 437         return &memcg->vmpressure;
 438 }
 439
 440 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 441 {
 442         return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 443 }
 444
 445 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 446 {
 447         return (memcg == root_mem_cgroup);
 448 }
 449
/*
 * We restrict the id in the range of [1, 65535], so it can fit into
 * an unsigned short.
 * The same bound is reused as MEMCG_CACHES_MAX_SIZE.
 */
#define MEM_CGROUP_ID_MAX       USHRT_MAX
 455
 456 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
 457 {
 458         return memcg->css.id;
 459 }
 460
 461 /*
 462  * A helper function to get mem_cgroup from ID. must be called under
 463  * rcu_read_lock().  The caller is responsible for calling
 464  * css_tryget_online() if the mem_cgroup is used for charging. (dropping
 465  * refcnt from swap can be called against removed memcg.)
 466  */
 467 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
 468 {
 469         struct cgroup_subsys_state *css;
 470
 471         css = css_from_id(id, &memory_cgrp_subsys);
 472         return mem_cgroup_from_css(css);
 473 }
 474
 475 /* Writing them here to avoid exposing memcg's inner layout */
 476 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 477
 478 void sock_update_memcg(struct sock *sk)
 479 {
 480         if (mem_cgroup_sockets_enabled) {
 481                 struct mem_cgroup *memcg;
 482                 struct cg_proto *cg_proto;
 483
 484                 BUG_ON(!sk->sk_prot->proto_cgroup);
 485
 486                 /* Socket cloning can throw us here with sk_cgrp already
 487                  * filled. It won't however, necessarily happen from
 488                  * process context. So the test for root memcg given
 489                  * the current task's memcg won't help us in this case.
 490                  *
 491                  * Respecting the original socket's memcg is a better
 492                  * decision in this case.
 493                  */
 494                 if (sk->sk_cgrp) {
 495                         BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
 496                         css_get(&sk->sk_cgrp->memcg->css);
 497                         return;
 498                 }
 499
 500                 rcu_read_lock();
 501                 memcg = mem_cgroup_from_task(current);
 502                 cg_proto = sk->sk_prot->proto_cgroup(memcg);
 503                 if (!mem_cgroup_is_root(memcg) &&
 504                     memcg_proto_active(cg_proto) &&
 505                     css_tryget_online(&memcg->css)) {
 506                         sk->sk_cgrp = cg_proto;
 507                 }
 508                 rcu_read_unlock();
 509         }
 510 }
 511 EXPORT_SYMBOL(sock_update_memcg);
 512
 513 void sock_release_memcg(struct sock *sk)
 514 {
 515         if (mem_cgroup_sockets_enabled && sk->sk_cgrp) {
 516                 struct mem_cgroup *memcg;
 517                 WARN_ON(!sk->sk_cgrp->memcg);
 518                 memcg = sk->sk_cgrp->memcg;
 519                 css_put(&sk->sk_cgrp->memcg->css);
 520         }
 521 }
 522
 523 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 524 {
 525         if (!memcg || mem_cgroup_is_root(memcg))
 526                 return NULL;
 527
 528         return &memcg->tcp_mem;
 529 }
 530 EXPORT_SYMBOL(tcp_proto_cgroup);
 531
 532 #endif
 533
 534 #ifdef CONFIG_MEMCG_KMEM
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited. Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/*
 * Protects memcg_nr_cache_ids. Readers take it through
 * memcg_get_cache_ids() / memcg_put_cache_ids().
 */
static DECLARE_RWSEM(memcg_cache_ids_sem);
 551
/*
 * Take memcg_cache_ids_sem for reading. Must be paired with
 * memcg_put_cache_ids(). down_read() may sleep, so this is for sleepable
 * context only.
 */
void memcg_get_cache_ids(void)
{
        down_read(&memcg_cache_ids_sem);
}
 556
/* Release the read side of memcg_cache_ids_sem taken by memcg_get_cache_ids(). */
void memcg_put_cache_ids(void)
{
        up_read(&memcg_cache_ids_sem);
}
 561
/*
 * MIN_SIZE is different than 1, because we would like to avoid going through
 * the alloc/free process all the time. In a small machine, 4 kmem-limited
 * cgroups is a reasonable guess. In the future, it could be a parameter or
 * tunable, but that is strictly not necessary.
 *
 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 * this constant directly from cgroup, but it is understandable that this is
 * better kept as an internal representation in cgroup.c. In any case, the
 * cgrp_id space is not getting any smaller, and we don't have to necessarily
 * increase ours as well if it increases.
 *
 * Both constants bound the size of the per-cache memcg_caches array.
 */
#define MEMCG_CACHES_MIN_SIZE 4
#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 576
/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional on this static branch, we have to allow modules that call
 * kmem_cache_alloc and the like to see this symbol as well.
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);
 585
 586 #endif /* CONFIG_MEMCG_KMEM */
 587
 588 static struct mem_cgroup_per_zone *
 589 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
 590 {
 591         int nid = zone_to_nid(zone);
 592         int zid = zone_idx(zone);
 593
 594         return &memcg->nodeinfo[nid]->zoneinfo[zid];
 595 }
 596
/*
 * Return the css embedded in @memcg. This is the inverse of
 * mem_cgroup_from_css() and lets code outside this file get at the css
 * without knowing the layout of struct mem_cgroup.
 */
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
        return &memcg->css;
}
 601
 602 static struct mem_cgroup_per_zone *
 603 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 604 {
 605         int nid = page_to_nid(page);
 606         int zid = page_zonenum(page);
 607
 608         return &memcg->nodeinfo[nid]->zoneinfo[zid];
 609 }
 610
 611 static struct mem_cgroup_tree_per_zone *
 612 soft_limit_tree_node_zone(int nid, int zid)
 613 {
 614         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 615 }
 616
 617 static struct mem_cgroup_tree_per_zone *
 618 soft_limit_tree_from_page(struct page *page)
 619 {
 620         int nid = page_to_nid(page);
 621         int zid = page_zonenum(page);
 622
 623         return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
 624 }
 625
 626 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz,
 627                                          struct mem_cgroup_tree_per_zone *mctz,
 628                                          unsigned long new_usage_in_excess)
 629 {
 630         struct rb_node **p = &mctz->rb_root.rb_node;
 631         struct rb_node *parent = NULL;
 632         struct mem_cgroup_per_zone *mz_node;
 633
 634         if (mz->on_tree)
 635                 return;
 636
 637         mz->usage_in_excess = new_usage_in_excess;
 638         if (!mz->usage_in_excess)
 639                 return;
 640         while (*p) {
 641                 parent = *p;
 642                 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
 643                                         tree_node);
 644                 if (mz->usage_in_excess < mz_node->usage_in_excess)
 645                         p = &(*p)->rb_left;
 646                 /*
 647                  * We can't avoid mem cgroups that are over their soft
 648                  * limit by the same amount
 649                  */
 650                 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 651                         p = &(*p)->rb_right;
 652         }
 653         rb_link_node(&mz->tree_node, parent, p);
 654         rb_insert_color(&mz->tree_node, &mctz->rb_root);
 655         mz->on_tree = true;
 656 }
 657
 658 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
 659                                          struct mem_cgroup_tree_per_zone *mctz)
 660 {
 661         if (!mz->on_tree)
 662                 return;
 663         rb_erase(&mz->tree_node, &mctz->rb_root);
 664         mz->on_tree = false;
 665 }
 666
 667 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz,
 668                                        struct mem_cgroup_tree_per_zone *mctz)
 669 {
 670         unsigned long flags;
 671
 672         spin_lock_irqsave(&mctz->lock, flags);
 673         __mem_cgroup_remove_exceeded(mz, mctz);
 674         spin_unlock_irqrestore(&mctz->lock, flags);
 675 }
 676
 677 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 678 {
 679         unsigned long nr_pages = page_counter_read(&memcg->memory);
 680         unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 681         unsigned long excess = 0;
 682
 683         if (nr_pages > soft_limit)
 684                 excess = nr_pages - soft_limit;
 685
 686         return excess;
 687 }
 688
 689 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 690 {
 691         unsigned long excess;
 692         struct mem_cgroup_per_zone *mz;
 693         struct mem_cgroup_tree_per_zone *mctz;
 694
 695         mctz = soft_limit_tree_from_page(page);
 696         /*
 697          * Necessary to update all ancestors when hierarchy is used.
 698          * because their event counter is not touched.
 699          */
 700         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 701                 mz = mem_cgroup_page_zoneinfo(memcg, page);
 702                 excess = soft_limit_excess(memcg);
 703                 /*
 704                  * We have to update the tree if mz is on RB-tree or
 705                  * mem is over its softlimit.
 706                  */
 707                 if (excess || mz->on_tree) {
 708                         unsigned long flags;
 709
 710                         spin_lock_irqsave(&mctz->lock, flags);
 711                         /* if on-tree, remove it */
 712                         if (mz->on_tree)
 713                                 __mem_cgroup_remove_exceeded(mz, mctz);
 714                         /*
 715                          * Insert again. mz->usage_in_excess will be updated.
 716                          * If excess is 0, no tree ops.
 717                          */
 718                         __mem_cgroup_insert_exceeded(mz, mctz, excess);
 719                         spin_unlock_irqrestore(&mctz->lock, flags);
 720                 }
 721         }
 722 }
 723
 724 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 725 {
 726         struct mem_cgroup_tree_per_zone *mctz;
 727         struct mem_cgroup_per_zone *mz;
 728         int nid, zid;
 729
 730         for_each_node(nid) {
 731                 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 732                         mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
 733                         mctz = soft_limit_tree_node_zone(nid, zid);
 734                         mem_cgroup_remove_exceeded(mz, mctz);
 735                 }
 736         }
 737 }
 738
 739 static struct mem_cgroup_per_zone *
 740 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 741 {
 742         struct rb_node *rightmost = NULL;
 743         struct mem_cgroup_per_zone *mz;
 744
 745 retry:
 746         mz = NULL;
 747         rightmost = rb_last(&mctz->rb_root);
 748         if (!rightmost)
 749                 goto done;              /* Nothing to reclaim from */
 750
 751         mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
 752         /*
 753          * Remove the node now but someone else can add it back,
 754          * we will to add it back at the end of reclaim to its correct
 755          * position in the tree.
 756          */
 757         __mem_cgroup_remove_exceeded(mz, mctz);
 758         if (!soft_limit_excess(mz->memcg) ||
 759             !css_tryget_online(&mz->memcg->css))
 760                 goto retry;
 761 done:
 762         return mz;
 763 }
 764
 765 static struct mem_cgroup_per_zone *
 766 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 767 {
 768         struct mem_cgroup_per_zone *mz;
 769
 770         spin_lock_irq(&mctz->lock);
 771         mz = __mem_cgroup_largest_soft_limit_node(mctz);
 772         spin_unlock_irq(&mctz->lock);
 773         return mz;
 774 }
 775
/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter have a threshold and do periodic
 * synchronization to implement "quick" reads. There is a trade-off between
 * reading cost and precision of the value. So we may have a chance to
 * implement periodic synchronization of the counters in memcg as well.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires the exact
 * value because they account memory. Even if we provided a quick-and-fuzzy
 * read, we would always have to visit all online cpus and make the sum. So,
 * for now, unnecessary synchronization is not implemented. (It is only
 * implemented for cpu hotplug.)
 *
 * If there are kernel internal actions which can make use of a not-exact
 * value, and reading all cpu values becomes a performance bottleneck in some
 * common workload, a threshold and synchronization as in vmstat[] should be
 * implemented.
 */
 795 static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
 796                                  enum mem_cgroup_stat_index idx)
 797 {
 798         long val = 0;
 799         int cpu;
 800
 801         get_online_cpus();
 802         for_each_online_cpu(cpu)
 803                 val += per_cpu(memcg->stat->count[idx], cpu);
 804 #ifdef CONFIG_HOTPLUG_CPU
 805         spin_lock(&memcg->pcp_counter_lock);
 806         val += memcg->nocpu_base.count[idx];
 807         spin_unlock(&memcg->pcp_counter_lock);
 808 #endif
 809         put_online_cpus();
 810         return val;
 811 }
 812
 813 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 814                                             enum mem_cgroup_events_index idx)
 815 {
 816         unsigned long val = 0;
 817         int cpu;
 818
 819         get_online_cpus();
 820         for_each_online_cpu(cpu)
 821                 val += per_cpu(memcg->stat->events[idx], cpu);
 822 #ifdef CONFIG_HOTPLUG_CPU
 823         spin_lock(&memcg->pcp_counter_lock);
 824         val += memcg->nocpu_base.events[idx];
 825         spin_unlock(&memcg->pcp_counter_lock);
 826 #endif
 827         put_online_cpus();
 828         return val;
 829 }
 830
 831 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 832                                          struct page *page,
 833                                          int nr_pages)
 834 {
 835         /*
 836          * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 837          * counted as CACHE even if it's on ANON LRU.
 838          */
 839         if (PageAnon(page))
 840                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 841                                 nr_pages);
 842         else
 843                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 844                                 nr_pages);
 845
 846         if (PageTransHuge(page))
 847                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 848                                 nr_pages);
 849
 850         /* pagein of a big page is an event. So, ignore page size */
 851         if (nr_pages > 0)
 852                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 853         else {
 854                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 855                 nr_pages = -nr_pages; /* for event */
 856         }
 857
 858         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 859 }
 860
 861 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 862 {
 863         struct mem_cgroup_per_zone *mz;
 864
 865         mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
 866         return mz->lru_size[lru];
 867 }
 868
 869 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 870                                                   int nid,
 871                                                   unsigned int lru_mask)
 872 {
 873         unsigned long nr = 0;
 874         int zid;
 875
 876         VM_BUG_ON((unsigned)nid >= nr_node_ids);
 877
 878         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
 879                 struct mem_cgroup_per_zone *mz;
 880                 enum lru_list lru;
 881
 882                 for_each_lru(lru) {
 883                         if (!(BIT(lru) & lru_mask))
 884                                 continue;
 885                         mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
 886                         nr += mz->lru_size[lru];
 887                 }
 888         }
 889         return nr;
 890 }
 891
 892 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 893                         unsigned int lru_mask)
 894 {
 895         unsigned long nr = 0;
 896         int nid;
 897
 898         for_each_node_state(nid, N_MEMORY)
 899                 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 900         return nr;
 901 }
 902
 903 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 904                                        enum mem_cgroup_events_target target)
 905 {
 906         unsigned long val, next;
 907
 908         val = __this_cpu_read(memcg->stat->nr_page_events);
 909         next = __this_cpu_read(memcg->stat->targets[target]);
 910         /* from time_after() in jiffies.h */
 911         if ((long)next - (long)val < 0) {
 912                 switch (target) {
 913                 case MEM_CGROUP_TARGET_THRESH:
 914                         next = val + THRESHOLDS_EVENTS_TARGET;
 915                         break;
 916                 case MEM_CGROUP_TARGET_SOFTLIMIT:
 917                         next = val + SOFTLIMIT_EVENTS_TARGET;
 918                         break;
 919                 case MEM_CGROUP_TARGET_NUMAINFO:
 920                         next = val + NUMAINFO_EVENTS_TARGET;
 921                         break;
 922                 default:
 923                         break;
 924                 }
 925                 __this_cpu_write(memcg->stat->targets[target], next);
 926                 return true;
 927         }
 928         return false;
 929 }
 930
 931 /*
 932  * Check events in order.
 933  *
 934  */
 935 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 936 {
 937         /* threshold event is triggered in finer grain than soft limit */
 938         if (unlikely(mem_cgroup_event_ratelimit(memcg,
 939                                                 MEM_CGROUP_TARGET_THRESH))) {
 940                 bool do_softlimit;
 941                 bool do_numainfo __maybe_unused;
 942
 943                 do_softlimit = mem_cgroup_event_ratelimit(memcg,
 944                                                 MEM_CGROUP_TARGET_SOFTLIMIT);
 945 #if MAX_NUMNODES > 1
 946                 do_numainfo = mem_cgroup_event_ratelimit(memcg,
 947                                                 MEM_CGROUP_TARGET_NUMAINFO);
 948 #endif
 949                 mem_cgroup_threshold(memcg);
 950                 if (unlikely(do_softlimit))
 951                         mem_cgroup_update_tree(memcg, page);
 952 #if MAX_NUMNODES > 1
 953                 if (unlikely(do_numainfo))
 954                         atomic_inc(&memcg->numainfo_events);
 955 #endif
 956         }
 957 }
 958
 959 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 960 {
 961         /*
 962          * mm_update_next_owner() may clear mm->owner to NULL
 963          * if it races with swapoff, page migration, etc.
 964          * So this can be called with p == NULL.
 965          */
 966         if (unlikely(!p))
 967                 return NULL;
 968
 969         return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 970 }
 971
 972 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 973 {
 974         struct mem_cgroup *memcg = NULL;
 975
 976         rcu_read_lock();
 977         do {
 978                 /*
 979                  * Page cache insertions can happen withou an
 980                  * actual mm context, e.g. during disk probing
 981                  * on boot, loopback IO, acct() writes etc.
 982                  */
 983                 if (unlikely(!mm))
 984                         memcg = root_mem_cgroup;
 985                 else {
 986                         memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 987                         if (unlikely(!memcg))
 988                                 memcg = root_mem_cgroup;
 989                 }
 990         } while (!css_tryget_online(&memcg->css));
 991         rcu_read_unlock();
 992         return memcg;
 993 }
 994
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root, NULL means root_mem_cgroup
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.  Also returns %NULL
 * when the memory controller is disabled.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 * The reference on @prev is dropped here unless @prev is @root.
 *
 * Reclaimers can specify a zone and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same zone and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                   struct mem_cgroup *prev,
                                   struct mem_cgroup_reclaim_cookie *reclaim)
{
        struct reclaim_iter *uninitialized_var(iter);
        struct cgroup_subsys_state *css = NULL;
        struct mem_cgroup *memcg = NULL;
        struct mem_cgroup *pos = NULL;

        if (mem_cgroup_disabled())
                return NULL;

        if (!root)
                root = root_mem_cgroup;

        /*
         * Full walks resume right after @prev; shared reclaim walks
         * resume from the position stored in the shared iterator below.
         */
        if (prev && !reclaim)
                pos = prev;

        /*
         * A non-hierarchical, non-root memcg is a walk of exactly one
         * group: hand out @root first, then end the walk with NULL.
         */
        if (!root->use_hierarchy && root != root_mem_cgroup) {
                if (prev)
                        goto out;
                return root;
        }

        rcu_read_lock();

        if (reclaim) {
                struct mem_cgroup_per_zone *mz;

                mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);
                iter = &mz->iter[reclaim->priority];

                /*
                 * The shared iterator's generation has changed since
                 * this walk recorded it: end this walk (memcg is still
                 * NULL at this point).
                 */
                if (prev && reclaim->generation != iter->generation)
                        goto out_unlock;

                do {
                        pos = READ_ONCE(iter->position);
                        /*
                         * A racing update may change the position and
                         * put the last reference, hence css_tryget(),
                         * or retry to see the updated position.
                         */
                } while (pos && !css_tryget(&pos->css));
        }

        if (pos)
                css = &pos->css;

        for (;;) {
                css = css_next_descendant_pre(css, &root->css);
                if (!css) {
                        /*
                         * Reclaimers share the hierarchy walk, and a
                         * new one might jump in right at the end of
                         * the hierarchy - make sure they see at least
                         * one group and restart from the beginning.
                         */
                        if (!prev)
                                continue;
                        break;
                }

                /*
                 * Verify the css and acquire a reference.  The root
                 * is provided by the caller, so we know it's alive
                 * and kicking, and don't take an extra reference.
                 */
                memcg = mem_cgroup_from_css(css);

                if (css == &root->css)
                        break;

                if (css_tryget(css)) {
                        /*
                         * Make sure the memcg is initialized:
                         * mem_cgroup_css_online() orders the
                         * initialization against setting the flag.
                         */
                        if (smp_load_acquire(&memcg->initialized))
                                break;

                        css_put(css);
                }

                /* unusable group: forget it and try the next one */
                memcg = NULL;
        }

        if (reclaim) {
                /*
                 * Publish the new position only if nobody else moved it
                 * since it was read above.  On success, the reference
                 * held on behalf of iter->position moves from the old
                 * position to the new one.
                 */
                if (cmpxchg(&iter->position, pos, memcg) == pos) {
                        if (memcg)
                                css_get(&memcg->css);
                        if (pos)
                                css_put(&pos->css);
                }

                /*
                 * pairs with css_tryget when dereferencing iter->position
                 * above.
                 */
                if (pos)
                        css_put(&pos->css);

                /*
                 * A NULL result ends a round-trip and starts a new
                 * generation; the first call of a walk records the
                 * generation it belongs to.
                 */
                if (!memcg)
                        iter->generation++;
                else if (!prev)
                        reclaim->generation = iter->generation;
        }

out_unlock:
        rcu_read_unlock();
out:
        /* drop the reference handed out with @prev; @root never got one */
        if (prev && prev != root)
                css_put(&prev->css);

        return memcg;
}
1128
1129 /**
1130  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1131  * @root: hierarchy root
1132  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1133  */
1134 void mem_cgroup_iter_break(struct mem_cgroup *root,
1135                            struct mem_cgroup *prev)
1136 {
1137         if (!root)
1138                 root = root_mem_cgroup;
1139         if (prev && prev != root)
1140                 css_put(&prev->css);
1141 }
1142
/*
 * Iteration constructs for visiting all cgroups (under a tree).  If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 *
 * Both are full walks: they call mem_cgroup_iter() without a reclaim
 * cookie and pass the previous element back in on each step.
 *
 * for_each_mem_cgroup_tree(iter, root) walks the hierarchy below @root;
 * for_each_mem_cgroup(iter) passes a NULL root to mem_cgroup_iter().
 */
#define for_each_mem_cgroup_tree(iter, root)            \
        for (iter = mem_cgroup_iter(root, NULL, NULL);  \
             iter != NULL;                              \
             iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)                       \
        for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
             iter != NULL;                              \
             iter = mem_cgroup_iter(NULL, iter, NULL))
1157
1158 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
1159 {
1160         struct mem_cgroup *memcg;
1161
1162         rcu_read_lock();
1163         memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
1164         if (unlikely(!memcg))
1165                 goto out;
1166
1167         switch (idx) {
1168         case PGFAULT:
1169                 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
1170                 break;
1171         case PGMAJFAULT:
1172                 this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
1173                 break;
1174         default:
1175                 BUG();
1176         }
1177 out:
1178         rcu_read_unlock();
1179 }
1180 EXPORT_SYMBOL(__mem_cgroup_count_vm_event);
1181
1182 /**
1183  * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1184  * @zone: zone of the wanted lruvec
1185  * @memcg: memcg of the wanted lruvec
1186  *
1187  * Returns the lru list vector holding pages for the given @zone and
1188  * @mem.  This can be the global zone lruvec, if the memory controller
1189  * is disabled.
1190  */
1191 struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1192                                       struct mem_cgroup *memcg)
1193 {
1194         struct mem_cgroup_per_zone *mz;
1195         struct lruvec *lruvec;
1196
1197         if (mem_cgroup_disabled()) {
1198                 lruvec = &zone->lruvec;
1199                 goto out;
1200         }
1201
1202         mz = mem_cgroup_zone_zoneinfo(memcg, zone);
1203         lruvec = &mz->lruvec;
1204 out:
1205         /*
1206          * Since a node can be onlined after the mem_cgroup was created,
1207          * we have to be prepared to initialize lruvec->zone here;
1208          * and if offlined then reonlined, we need to reinitialize it.
1209          */
1210         if (unlikely(lruvec->zone != zone))
1211                 lruvec->zone = zone;
1212         return lruvec;
1213 }
1214
1215 /**
1216  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
1217  * @page: the page
1218  * @zone: zone of the page
1219  *
1220  * This function is only safe when following the LRU page isolation
1221  * and putback protocol: the LRU lock must be held, and the page must
1222  * either be PageLRU() or the caller must have isolated/allocated it.
1223  */
1224 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1225 {
1226         struct mem_cgroup_per_zone *mz;
1227         struct mem_cgroup *memcg;
1228         struct lruvec *lruvec;
1229
1230         if (mem_cgroup_disabled()) {
1231                 lruvec = &zone->lruvec;
1232                 goto out;
1233         }
1234
1235         memcg = page->mem_cgroup;
1236         /*
1237          * Swapcache readahead pages are added to the LRU - and
1238          * possibly migrated - before they are charged.
1239          */
1240         if (!memcg)
1241                 memcg = root_mem_cgroup;
1242
1243         mz = mem_cgroup_page_zoneinfo(memcg, page);
1244         lruvec = &mz->lruvec;
1245 out:
1246         /*
1247          * Since a node can be onlined after the mem_cgroup was created,
1248          * we have to be prepared to initialize lruvec->zone here;
1249          * and if offlined then reonlined, we need to reinitialize it.
1250          */
1251         if (unlikely(lruvec->zone != zone))
1252                 lruvec->zone = zone;
1253         return lruvec;
1254 }
1255
1256 /**
1257  * mem_cgroup_update_lru_size - account for adding or removing an lru page
1258  * @lruvec: mem_cgroup per zone lru vector
1259  * @lru: index of lru list the page is sitting on
1260  * @nr_pages: positive when adding or negative when removing
1261  *
1262  * This function must be called when a page is added to or removed from an
1263  * lru list.
1264  */
1265 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1266                                 int nr_pages)
1267 {
1268         struct mem_cgroup_per_zone *mz;
1269         unsigned long *lru_size;
1270
1271         if (mem_cgroup_disabled())
1272                 return;
1273
1274         mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1275         lru_size = mz->lru_size + lru;
1276         *lru_size += nr_pages;
1277         VM_BUG_ON((long)(*lru_size) < 0);
1278 }
1279
1280 bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root)
1281 {
1282         if (root == memcg)
1283                 return true;
1284         if (!root->use_hierarchy)
1285                 return false;
1286         return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup);
1287 }
1288
1289 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1290 {
1291         struct mem_cgroup *task_memcg;
1292         struct task_struct *p;
1293         bool ret;
1294
1295         p = find_lock_task_mm(task);
1296         if (p) {
1297                 task_memcg = get_mem_cgroup_from_mm(p->mm);
1298                 task_unlock(p);
1299         } else {
1300                 /*
1301                  * All threads may have already detached their mm's, but the oom
1302                  * killer still needs to detect if they have already been oom
1303                  * killed to prevent needlessly killing additional tasks.
1304                  */
1305                 rcu_read_lock();
1306                 task_memcg = mem_cgroup_from_task(task);
1307                 css_get(&task_memcg->css);
1308                 rcu_read_unlock();
1309         }
1310         ret = mem_cgroup_is_descendant(task_memcg, memcg);
1311         css_put(&task_memcg->css);
1312         return ret;
1313 }
1314
1315 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1316 {
1317         unsigned long inactive_ratio;
1318         unsigned long inactive;
1319         unsigned long active;
1320         unsigned long gb;
1321
1322         inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1323         active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1324
1325         gb = (inactive + active) >> (30 - PAGE_SHIFT);
1326         if (gb)
1327                 inactive_ratio = int_sqrt(10 * gb);
1328         else
1329                 inactive_ratio = 1;
1330
1331         return inactive * inactive_ratio < active;
1332 }
1333
1334 bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
1335 {
1336         struct mem_cgroup_per_zone *mz;
1337         struct mem_cgroup *memcg;
1338
1339         if (mem_cgroup_disabled())
1340                 return true;
1341
1342         mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1343         memcg = mz->memcg;
1344
1345         return !!(memcg->css.flags & CSS_ONLINE);
1346 }
1347
/*
 * Map a pointer to a counter embedded in struct mem_cgroup back to the
 * containing mem_cgroup; @member names the field @counter points at.
 */
#define mem_cgroup_from_counter(counter, member)        \
        container_of(counter, struct mem_cgroup, member)
1350
1351 /**
1352  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1353  * @memcg: the memory cgroup
1354  *
1355  * Returns the maximum amount of memory @mem can be charged with, in
1356  * pages.
1357  */
1358 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1359 {
1360         unsigned long margin = 0;
1361         unsigned long count;
1362         unsigned long limit;
1363
1364         count = page_counter_read(&memcg->memory);
1365         limit = READ_ONCE(memcg->memory.limit);
1366         if (count < limit)
1367                 margin = limit - count;
1368
1369         if (do_swap_account) {
1370                 count = page_counter_read(&memcg->memsw);
1371                 limit = READ_ONCE(memcg->memsw.limit);
1372                 if (count <= limit)
1373                         margin = min(margin, limit - count);
1374         }
1375
1376         return margin;
1377 }
1378
1379 int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1380 {
1381         /* root ? */
1382         if (mem_cgroup_disabled() || !memcg->css.parent)
1383                 return vm_swappiness;
1384
1385         return memcg->swappiness;
1386 }
1387
1388 /*
1389  * A routine for checking "mem" is under move_account() or not.
1390  *
1391  * Checking a cgroup is mc.from or mc.to or under hierarchy of
1392  * moving cgroups. This is for waiting at high-memory pressure
1393  * caused by "move".
1394  */
1395 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1396 {
1397         struct mem_cgroup *from;
1398         struct mem_cgroup *to;
1399         bool ret = false;
1400         /*
1401          * Unlike task_move routines, we access mc.to, mc.from not under
1402          * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1403          */
1404         spin_lock(&mc.lock);
1405         from = mc.from;
1406         to = mc.to;
1407         if (!from)
1408                 goto unlock;
1409
1410         ret = mem_cgroup_is_descendant(from, memcg) ||
1411                 mem_cgroup_is_descendant(to, memcg);
1412 unlock:
1413         spin_unlock(&mc.lock);
1414         return ret;
1415 }
1416
/*
 * If a task other than current is moving charges and @memcg is involved
 * in that move (see mem_cgroup_under_move()), sleep on mc.waitq and
 * return true.  Returns false without sleeping otherwise.
 */
static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
{
        /* the mover itself must not wait for its own move */
        if (mc.moving_task && current != mc.moving_task) {
                if (mem_cgroup_under_move(memcg)) {
                        DEFINE_WAIT(wait);
                        prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
                        /* moving charge context might have finished. */
                        if (mc.moving_task)
                                schedule();
                        finish_wait(&mc.waitq, &wait);
                        /* true even if the move ended before we slept */
                        return true;
                }
        }
        return false;
}
1432
/* Convert a page count to kilobytes. */
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
 * @memcg: The memory cgroup that went over limit
 * @p: Task that is going to be killed, may be NULL
 *
 * Prints the cgroup path(s), the usage/limit/failcnt of the memory,
 * memory+swap and kmem counters of @memcg, and the statistics and LRU
 * sizes of every memcg in the hierarchy below @memcg.
 *
 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
 * enabled
 */
void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
{
        /* oom_info_lock ensures that parallel ooms do not interleave */
        static DEFINE_MUTEX(oom_info_lock);
        struct mem_cgroup *iter;
        unsigned int i;

        mutex_lock(&oom_info_lock);
        /* the cgroup path lookups below run under RCU */
        rcu_read_lock();

        if (p) {
                pr_info("Task in ");
                pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
                pr_cont(" killed as a result of limit of ");
        } else {
                pr_info("Memory limit reached of cgroup ");
        }

        pr_cont_cgroup_path(memcg->css.cgroup);
        pr_cont("\n");

        rcu_read_unlock();

        /* counters are kept in pages; K() converts them for display */
        pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
                K((u64)page_counter_read(&memcg->memory)),
                K((u64)memcg->memory.limit), memcg->memory.failcnt);
        pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
                K((u64)page_counter_read(&memcg->memsw)),
                K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
        pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
                K((u64)page_counter_read(&memcg->kmem)),
                K((u64)memcg->kmem.limit), memcg->kmem.failcnt);

        /* one line of statistics per memcg in the hierarchy */
        for_each_mem_cgroup_tree(iter, memcg) {
                pr_info("Memory cgroup stats for ");
                pr_cont_cgroup_path(iter->css.cgroup);
                pr_cont(":");

                for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                        /* the swap statistic is skipped without swap accounting */
                        if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
                                continue;
                        pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
                                K(mem_cgroup_read_stat(iter, i)));
                }

                for (i = 0; i < NR_LRU_LISTS; i++)
                        pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
                                K(mem_cgroup_nr_lru_pages(iter, BIT(i))));

                pr_cont("\n");
        }
        mutex_unlock(&oom_info_lock);
}
1495
1496 /*
1497  * This function returns the number of memcg under hierarchy tree. Returns
1498  * 1(self count) if no children.
1499  */
1500 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1501 {
1502         int num = 0;
1503         struct mem_cgroup *iter;
1504
1505         for_each_mem_cgroup_tree(iter, memcg)
1506                 num++;
1507         return num;
1508 }
1509
1510 /*
1511  * Return the memory (and swap, if configured) limit for a memcg.
1512  */
1513 static unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1514 {
1515         unsigned long limit;
1516
1517         limit = memcg->memory.limit;
1518         if (mem_cgroup_swappiness(memcg)) {
1519                 unsigned long memsw_limit;
1520
1521                 memsw_limit = memcg->memsw.limit;
1522                 limit = min(limit + total_swap_pages, memsw_limit);
1523         }
1524         return limit;
1525 }
1526
/*
 * Handle an out-of-memory condition in @memcg: scan every task of every
 * memcg in the hierarchy below @memcg, pick the one with the highest
 * badness score and hand it to oom_kill_process().
 *
 * @gfp_mask and @order are passed through to check_panic_on_oom() and
 * oom_kill_process().
 */
static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                     int order)
{
        struct mem_cgroup *iter;
        unsigned long chosen_points = 0;
        unsigned long totalpages;
        unsigned int points = 0;
        struct task_struct *chosen = NULL;

        /*
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
        if (fatal_signal_pending(current) || task_will_free_mem(current)) {
                mark_tsk_oom_victim(current);
                return;
        }

        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
        /* never 0: totalpages is used as a divisor below */
        totalpages = mem_cgroup_get_limit(memcg) ? : 1;
        for_each_mem_cgroup_tree(iter, memcg) {
                struct css_task_iter it;
                struct task_struct *task;

                css_task_iter_start(&iter->css, &it);
                while ((task = css_task_iter_next(&it))) {
                        switch (oom_scan_process_thread(task, totalpages, NULL,
                                                        false)) {
                        case OOM_SCAN_SELECT:
                                /* take this task with the maximum score */
                                if (chosen)
                                        put_task_struct(chosen);
                                chosen = task;
                                chosen_points = ULONG_MAX;
                                get_task_struct(chosen);
                                /* fall through */
                        case OOM_SCAN_CONTINUE:
                                continue;
                        case OOM_SCAN_ABORT:
                                /*
                                 * Give up: tear down both iterators and
                                 * drop the reference on any candidate.
                                 */
                                css_task_iter_end(&it);
                                mem_cgroup_iter_break(memcg, iter);
                                if (chosen)
                                        put_task_struct(chosen);
                                return;
                        case OOM_SCAN_OK:
                                break;
                        };
                        points = oom_badness(task, memcg, NULL, totalpages);
                        if (!points || points < chosen_points)
                                continue;
                        /* Prefer thread group leaders for display purposes */
                        if (points == chosen_points &&
                            thread_group_leader(chosen))
                                continue;

                        /* new best candidate: swap the task reference */
                        if (chosen)
                                put_task_struct(chosen);
                        chosen = task;
                        chosen_points = points;
                        get_task_struct(chosen);
                }
                css_task_iter_end(&it);
        }

        if (!chosen)
                return;
        /* scale the score to a per-mille share of totalpages */
        points = chosen_points * 1000 / totalpages;
        oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
                         NULL, "Memory cgroup out of memory");
}
1597
1598 #if MAX_NUMNODES > 1
1599
1600 /**
1601  * test_mem_cgroup_node_reclaimable
1602  * @memcg: the target memcg
1603  * @nid: the node ID to be checked.
1604  * @noswap : specify true here if the user wants flle only information.
1605  *
1606  * This function returns whether the specified memcg contains any
1607  * reclaimable pages on a node. Returns true if there are any reclaimable
1608  * pages in the node.
1609  */
1610 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1611                 int nid, bool noswap)
1612 {
1613         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1614                 return true;
1615         if (noswap || !total_swap_pages)
1616                 return false;
1617         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1618                 return true;
1619         return false;
1620
1621 }
1622
1623 /*
1624  * Always updating the nodemask is not very good - even if we have an empty
1625  * list or the wrong list here, we can start from some node and traverse all
1626  * nodes based on the zonelist. So update the list loosely once per 10 secs.
1627  *
1628  */
1629 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1630 {
1631         int nid;
1632         /*
1633          * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1634          * pagein/pageout changes since the last update.
1635          */
1636         if (!atomic_read(&memcg->numainfo_events))
1637                 return;
1638         if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1639                 return;
1640
1641         /* make a nodemask where this memcg uses memory from */
1642         memcg->scan_nodes = node_states[N_MEMORY];
1643
1644         for_each_node_mask(nid, node_states[N_MEMORY]) {
1645
1646                 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1647                         node_clear(nid, memcg->scan_nodes);
1648         }
1649
1650         atomic_set(&memcg->numainfo_events, 0);
1651         atomic_set(&memcg->numainfo_updating, 0);
1652 }
1653
1654 /*
1655  * Selecting a node where we start reclaim from. Because what we need is just
1656  * reducing usage counter, start from anywhere is O,K. Considering
1657  * memory reclaim from current node, there are pros. and cons.
1658  *
1659  * Freeing memory from current node means freeing memory from a node which
1660  * we'll use or we've used. So, it may make LRU bad. And if several threads
1661  * hit limits, it will see a contention on a node. But freeing from remote
1662  * node means more costs for memory reclaim because of memory latency.
1663  *
1664  * Now, we use round-robin. Better algorithm is welcomed.
1665  */
1666 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1667 {
1668         int node;
1669
1670         mem_cgroup_may_update_nodemask(memcg);
1671         node = memcg->last_scanned_node;
1672
1673         node = next_node(node, memcg->scan_nodes);
1674         if (node == MAX_NUMNODES)
1675                 node = first_node(memcg->scan_nodes);
1676         /*
1677          * We call this when we hit limit, not when pages are added to LRU.
1678          * No LRU may hold pages because all pages are UNEVICTABLE or
1679          * memcg is too small and all pages are not on LRU. In that case,
1680          * we use curret node.
1681          */
1682         if (unlikely(node == MAX_NUMNODES))
1683                 node = numa_node_id();
1684
1685         memcg->last_scanned_node = node;
1686         return node;
1687 }
1688 #else
/* MAX_NUMNODES <= 1 build: node 0 is the only candidate. */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
        return 0;
}
1693 #endif
1694
/*
 * Reclaim from the memcgs in the hierarchy below @root_memcg on @zone,
 * using a shared reclaim walk, until @root_memcg is no longer over its
 * soft limit or the loop limits below are hit.
 *
 * Returns the sum of the values returned by
 * mem_cgroup_shrink_node_zone(); the number of pages scanned is added
 * to *@total_scanned.
 */
static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
                                   struct zone *zone,
                                   gfp_t gfp_mask,
                                   unsigned long *total_scanned)
{
        struct mem_cgroup *victim = NULL;
        int total = 0;
        int loop = 0;
        unsigned long excess;
        unsigned long nr_scanned;
        struct mem_cgroup_reclaim_cookie reclaim = {
                .zone = zone,
                .priority = 0,
        };

        /* excess at entry; sets the reclaim goal checked below */
        excess = soft_limit_excess(root_memcg);

        while (1) {
                victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
                if (!victim) {
                        /* NULL marks the end of one pass over the hierarchy */
                        loop++;
                        if (loop >= 2) {
                                /*
                                 * If we have not been able to reclaim
                                 * anything, it might because there are
                                 * no reclaimable pages under this hierarchy
                                 */
                                if (!total)
                                        break;
                                /*
                                 * We want to do more targeted reclaim.
                                 * excess >> 2 is not to excessive so as to
                                 * reclaim too much, nor too less that we keep
                                 * coming back to reclaim from this cgroup
                                 */
                                if (total >= (excess >> 2) ||
                                        (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
                                        break;
                        }
                        continue;
                }
                total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
                                                     zone, &nr_scanned);
                *total_scanned += nr_scanned;
                /* back under the soft limit: done */
                if (!soft_limit_excess(root_memcg))
                        break;
        }
        /* drops the reference on victim, if one is still held */
        mem_cgroup_iter_break(root_memcg, victim);
        return total;
}
1745
1746 #ifdef CONFIG_LOCKDEP
1747 static struct lockdep_map memcg_oom_lock_dep_map = {    /* lockdep view of the OOM lock */
1748         .name = "memcg_oom_lock",
1749 };
1750 #endif
1751
1752 static DEFINE_SPINLOCK(memcg_oom_lock);        /* held while the per-memcg oom_lock flags change */
1753
1754 /*
1755  * Try to take the OOM lock on @memcg and every memcg in its subtree.
1756  * Returns false, leaving no flag set by us, if any of them is already locked.
1757  */
1758 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1759 {
1760         struct mem_cgroup *iter, *failed = NULL;
1761
1762         spin_lock(&memcg_oom_lock);
1763
1764         for_each_mem_cgroup_tree(iter, memcg) {
1765                 if (iter->oom_lock) {
1766                         /*
1767                          * this subtree of our hierarchy is already locked
1768                          * so we cannot give a lock.
1769                          */
1770                         failed = iter;
1771                         mem_cgroup_iter_break(memcg, iter);
1772                         break;
1773                 } else
1774                         iter->oom_lock = true;
1775         }
1776
1777         if (failed) {
1778                 /*
1779                  * OK, we failed to lock the whole subtree so we have
1780                  * to clean up what we set up to the failing subtree
1781                  */
1782                 for_each_mem_cgroup_tree(iter, memcg) {
1783                         if (iter == failed) {   /* flags past this point were never ours */
1784                                 mem_cgroup_iter_break(memcg, iter);
1785                                 break;
1786                         }
1787                         iter->oom_lock = false;
1788                 }
1789         } else
1790                 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);        /* lockdep annotation */
1791
1792         spin_unlock(&memcg_oom_lock);
1793
1794         return !failed;
1795 }
1796
1797 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1798 {
1799         struct mem_cgroup *pos;
1800
1801         spin_lock(&memcg_oom_lock);
1802         mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1803         for_each_mem_cgroup_tree(pos, memcg)
1804                 pos->oom_lock = false;  /* clear the flag across the subtree */
1805         spin_unlock(&memcg_oom_lock);
1806 }
1807
1808 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1809 {
1810         struct mem_cgroup *pos;
1811
1812         for_each_mem_cgroup_tree(pos, memcg)
1813                 atomic_inc(&pos->under_oom);    /* one count per subtree member */
1814 }
1815
1816 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1817 {
1818         struct mem_cgroup *pos;
1819
1820         /*
1821          * A child created while the hierarchy was under OOM may never have
1822          * had its counter raised by mem_cgroup_mark_under_oom(), so use
1823          * atomic_add_unless() to avoid taking such a counter below zero.
1824          */
1825         for_each_mem_cgroup_tree(pos, memcg)
1826                 atomic_add_unless(&pos->under_oom, -1, 0);
1827 }
1828
1829 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);       /* one queue shared by all memcg OOM waiters */
1830
1831 struct oom_wait_info {
1832         struct mem_cgroup *memcg;       /* memcg whose OOM the task waits on */
1833         wait_queue_t    wait;           /* entry queued on memcg_oom_waitq */
1834 };
1835
1836 static int memcg_oom_wake_function(wait_queue_t *wait,
1837         unsigned mode, int sync, void *arg)
1838 {
1839         struct mem_cgroup *waiter_memcg, *waker_memcg = arg;
1840         struct oom_wait_info *info;
1841
1842         info = container_of(wait, struct oom_wait_info, wait);
1843         waiter_memcg = info->memcg;
1844
1845         /* Only wake a waiter whose memcg is related to the waking memcg. */
1846         if (mem_cgroup_is_descendant(waker_memcg, waiter_memcg) ||
1847             mem_cgroup_is_descendant(waiter_memcg, waker_memcg))
1848                 return autoremove_wake_function(wait, mode, sync, arg);
1849         return 0;
1850 }
1851
1852 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1853 {
1854         atomic_inc(&memcg->oom_wakeups);
1855         /* Pass @memcg as the wake key so memcg_oom_wake_function() can filter. */
1856         __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1857 }
1858
1859 static void memcg_oom_recover(struct mem_cgroup *memcg)
1860 {
1861         if (memcg != NULL && atomic_read(&memcg->under_oom) != 0)
1862                 memcg_wakeup_oom(memcg);        /* wake tasks waiting on this OOM */
1863 }
1864
1865 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1866 {
1867         if (!current->memcg_oom.may_oom)
1868                 return;         /* nothing is recorded unless may_oom is set */
1869         /*
1870          * We are in the middle of the charge context here, so we
1871          * don't want to block when potentially sitting on a callstack
1872          * that holds all kinds of filesystem and mm locks.
1873          *
1874          * Also, the caller may handle a failed allocation gracefully
1875          * (like optional page cache readahead) and so an OOM killer
1876          * invocation might not even be necessary.
1877          *
1878          * That's why we don't do anything here except remember the
1879          * OOM context and then deal with it at the end of the page
1880          * fault when the stack is unwound, the locks are released,
1881          * and when we know whether the fault was overall successful.
1882          */
1883         css_get(&memcg->css);   /* dropped in mem_cgroup_oom_synchronize() */
1884         current->memcg_oom.memcg = memcg;
1885         current->memcg_oom.gfp_mask = mask;
1886         current->memcg_oom.order = order;
1887 }
1888
1889 /**
1890  * mem_cgroup_oom_synchronize - complete memcg OOM handling
1891  * @handle: actually kill/wait or just clean up the OOM state
1892  *
1893  * This has to be called at the end of a page fault if the memcg OOM
1894  * handler was enabled.
1895  *
1896  * Memcg supports userspace OOM handling where failed allocations must
1897  * sleep on a waitqueue until the userspace task resolves the
1898  * situation.  Sleeping directly in the charge context with all kinds
1899  * of locks held is not a good idea, instead we remember an OOM state
1900  * in the task and mem_cgroup_oom_synchronize() has to be called at
1901  * the end of the page fault to complete the OOM handling.
1902  *
1903  * Returns %true if an ongoing memcg OOM situation was detected and
1904  * completed, %false otherwise.
1905  */
1906 bool mem_cgroup_oom_synchronize(bool handle)
1907 {
1908         struct mem_cgroup *memcg = current->memcg_oom.memcg;
1909         struct oom_wait_info owait;
1910         bool locked;
1911
1912         /* No memcg OOM was recorded for this task: not ours to handle */
1913         if (!memcg)
1914                 return false;
1915
1916         if (!handle || oom_killer_disabled)
1917                 goto cleanup;   /* only forget the recorded OOM state */
1918
1919         owait.memcg = memcg;
1920         owait.wait.flags = 0;
1921         owait.wait.func = memcg_oom_wake_function;
1922         owait.wait.private = current;
1923         INIT_LIST_HEAD(&owait.wait.task_list);
1924
1925         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1926         mem_cgroup_mark_under_oom(memcg);
1927
1928         locked = mem_cgroup_oom_trylock(memcg);
1929
1930         if (locked)
1931                 mem_cgroup_oom_notify(memcg);
1932
1933         if (locked && !memcg->oom_kill_disable) {
1934                 mem_cgroup_unmark_under_oom(memcg);
1935                 finish_wait(&memcg_oom_waitq, &owait.wait);
1936                 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
1937                                          current->memcg_oom.order);
1938         } else {
1939                 schedule();     /* sleep on memcg_oom_waitq in TASK_KILLABLE state */
1940                 mem_cgroup_unmark_under_oom(memcg);
1941                 finish_wait(&memcg_oom_waitq, &owait.wait);
1942         }
1943
1944         if (locked) {
1945                 mem_cgroup_oom_unlock(memcg);
1946                 /*
1947                  * There is no guarantee that an OOM-lock contender
1948                  * sees the wakeups triggered by the OOM kill
1949                  * uncharges.  Wake any sleepers explicitly.
1950                  */
1951                 memcg_oom_recover(memcg);
1952         }
1953 cleanup:
1954         current->memcg_oom.memcg = NULL;
1955         css_put(&memcg->css);   /* pairs with css_get() in mem_cgroup_oom() */
1956         return true;
1957 }
1958
1959 /**
1960  * mem_cgroup_begin_page_stat - begin a page state statistics transaction
1961  * @page: page that is going to change accounted state
1962  *
1963  * This function must mark the beginning of an accounted page state
1964  * change to prevent double accounting when the page is concurrently
1965  * being moved to another memcg:
1966  *
1967  *   memcg = mem_cgroup_begin_page_stat(page);
1968  *   if (TestClearPageState(page))
1969  *     mem_cgroup_update_page_stat(memcg, state, -1);
1970  *   mem_cgroup_end_page_stat(memcg);
1971  */
1972 struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
1973 {
1974         struct mem_cgroup *memcg;
1975         unsigned long flags;
1976
1977         /*
1978          * The RCU lock is held throughout the transaction.  The fast
1979          * path can get away without acquiring the memcg->move_lock
1980          * because page moving starts with an RCU grace period.
1981          *
1982          * The RCU lock also protects the memcg from being freed when
1983          * the page state that is going to change is the only thing
1984          * preventing the page from being uncharged.
1985          * E.g. end-writeback clearing PageWriteback(), which allows
1986          * migration to go ahead and uncharge the page before the
1987          * account transaction might be complete.
1988          */
1989         rcu_read_lock();
1990
1991         if (mem_cgroup_disabled())
1992                 return NULL;    /* returns with the RCU read lock still held */
1993 again:
1994         memcg = page->mem_cgroup;
1995         if (unlikely(!memcg))
1996                 return NULL;    /* page has no memcg; RCU read lock still held */
1997
1998         if (atomic_read(&memcg->moving_account) <= 0)
1999                 return memcg;   /* fast path: move_lock is not taken */
2000
2001         spin_lock_irqsave(&memcg->move_lock, flags);
2002         if (memcg != page->mem_cgroup) {        /* page changed memcg meanwhile: retry */
2003                 spin_unlock_irqrestore(&memcg->move_lock, flags);
2004                 goto again;
2005         }
2006
2007         /*
2008          * When charge migration first begins, we can have locked and
2009          * unlocked page stat updates happening concurrently.  Track
2010          * the task who has the lock for mem_cgroup_end_page_stat().
2011          */
2012         memcg->move_lock_task = current;
2013         memcg->move_lock_flags = flags;
2014
2015         return memcg;   /* slow path: returns with move_lock held, irq state saved */
2016 }
2017
2018 /**
2019  * mem_cgroup_end_page_stat - finish a page state statistics transaction
2020  * @memcg: the memcg returned by mem_cgroup_begin_page_stat(), may be NULL
2021  */
2022 void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
2023 {
2024         unsigned long saved_flags;
2025
2026         if (memcg && memcg->move_lock_task == current) {
2027                 saved_flags = memcg->move_lock_flags;
2028                 memcg->move_lock_task = NULL;
2029                 memcg->move_lock_flags = 0;
2030                 spin_unlock_irqrestore(&memcg->move_lock, saved_flags);
2031         }
2032
2033         rcu_read_unlock();
2034 }
2035
2036 /**
2037  * mem_cgroup_update_page_stat - update page state statistics
2038  * @memcg: memcg to account against, may be NULL (then nothing is counted)
2039  * @idx: page state item to account
2040  * @val: number of pages (positive or negative)
2041  *
2042  * See mem_cgroup_begin_page_stat() for locking requirements.
2043  */
2044 void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
2045                                  enum mem_cgroup_stat_index idx, int val)
2046 {
2047         VM_BUG_ON(!rcu_read_lock_held());
2048         if (!memcg)
2049                 return;
2050         this_cpu_add(memcg->stat->count[idx], val);
2051 }
2052
2053 /*
2054  * Size of the first charge attempt. "32" comes from vmscan.c's magic value.
2055  * TODO: bigger values may be needed on big-iron machines.
2056  */
2057 #define CHARGE_BATCH    32U
2058 struct memcg_stock_pcp {
2059         struct mem_cgroup *cached; /* this is never the root cgroup */
2060         unsigned int nr_pages;     /* pages charged to @cached, not yet consumed */
2061         struct work_struct work;   /* queued by drain_all_stock() for remote cpus */
2062         unsigned long flags;       /* bit field, see FLUSHING_CACHED_CHARGE */
2063 #define FLUSHING_CACHED_CHARGE  0
2064 };
2065 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2066 static DEFINE_MUTEX(percpu_charge_mutex);      /* one drain_all_stock() at a time */
2067
2068 /**
2069  * consume_stock: Try to consume stocked charge on this cpu.
2070  * @memcg: memcg to consume from.
2071  * @nr_pages: how many pages to charge.
2072  *
2073  * The charge is served only if @memcg is the memcg cached in this cpu's
2074  * stock and that stock holds at least @nr_pages.  Requests larger than
2075  * CHARGE_BATCH are never served from the stock.
2076  *
2077  * returns true if successful, false otherwise.
2078  */
2079 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2080 {
2081         struct memcg_stock_pcp *pcp;
2082         bool consumed;
2083
2084         if (nr_pages > CHARGE_BATCH)
2085                 return false;
2086
2087         pcp = &get_cpu_var(memcg_stock);
2088         consumed = memcg == pcp->cached && pcp->nr_pages >= nr_pages;
2089         if (consumed)
2090                 pcp->nr_pages -= nr_pages;
2091         put_cpu_var(memcg_stock);
2092
2093         return consumed;
2094 }
2095
2096 /*
2097  * Uncharge the pages held in @stock from its cached memcg and reset it.
2098  */
2099 static void drain_stock(struct memcg_stock_pcp *stock)
2100 {
2101         struct mem_cgroup *owner = stock->cached;
2102
2103         if (stock->nr_pages != 0) {
2104                 page_counter_uncharge(&owner->memory, stock->nr_pages);
2105                 if (do_swap_account)
2106                         page_counter_uncharge(&owner->memsw, stock->nr_pages);
2107                 css_put_many(&owner->css, stock->nr_pages);
2108                 stock->nr_pages = 0;
2109         }
2110         stock->cached = NULL;
2111 }
2112
2113 /*
2114  * Drain this cpu's stock.  Must run with preemption disabled or from
2115  * a thread that is pinned to the local cpu.
2116  */
2117 static void drain_local_stock(struct work_struct *dummy)
2118 {
2119         struct memcg_stock_pcp *pcp = this_cpu_ptr(&memcg_stock);
2120         drain_stock(pcp);
2121         clear_bit(FLUSHING_CACHED_CHARGE, &pcp->flags); /* drain request served */
2122 }
2123
2124 /*
2125  * Cache @nr_pages already-charged pages of @memcg in this cpu's stock.
2126  * They will be handed out by consume_stock() later.
2127  */
2128 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2129 {
2130         int this_cpu = get_cpu_light();
2131         struct memcg_stock_pcp *pcp = &per_cpu(memcg_stock, this_cpu);
2132
2133         /* The stock caches another memcg (or none): give that back first. */
2134         if (pcp->cached != memcg) {
2135                 drain_stock(pcp);
2136                 pcp->cached = memcg;
2137         }
2138         pcp->nr_pages += nr_pages;
2139
2140         put_cpu_light();
2141 }
2142
2143 /*
2144  * Drains all per-CPU charge caches for given root_memcg resp. subtree
2145  * of the hierarchy under it.
2146  */
2147 static void drain_all_stock(struct mem_cgroup *root_memcg)
2148 {
2149         int cpu, curcpu;
2150
2151         /* If someone's already draining, avoid queueing more workers. */
2152         if (!mutex_trylock(&percpu_charge_mutex))
2153                 return;
2154         /* Notify other cpus that system-wide "drain" is running */
2155         get_online_cpus();
2156         curcpu = get_cpu_light();
2157         for_each_online_cpu(cpu) {
2158                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2159                 struct mem_cgroup *memcg;
2160
2161                 memcg = stock->cached;
2162                 if (!memcg || !stock->nr_pages)
2163                         continue;       /* nothing cached on this cpu */
2164                 if (!mem_cgroup_is_descendant(memcg, root_memcg))
2165                         continue;       /* cached memcg is outside root_memcg's hierarchy */
2166                 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2167                         if (cpu == curcpu)
2168                                 drain_local_stock(&stock->work);        /* drain inline */
2169                         else
2170                                 schedule_work_on(cpu, &stock->work);    /* drain on that cpu */
2171                 }
2172         }
2173         put_cpu_light();
2174         put_online_cpus();
2175         mutex_unlock(&percpu_charge_mutex);
2176 }
2177
2178 /*
2179  * Fold the per-cpu statistics of a DEAD cpu into memcg->nocpu_base and
2180  * zero them.  Note that this function can be preempted.
2181  */
2182 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2183 {
2184         int idx;
2185
2186         spin_lock(&memcg->pcp_counter_lock);
2187         for (idx = 0; idx < MEM_CGROUP_STAT_NSTATS; idx++) {
2188                 long stale = per_cpu(memcg->stat->count[idx], cpu);
2189
2190                 per_cpu(memcg->stat->count[idx], cpu) = 0;
2191                 memcg->nocpu_base.count[idx] += stale;
2192         }
2193         for (idx = 0; idx < MEM_CGROUP_EVENTS_NSTATS; idx++) {
2194                 unsigned long stale = per_cpu(memcg->stat->events[idx], cpu);
2195
2196                 per_cpu(memcg->stat->events[idx], cpu) = 0;
2197                 memcg->nocpu_base.events[idx] += stale;
2198         }
2199         spin_unlock(&memcg->pcp_counter_lock);
2200 }
2201
2202 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
2203                                         unsigned long action,
2204                                         void *hcpu)
2205 {
2206         const int dead_cpu = (unsigned long)hcpu;
2207         struct mem_cgroup *pos;
2208
2209         /* Only a cpu that went away has counters and stock to collect. */
2210         switch (action) {
2211         case CPU_DEAD:
2212         case CPU_DEAD_FROZEN:
2213                 break;
2214         default:
2215                 return NOTIFY_OK;
2216         }
2217
2218         for_each_mem_cgroup(pos)
2219                 mem_cgroup_drain_pcp_counter(pos, dead_cpu);
2220
2221         drain_stock(&per_cpu(memcg_stock, dead_cpu));
2222         return NOTIFY_OK;
2223 }
2223
2224 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2225                       unsigned int nr_pages)
2226 {
2227         unsigned int batch = max(CHARGE_BATCH, nr_pages);      /* charge ahead for the stock */
2228         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2229         struct mem_cgroup *mem_over_limit;      /* memcg whose counter refused the charge */
2230         struct page_counter *counter;
2231         unsigned long nr_reclaimed;
2232         bool may_swap = true;   /* cleared once the memsw counter is the one over limit */
2233         bool drained = false;   /* per-cpu stocks were already drained once */
2234         int ret = 0;            /* success value; failures return -ENOMEM/-EINTR directly */
2235
2236         if (mem_cgroup_is_root(memcg))
2237                 goto done;      /* the root memcg is not charged here */
2238 retry:
2239         if (consume_stock(memcg, nr_pages))
2240                 goto done;
2241
2242         if (!do_swap_account ||
2243             !page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2244                 if (!page_counter_try_charge(&memcg->memory, batch, &counter))
2245                         goto done_restock;
2246                 if (do_swap_account)
2247                         page_counter_uncharge(&memcg->memsw, batch);    /* undo memsw charge */
2248                 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2249         } else {
2250                 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2251                 may_swap = false;
2252         }
2253
2254         if (batch > nr_pages) {
2255                 batch = nr_pages;       /* batched charge failed: retry the exact amount */
2256                 goto retry;
2257         }
2258
2259         /*
2260          * Unlike in global OOM situations, memcg is not in a physical
2261          * memory shortage.  Allow dying and OOM-killed tasks to
2262          * bypass the last charges so that they can exit quickly and
2263          * free their memory.
2264          */
2265         if (unlikely(test_thread_flag(TIF_MEMDIE) ||
2266                      fatal_signal_pending(current) ||
2267                      current->flags & PF_EXITING))
2268                 goto bypass;
2269
2270         if (unlikely(task_in_memcg_oom(current)))
2271                 goto nomem;
2272
2273         if (!(gfp_mask & __GFP_WAIT))
2274                 goto nomem;     /* reclaim is skipped without __GFP_WAIT */
2275
2276         mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
2277
2278         nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2279                                                     gfp_mask, may_swap);
2280
2281         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2282                 goto retry;
2283
2284         if (!drained) {
2285                 drain_all_stock(mem_over_limit);
2286                 drained = true;
2287                 goto retry;
2288         }
2289
2290         if (gfp_mask & __GFP_NORETRY)
2291                 goto nomem;
2292         /*
2293          * Even though the limit is exceeded at this point, reclaim
2294          * may have been able to free some pages.  Retry the charge
2295          * before killing the task.
2296          *
2297          * Only for regular pages, though: huge pages are rather
2298          * unlikely to succeed so close to the limit, and we fall back
2299          * to regular pages anyway in case of failure.
2300          */
2301         if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2302                 goto retry;
2303         /*
2304          * At task move, charge accounts can be doubly counted. So, it's
2305          * better to wait until the end of task_move if something is going on.
2306          */
2307         if (mem_cgroup_wait_acct_move(mem_over_limit))
2308                 goto retry;
2309
2310         if (nr_retries--)
2311                 goto retry;
2312
2313         if (gfp_mask & __GFP_NOFAIL)
2314                 goto bypass;
2315
2316         if (fatal_signal_pending(current))
2317                 goto bypass;
2318
2319         mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
2320         /* get_order() expects a size in bytes, not a number of pages. */
2321         mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE));
2322 nomem:
2323         if (!(gfp_mask & __GFP_NOFAIL))
2324                 return -ENOMEM;
2325 bypass:
2326         return -EINTR;
2327
2328 done_restock:
2329         css_get_many(&memcg->css, batch);       /* one css reference per charged page */
2330         if (batch > nr_pages)
2331                 refill_stock(memcg, batch - nr_pages);  /* keep the surplus in the stock */
2332         if (!(gfp_mask & __GFP_WAIT))
2333                 goto done;
2334         /*
2335          * If the hierarchy is above the normal consumption range,
2336          * make the charging task trim their excess contribution.
2337          */
2338         do {
2339                 if (page_counter_read(&memcg->memory) <= memcg->high)
2340                         continue;
2341                 mem_cgroup_events(memcg, MEMCG_HIGH, 1);
2342                 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2343         } while ((memcg = parent_mem_cgroup(memcg)));
2344 done:
2345         return ret;
2346 }
2347
2348 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2349 {
2350         /* The root memcg is skipped, mirroring try_charge(). */
2351         if (!mem_cgroup_is_root(memcg)) {
2352                 page_counter_uncharge(&memcg->memory, nr_pages);
2353                 if (do_swap_account)
2354                         page_counter_uncharge(&memcg->memsw, nr_pages);
2355
2356                 css_put_many(&memcg->css, nr_pages);
2357         }
2358 }
2359
2360 /*
2361  * try_get_mem_cgroup_from_page - look up page's memcg association
2362  * @page: the page
2363  *
2364  * Look up, get a css reference, and return the memcg that owns @page.
2365  * Returns NULL if no memcg is found or its css reference cannot be taken.
2366  * The page must be locked to prevent racing with swap-in and page
2367  * cache charges.  If coming from an unlocked page table, the caller
2368  * must ensure the page is on the LRU or this can race with charging.
2369  */
2370 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2371 {
2372         struct mem_cgroup *memcg;
2373         unsigned short id;
2374         swp_entry_t ent;
2375
2376         VM_BUG_ON_PAGE(!PageLocked(page), page);
2377
2378         memcg = page->mem_cgroup;
2379         if (memcg) {
2380                 if (!css_tryget_online(&memcg->css))
2381                         memcg = NULL;
2382         } else if (PageSwapCache(page)) {       /* uncharged swap cache: use the swap record */
2383                 ent.val = page_private(page);
2384                 id = lookup_swap_cgroup_id(ent);
2385                 rcu_read_lock();
2386                 memcg = mem_cgroup_from_id(id);
2387                 if (memcg && !css_tryget_online(&memcg->css))
2388                         memcg = NULL;
2389                 rcu_read_unlock();
2390         }
2391         return memcg;
2392 }
2393
2394 static void lock_page_lru(struct page *page, int *isolated)
2395 {
2396         struct zone *zone = page_zone(page);
2397
2398         spin_lock_irq(&zone->lru_lock); /* still held on return; see unlock_page_lru() */
2399         if (PageLRU(page)) {
2400                 struct lruvec *lruvec;
2401
2402                 lruvec = mem_cgroup_page_lruvec(page, zone);
2403                 ClearPageLRU(page);
2404                 del_page_from_lru_list(page, lruvec, page_lru(page));
2405                 *isolated = 1;  /* page was taken off its LRU list */
2406         } else
2407                 *isolated = 0;  /* page was not on an LRU list */
2408 }
2409
2410 static void unlock_page_lru(struct page *page, int isolated)
2411 {
2412         struct zone *zone = page_zone(page);
2413
2414         if (isolated) { /* lock_page_lru() took the page off its list: put it back */
2415                 struct lruvec *lruvec;
2416
2417                 lruvec = mem_cgroup_page_lruvec(page, zone);
2418                 VM_BUG_ON_PAGE(PageLRU(page), page);
2419                 SetPageLRU(page);
2420                 add_page_to_lru_list(page, lruvec, page_lru(page));
2421         }
2422         spin_unlock_irq(&zone->lru_lock);       /* pairs with lock_page_lru() */
2423 }
2424
2425 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2426                           bool lrucare)
2427 {
2428         int isolated;   /* only written and read when lrucare is true */
2429
2430         VM_BUG_ON_PAGE(page->mem_cgroup, page);
2431
2432         /*
2433          * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2434          * may already be on some other mem_cgroup's LRU.  Take care of it.
2435          */
2436         if (lrucare)
2437                 lock_page_lru(page, &isolated);
2438
2439         /*
2440          * Nobody should be changing or seriously looking at
2441          * page->mem_cgroup at this point:
2442          *
2443          * - the page is uncharged
2444          *
2445          * - the page is off-LRU
2446          *
2447          * - an anonymous fault has exclusive page access, except for
2448          *   a locked page table
2449          *
2450          * - a page cache insertion, a swapin fault, or a migration
2451          *   have the page locked
2452          */
2453         page->mem_cgroup = memcg;
2454
2455         if (lrucare)
2456                 unlock_page_lru(page, isolated);
2457 }
2458
2459 #ifdef CONFIG_MEMCG_KMEM
2460 int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2461                       unsigned long nr_pages)
2462 {
2463         struct page_counter *counter;
2464         int ret = 0;
2465
2466         ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
2467         if (ret < 0)
2468                 return ret;     /* kmem counter refused: nothing charged yet */
2469
2470         ret = try_charge(memcg, gfp, nr_pages);
2471         if (ret == -EINTR)  {
2472                 /*
2473                  * try_charge() chose to bypass to root due to OOM kill or
2474                  * fatal signal.  Since our only options are to either fail
2475                  * the allocation or charge it to this cgroup, do it as a
2476                  * temporary condition. But we can't fail. From a kmem/slab
2477                  * perspective, the cache has already been selected, by
2478                  * mem_cgroup_kmem_get_cache(), so it is too late to change
2479                  * our minds.
2480                  *
2481                  * This condition will only trigger if the task entered
2482                  * memcg_charge_kmem in a sane state, but was OOM-killed
2483                  * during try_charge() above. Tasks that were already dying
2484                  * when the allocation triggers should have been already
2485                  * directed to the root cgroup in memcontrol.h
2486                  */
2487                 page_counter_charge(&memcg->memory, nr_pages);
2488                 if (do_swap_account)
2489                         page_counter_charge(&memcg->memsw, nr_pages);
2490                 css_get_many(&memcg->css, nr_pages);
2491                 ret = 0;
2492         } else if (ret)
2493                 page_counter_uncharge(&memcg->kmem, nr_pages);  /* roll back the kmem charge */
2494
2495         return ret;
2496 }
2497
2498 void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
2499 {
2500         page_counter_uncharge(&memcg->memory, nr_pages);
2501         if (do_swap_account)
2502                 page_counter_uncharge(&memcg->memsw, nr_pages);
2503
2504         page_counter_uncharge(&memcg->kmem, nr_pages);
2505
2506         css_put_many(&memcg->css, nr_pages);    /* one css reference per uncharged page */
2507 }
2508
2509 /*
2510  * Helper for accessing a memcg's index (kmemcg_id). It is used as an index
2511  * in the child cache array in kmem_cache, and also to derive its name.
2512  * Returns -1 when @memcg is NULL, otherwise whatever kmemcg_id holds.
2513  */
2514 int memcg_cache_id(struct mem_cgroup *memcg)
2515 {
2516         return !memcg ? -1 : memcg->kmemcg_id;
2517 }
2518
2519 static int memcg_alloc_cache_id(void)
2520 {
2521         int id, size;
2522         int err;
2523
2524         id = ida_simple_get(&memcg_cache_ida,
2525                             0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2526         if (id < 0)
2527                 return id;      /* negative error from the ida */
2528
2529         if (id < memcg_nr_cache_ids)
2530                 return id;      /* the arrays already have room for this id */
2531
2532         /*
2533          * There's no space for the new id in memcg_caches arrays,
2534          * so we have to grow them.
2535          */
2536         down_write(&memcg_cache_ids_sem);
2537
2538         size = 2 * (id + 1);    /* grow to twice the needed count, clamped below */
2539         if (size < MEMCG_CACHES_MIN_SIZE)
2540                 size = MEMCG_CACHES_MIN_SIZE;
2541         else if (size > MEMCG_CACHES_MAX_SIZE)
2542                 size = MEMCG_CACHES_MAX_SIZE;
2543
2544         err = memcg_update_all_caches(size);
2545         if (!err)
2546                 err = memcg_update_all_list_lrus(size);
2547         if (!err)
2548                 memcg_nr_cache_ids = size;      /* published only if both updates succeeded */
2549
2550         up_write(&memcg_cache_ids_sem);
2551
2552         if (err) {
2553                 ida_simple_remove(&memcg_cache_ida, id);        /* give the id back */
2554                 return err;
2555         }
2556         return id;
2557 }
2558
2559 static void memcg_free_cache_id(int id)
2560 {
2561         ida_simple_remove(&memcg_cache_ida, id);
2562 }
2563
2564 struct memcg_kmem_cache_create_work {
2565         struct mem_cgroup *memcg;
2566         struct kmem_cache *cachep;
2567         struct work_struct work;
2568 };
2569
2570 static void memcg_kmem_cache_create_func(struct work_struct *w)
2571 {
2572         struct memcg_kmem_cache_create_work *cw =
2573                 container_of(w, struct memcg_kmem_cache_create_work, work);
2574         struct mem_cgroup *memcg = cw->memcg;
2575         struct kmem_cache *cachep = cw->cachep;
2576
2577         memcg_create_kmem_cache(memcg, cachep);
2578
2579         css_put(&memcg->css);
2580         kfree(cw);
2581 }
2582
2583 /*
2584  * Enqueue the creation of a per-memcg kmem_cache.
2585  */
2586 static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2587                                                struct kmem_cache *cachep)
2588 {
2589         struct memcg_kmem_cache_create_work *cw;
2590
2591         cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
2592         if (!cw)
2593                 return;
2594
2595         css_get(&memcg->css);
2596
2597         cw->memcg = memcg;
2598         cw->cachep = cachep;
2599         INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2600
2601         schedule_work(&cw->work);
2602 }
2603
2604 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2605                                              struct kmem_cache *cachep)
2606 {
2607         /*
2608          * We need to stop accounting when we kmalloc, because if the
2609          * corresponding kmalloc cache is not yet created, the first allocation
2610          * in __memcg_schedule_kmem_cache_create will recurse.
2611          *
2612          * However, it is better to enclose the whole function. Depending on
2613          * the debugging options enabled, INIT_WORK(), for instance, can
2614          * trigger an allocation. This too, will make us recurse. Because at
2615          * this point we can't allow ourselves back into memcg_kmem_get_cache,
2616          * the safest choice is to do it like this, wrapping the whole function.
2617          */
2618         current->memcg_kmem_skip_account = 1;
2619         __memcg_schedule_kmem_cache_create(memcg, cachep);
2620         current->memcg_kmem_skip_account = 0;
2621 }
2622
2623 /*
2624  * Return the kmem_cache we're supposed to use for a slab allocation.
2625  * We try to use the current memcg's version of the cache.
2626  *
2627  * If the cache does not exist yet, if we are the first user of it,
2628  * we either create it immediately, if possible, or create it asynchronously
2629  * in a workqueue.
2630  * In the latter case, we will let the current allocation go through with
2631  * the original cache.
2632  *
2633  * Can't be called in interrupt context or from kernel threads.
2634  * This function needs to be called with rcu_read_lock() held.
2635  */
2636 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2637 {
2638         struct mem_cgroup *memcg;
2639         struct kmem_cache *memcg_cachep;
2640         int kmemcg_id;
2641
2642         VM_BUG_ON(!is_root_cache(cachep));
2643
2644         if (current->memcg_kmem_skip_account)
2645                 return cachep;
2646
2647         memcg = get_mem_cgroup_from_mm(current->mm);
2648         kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2649         if (kmemcg_id < 0)
2650                 goto out;
2651
2652         memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2653         if (likely(memcg_cachep))
2654                 return memcg_cachep;
2655
2656         /*
2657          * If we are in a safe context (can wait, and not in interrupt
2658          * context), we could be be predictable and return right away.
2659          * This would guarantee that the allocation being performed
2660          * already belongs in the new cache.
2661          *
2662          * However, there are some clashes that can arrive from locking.
2663          * For instance, because we acquire the slab_mutex while doing
2664          * memcg_create_kmem_cache, this means no further allocation
2665          * could happen with the slab_mutex held. So it's better to
2666          * defer everything.
2667          */
2668         memcg_schedule_kmem_cache_create(memcg, cachep);
2669 out:
2670         css_put(&memcg->css);
2671         return cachep;
2672 }
2673
2674 void __memcg_kmem_put_cache(struct kmem_cache *cachep)
2675 {
2676         if (!is_root_cache(cachep))
2677                 css_put(&cachep->memcg_params.memcg->css);
2678 }
2679
2680 /*
2681  * We need to verify if the allocation against current->mm->owner's memcg is
2682  * possible for the given order. But the page is not allocated yet, so we'll
2683  * need a further commit step to do the final arrangements.
2684  *
2685  * It is possible for the task to switch cgroups in this mean time, so at
2686  * commit time, we can't rely on task conversion any longer.  We'll then use
2687  * the handle argument to return to the caller which cgroup we should commit
2688  * against. We could also return the memcg directly and avoid the pointer
2689  * passing, but a boolean return value gives better semantics considering
2690  * the compiled-out case as well.
2691  *
2692  * Returning true means the allocation is possible.
2693  */
2694 bool
2695 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
2696 {
2697         struct mem_cgroup *memcg;
2698         int ret;
2699
2700         *_memcg = NULL;
2701
2702         memcg = get_mem_cgroup_from_mm(current->mm);
2703
2704         if (!memcg_kmem_is_active(memcg)) {
2705                 css_put(&memcg->css);
2706                 return true;
2707         }
2708
2709         ret = memcg_charge_kmem(memcg, gfp, 1 << order);
2710         if (!ret)
2711                 *_memcg = memcg;
2712
2713         css_put(&memcg->css);
2714         return (ret == 0);
2715 }
2716
2717 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
2718                               int order)
2719 {
2720         VM_BUG_ON(mem_cgroup_is_root(memcg));
2721
2722         /* The page allocation failed. Revert */
2723         if (!page) {
2724                 memcg_uncharge_kmem(memcg, 1 << order);
2725                 return;
2726         }
2727         page->mem_cgroup = memcg;
2728 }
2729
2730 void __memcg_kmem_uncharge_pages(struct page *page, int order)
2731 {
2732         struct mem_cgroup *memcg = page->mem_cgroup;
2733
2734         if (!memcg)
2735                 return;
2736
2737         VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2738
2739         memcg_uncharge_kmem(memcg, 1 << order);
2740         page->mem_cgroup = NULL;
2741 }
2742
2743 struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
2744 {
2745         struct mem_cgroup *memcg = NULL;
2746         struct kmem_cache *cachep;
2747         struct page *page;
2748
2749         page = virt_to_head_page(ptr);
2750         if (PageSlab(page)) {
2751                 cachep = page->slab_cache;
2752                 if (!is_root_cache(cachep))
2753                         memcg = cachep->memcg_params.memcg;
2754         } else
2755                 /* page allocated by alloc_kmem_pages */
2756                 memcg = page->mem_cgroup;
2757
2758         return memcg;
2759 }
2760 #endif /* CONFIG_MEMCG_KMEM */
2761
2762 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2763
2764 /*
2765  * Because tail pages are not marked as "used", set it. We're under
2766  * zone->lru_lock, 'splitting on pmd' and compound_lock.
2767  * charge/uncharge will be never happen and move_account() is done under
2768  * compound_lock(), so we don't have to take care of races.
2769  */
2770 void mem_cgroup_split_huge_fixup(struct page *head)
2771 {
2772         int i;
2773
2774         if (mem_cgroup_disabled())
2775                 return;
2776
2777         for (i = 1; i < HPAGE_PMD_NR; i++)
2778                 head[i].mem_cgroup = head->mem_cgroup;
2779
2780         __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
2781                        HPAGE_PMD_NR);
2782 }
2783 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2784
2785 #ifdef CONFIG_MEMCG_SWAP
2786 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
2787                                          bool charge)
2788 {
2789         int val = (charge) ? 1 : -1;
2790         this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
2791 }
2792
2793 /**
2794  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2795  * @entry: swap entry to be moved
2796  * @from:  mem_cgroup which the entry is moved from
2797  * @to:  mem_cgroup which the entry is moved to
2798  *
2799  * It succeeds only when the swap_cgroup's record for this entry is the same
2800  * as the mem_cgroup's id of @from.
2801  *
2802  * Returns 0 on success, -EINVAL on failure.
2803  *
2804  * The caller must have charged to @to, IOW, called page_counter_charge() about
2805  * both res and memsw, and called css_get().
2806  */
2807 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2808                                 struct mem_cgroup *from, struct mem_cgroup *to)
2809 {
2810         unsigned short old_id, new_id;
2811
2812         old_id = mem_cgroup_id(from);
2813         new_id = mem_cgroup_id(to);
2814
2815         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2816                 mem_cgroup_swap_statistics(from, false);
2817                 mem_cgroup_swap_statistics(to, true);
2818                 return 0;
2819         }
2820         return -EINVAL;
2821 }
2822 #else
2823 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2824                                 struct mem_cgroup *from, struct mem_cgroup *to)
2825 {
2826         return -EINVAL;
2827 }
2828 #endif
2829
2830 static DEFINE_MUTEX(memcg_limit_mutex);
2831
2832 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2833                                    unsigned long limit)
2834 {
2835         unsigned long curusage;
2836         unsigned long oldusage;
2837         bool enlarge = false;
2838         int retry_count;
2839         int ret;
2840
2841         /*
2842          * For keeping hierarchical_reclaim simple, how long we should retry
2843          * is depends on callers. We set our retry-count to be function
2844          * of # of children which we should visit in this loop.
2845          */
2846         retry_count = MEM_CGROUP_RECLAIM_RETRIES *
2847                       mem_cgroup_count_children(memcg);
2848
2849         oldusage = page_counter_read(&memcg->memory);
2850
2851         do {
2852                 if (signal_pending(current)) {
2853                         ret = -EINTR;
2854                         break;
2855                 }
2856
2857                 mutex_lock(&memcg_limit_mutex);
2858                 if (limit > memcg->memsw.limit) {
2859                         mutex_unlock(&memcg_limit_mutex);
2860                         ret = -EINVAL;
2861                         break;
2862                 }
2863                 if (limit > memcg->memory.limit)
2864                         enlarge = true;
2865                 ret = page_counter_limit(&memcg->memory, limit);
2866                 mutex_unlock(&memcg_limit_mutex);
2867
2868                 if (!ret)
2869                         break;
2870
2871                 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
2872
2873                 curusage = page_counter_read(&memcg->memory);
2874                 /* Usage is reduced ? */
2875                 if (curusage >= oldusage)
2876                         retry_count--;
2877                 else
2878                         oldusage = curusage;
2879         } while (retry_count);
2880
2881         if (!ret && enlarge)
2882                 memcg_oom_recover(memcg);
2883
2884         return ret;
2885 }
2886
2887 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2888                                          unsigned long limit)
2889 {
2890         unsigned long curusage;
2891         unsigned long oldusage;
2892         bool enlarge = false;
2893         int retry_count;
2894         int ret;
2895
2896         /* see mem_cgroup_resize_res_limit */
2897         retry_count = MEM_CGROUP_RECLAIM_RETRIES *
2898                       mem_cgroup_count_children(memcg);
2899
2900         oldusage = page_counter_read(&memcg->memsw);
2901
2902         do {
2903                 if (signal_pending(current)) {
2904                         ret = -EINTR;
2905                         break;
2906                 }
2907
2908                 mutex_lock(&memcg_limit_mutex);
2909                 if (limit < memcg->memory.limit) {
2910                         mutex_unlock(&memcg_limit_mutex);
2911                         ret = -EINVAL;
2912                         break;
2913                 }
2914                 if (limit > memcg->memsw.limit)
2915                         enlarge = true;
2916                 ret = page_counter_limit(&memcg->memsw, limit);
2917                 mutex_unlock(&memcg_limit_mutex);
2918
2919                 if (!ret)
2920                         break;
2921
2922                 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
2923
2924                 curusage = page_counter_read(&memcg->memsw);
2925                 /* Usage is reduced ? */
2926                 if (curusage >= oldusage)
2927                         retry_count--;
2928                 else
2929                         oldusage = curusage;
2930         } while (retry_count);
2931
2932         if (!ret && enlarge)
2933                 memcg_oom_recover(memcg);
2934
2935         return ret;
2936 }
2937
2938 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2939                                             gfp_t gfp_mask,
2940                                             unsigned long *total_scanned)
2941 {
2942         unsigned long nr_reclaimed = 0;
2943         struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2944         unsigned long reclaimed;
2945         int loop = 0;
2946         struct mem_cgroup_tree_per_zone *mctz;
2947         unsigned long excess;
2948         unsigned long nr_scanned;
2949
2950         if (order > 0)
2951                 return 0;
2952
2953         mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
2954         /*
2955          * This loop can run a while, specially if mem_cgroup's continuously
2956          * keep exceeding their soft limit and putting the system under
2957          * pressure
2958          */
2959         do {
2960                 if (next_mz)
2961                         mz = next_mz;
2962                 else
2963                         mz = mem_cgroup_largest_soft_limit_node(mctz);
2964                 if (!mz)
2965                         break;
2966
2967                 nr_scanned = 0;
2968                 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone,
2969                                                     gfp_mask, &nr_scanned);
2970                 nr_reclaimed += reclaimed;
2971                 *total_scanned += nr_scanned;
2972                 spin_lock_irq(&mctz->lock);
2973                 __mem_cgroup_remove_exceeded(mz, mctz);
2974
2975                 /*
2976                  * If we failed to reclaim anything from this memory cgroup
2977                  * it is time to move on to the next cgroup
2978                  */
2979                 next_mz = NULL;
2980                 if (!reclaimed)
2981                         next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
2982
2983                 excess = soft_limit_excess(mz->memcg);
2984                 /*
2985                  * One school of thought says that we should not add
2986                  * back the node to the tree if reclaim returns 0.
2987                  * But our reclaim could return 0, simply because due
2988                  * to priority we are exposing a smaller subset of
2989                  * memory to reclaim from. Consider this as a longer
2990                  * term TODO.
2991                  */
2992                 /* If excess == 0, no tree ops */
2993                 __mem_cgroup_insert_exceeded(mz, mctz, excess);
2994                 spin_unlock_irq(&mctz->lock);
2995                 css_put(&mz->memcg->css);
2996                 loop++;
2997                 /*
2998                  * Could not reclaim anything and there are no more
2999                  * mem cgroups to try or we seem to be looping without
3000                  * reclaiming anything.
3001                  */
3002                 if (!nr_reclaimed &&
3003                         (next_mz == NULL ||
3004                         loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
3005                         break;
3006         } while (!nr_reclaimed);
3007         if (next_mz)
3008                 css_put(&next_mz->memcg->css);
3009         return nr_reclaimed;
3010 }
3011
3012 /*
3013  * Test whether @memcg has children, dead or alive.  Note that this
3014  * function doesn't care whether @memcg has use_hierarchy enabled and
3015  * returns %true if there are child csses according to the cgroup
3016  * hierarchy.  Testing use_hierarchy is the caller's responsiblity.
3017  */
3018 static inline bool memcg_has_children(struct mem_cgroup *memcg)
3019 {
3020         bool ret;
3021
3022         /*
3023          * The lock does not prevent addition or deletion of children, but
3024          * it prevents a new child from being initialized based on this
3025          * parent in css_online(), so it's enough to decide whether
3026          * hierarchically inherited attributes can still be changed or not.
3027          */
3028         lockdep_assert_held(&memcg_create_mutex);
3029
3030         rcu_read_lock();
3031         ret = css_next_child(NULL, &memcg->css);
3032         rcu_read_unlock();
3033         return ret;
3034 }
3035
3036 /*
3037  * Reclaims as many pages from the given memcg as possible and moves
3038  * the rest to the parent.
3039  *
3040  * Caller is responsible for holding css reference for memcg.
3041  */
3042 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3043 {
3044         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3045
3046         /* we call try-to-free pages for make this cgroup empty */
3047         lru_add_drain_all();
3048         /* try to free all pages in this cgroup */
3049         while (nr_retries && page_counter_read(&memcg->memory)) {
3050                 int progress;
3051
3052                 if (signal_pending(current))
3053                         return -EINTR;
3054
3055                 progress = try_to_free_mem_cgroup_pages(memcg, 1,
3056                                                         GFP_KERNEL, true);
3057                 if (!progress) {
3058                         nr_retries--;
3059                         /* maybe some writeback is necessary */
3060                         congestion_wait(BLK_RW_ASYNC, HZ/10);
3061                 }
3062
3063         }
3064
3065         return 0;
3066 }
3067
3068 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3069                                             char *buf, size_t nbytes,
3070                                             loff_t off)
3071 {
3072         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3073
3074         if (mem_cgroup_is_root(memcg))
3075                 return -EINVAL;
3076         return mem_cgroup_force_empty(memcg) ?: nbytes;
3077 }
3078
3079 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
3080                                      struct cftype *cft)
3081 {
3082         return mem_cgroup_from_css(css)->use_hierarchy;
3083 }
3084
3085 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3086                                       struct cftype *cft, u64 val)
3087 {
3088         int retval = 0;
3089         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3090         struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
3091
3092         mutex_lock(&memcg_create_mutex);
3093
3094         if (memcg->use_hierarchy == val)
3095                 goto out;
3096
3097         /*
3098          * If parent's use_hierarchy is set, we can't make any modifications
3099          * in the child subtrees. If it is unset, then the change can
3100          * occur, provided the current cgroup has no children.
3101          *
3102          * For the root cgroup, parent_mem is NULL, we allow value to be
3103          * set if there are no children.
3104          */
3105         if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3106                                 (val == 1 || val == 0)) {
3107                 if (!memcg_has_children(memcg))
3108                         memcg->use_hierarchy = val;
3109                 else
3110                         retval = -EBUSY;
3111         } else
3112                 retval = -EINVAL;
3113
3114 out:
3115         mutex_unlock(&memcg_create_mutex);
3116
3117         return retval;
3118 }
3119
3120 static unsigned long tree_stat(struct mem_cgroup *memcg,
3121                                enum mem_cgroup_stat_index idx)
3122 {
3123         struct mem_cgroup *iter;
3124         long val = 0;
3125
3126         /* Per-cpu values can be negative, use a signed accumulator */
3127         for_each_mem_cgroup_tree(iter, memcg)
3128                 val += mem_cgroup_read_stat(iter, idx);
3129
3130         if (val < 0) /* race ? */
3131                 val = 0;
3132         return val;
3133 }
3134
3135 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3136 {
3137         u64 val;
3138
3139         if (mem_cgroup_is_root(memcg)) {
3140                 val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
3141                 val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
3142                 if (swap)
3143                         val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
3144         } else {
3145                 if (!swap)
3146                         val = page_counter_read(&memcg->memory);
3147                 else
3148                         val = page_counter_read(&memcg->memsw);
3149         }
3150         return val << PAGE_SHIFT;
3151 }
3152
/* Attribute selectors for the resource files, decoded with MEMFILE_ATTR(). */
enum {
        RES_USAGE = 0,          /* current usage */
        RES_LIMIT = 1,          /* configured limit */
        RES_MAX_USAGE = 2,      /* usage watermark */
        RES_FAILCNT = 3,        /* failure count */
        RES_SOFT_LIMIT = 4,     /* soft limit */
};
3160
3161 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3162                                struct cftype *cft)
3163 {
3164         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3165         struct page_counter *counter;
3166
3167         switch (MEMFILE_TYPE(cft->private)) {
3168         case _MEM:
3169                 counter = &memcg->memory;
3170                 break;
3171         case _MEMSWAP:
3172                 counter = &memcg->memsw;
3173                 break;
3174         case _KMEM:
3175                 counter = &memcg->kmem;
3176                 break;
3177         default:
3178                 BUG();
3179         }
3180
3181         switch (MEMFILE_ATTR(cft->private)) {
3182         case RES_USAGE:
3183                 if (counter == &memcg->memory)
3184                         return mem_cgroup_usage(memcg, false);
3185                 if (counter == &memcg->memsw)
3186                         return mem_cgroup_usage(memcg, true);
3187                 return (u64)page_counter_read(counter) * PAGE_SIZE;
3188         case RES_LIMIT:
3189                 return (u64)counter->limit * PAGE_SIZE;
3190         case RES_MAX_USAGE:
3191                 return (u64)counter->watermark * PAGE_SIZE;
3192         case RES_FAILCNT:
3193                 return counter->failcnt;
3194         case RES_SOFT_LIMIT:
3195                 return (u64)memcg->soft_limit * PAGE_SIZE;
3196         default:
3197                 BUG();
3198         }
3199 }
3200
3201 #ifdef CONFIG_MEMCG_KMEM
3202 static int memcg_activate_kmem(struct mem_cgroup *memcg,
3203                                unsigned long nr_pages)
3204 {
3205         int err = 0;
3206         int memcg_id;
3207
3208         BUG_ON(memcg->kmemcg_id >= 0);
3209         BUG_ON(memcg->kmem_acct_activated);
3210         BUG_ON(memcg->kmem_acct_active);
3211
3212         /*
3213          * For simplicity, we won't allow this to be disabled.  It also can't
3214          * be changed if the cgroup has children already, or if tasks had
3215          * already joined.
3216          *
3217          * If tasks join before we set the limit, a person looking at
3218          * kmem.usage_in_bytes will have no way to determine when it took
3219          * place, which makes the value quite meaningless.
3220          *
3221          * After it first became limited, changes in the value of the limit are
3222          * of course permitted.
3223          */
3224         mutex_lock(&memcg_create_mutex);
3225         if (cgroup_has_tasks(memcg->css.cgroup) ||
3226             (memcg->use_hierarchy && memcg_has_children(memcg)))
3227                 err = -EBUSY;
3228         mutex_unlock(&memcg_create_mutex);
3229         if (err)
3230                 goto out;
3231
3232         memcg_id = memcg_alloc_cache_id();
3233         if (memcg_id < 0) {
3234                 err = memcg_id;
3235                 goto out;
3236         }
3237
3238         /*
3239          * We couldn't have accounted to this cgroup, because it hasn't got
3240          * activated yet, so this should succeed.
3241          */
3242         err = page_counter_limit(&memcg->kmem, nr_pages);
3243         VM_BUG_ON(err);
3244
3245         static_key_slow_inc(&memcg_kmem_enabled_key);
3246         /*
3247          * A memory cgroup is considered kmem-active as soon as it gets
3248          * kmemcg_id. Setting the id after enabling static branching will
3249          * guarantee no one starts accounting before all call sites are
3250          * patched.
3251          */
3252         memcg->kmemcg_id = memcg_id;
3253         memcg->kmem_acct_activated = true;
3254         memcg->kmem_acct_active = true;
3255 out:
3256         return err;
3257 }
3258
3259 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
3260                                    unsigned long limit)
3261 {
3262         int ret;
3263
3264         mutex_lock(&memcg_limit_mutex);
3265         if (!memcg_kmem_is_active(memcg))
3266                 ret = memcg_activate_kmem(memcg, limit);
3267         else
3268                 ret = page_counter_limit(&memcg->kmem, limit);
3269         mutex_unlock(&memcg_limit_mutex);
3270         return ret;
3271 }
3272
3273 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
3274 {
3275         int ret = 0;
3276         struct mem_cgroup *parent = parent_mem_cgroup(memcg);
3277
3278         if (!parent)
3279                 return 0;
3280
3281         mutex_lock(&memcg_limit_mutex);
3282         /*
3283          * If the parent cgroup is not kmem-active now, it cannot be activated
3284          * after this point, because it has at least one child already.
3285          */
3286         if (memcg_kmem_is_active(parent))
3287                 ret = memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
3288         mutex_unlock(&memcg_limit_mutex);
3289         return ret;
3290 }
3291 #else
/*
 * Stub used when CONFIG_MEMCG_KMEM is not set: kmem limits are not
 * supported, so every request is rejected with -EINVAL.
 */
static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
                                   unsigned long limit)
{
        /* Neither argument is looked at. */
        return -EINVAL;
}
3297 #endif /* CONFIG_MEMCG_KMEM */
3298
3299 /*
3300  * The user of this function is...
3301  * RES_LIMIT.
3302  */
3303 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3304                                 char *buf, size_t nbytes, loff_t off)
3305 {
3306         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3307         unsigned long nr_pages;
3308         int ret;
3309
3310         buf = strstrip(buf);
3311         ret = page_counter_memparse(buf, "-1", &nr_pages);
3312         if (ret)
3313                 return ret;
3314
3315         switch (MEMFILE_ATTR(of_cft(of)->private)) {
3316         case RES_LIMIT:
3317                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3318                         ret = -EINVAL;
3319                         break;
3320                 }
3321                 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3322                 case _MEM:
3323                         ret = mem_cgroup_resize_limit(memcg, nr_pages);
3324                         break;
3325                 case _MEMSWAP:
3326                         ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
3327                         break;
3328                 case _KMEM:
3329                         ret = memcg_update_kmem_limit(memcg, nr_pages);
3330                         break;
3331                 }
3332                 break;
3333         case RES_SOFT_LIMIT:
3334                 memcg->soft_limit = nr_pages;
3335                 ret = 0;
3336                 break;
3337         }
3338         return ret ?: nbytes;
3339 }
3340
3341 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3342                                 size_t nbytes, loff_t off)
3343 {
3344         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3345         struct page_counter *counter;
3346
3347         switch (MEMFILE_TYPE(of_cft(of)->private)) {
3348         case _MEM:
3349                 counter = &memcg->memory;
3350                 break;
3351         case _MEMSWAP:
3352                 counter = &memcg->memsw;
3353                 break;
3354         case _KMEM:
3355                 counter = &memcg->kmem;
3356                 break;
3357         default:
3358                 BUG();
3359         }
3360
3361         switch (MEMFILE_ATTR(of_cft(of)->private)) {
3362         case RES_MAX_USAGE:
3363                 page_counter_reset_watermark(counter);
3364                 break;
3365         case RES_FAILCNT:
3366                 counter->failcnt = 0;
3367                 break;
3368         default:
3369                 BUG();
3370         }
3371
3372         return nbytes;
3373 }
3374
3375 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3376                                         struct cftype *cft)
3377 {
3378         return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3379 }
3380
3381 #ifdef CONFIG_MMU
3382 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3383                                         struct cftype *cft, u64 val)
3384 {
3385         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3386
3387         if (val & ~MOVE_MASK)
3388                 return -EINVAL;
3389
3390         /*
3391          * No kind of locking is needed in here, because ->can_attach() will
3392          * check this value once in the beginning of the process, and then carry
3393          * on with stale data. This means that changes to this value will only
3394          * affect task migrations starting after the change.
3395          */
3396         memcg->move_charge_at_immigrate = val;
3397         return 0;
3398 }
3399 #else
3400 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3401                                         struct cftype *cft, u64 val)
3402 {
3403         return -ENOSYS;
3404 }
3405 #endif
3406
3407 #ifdef CONFIG_NUMA
3408 static int memcg_numa_stat_show(struct seq_file *m, void *v)
3409 {
3410         struct numa_stat {
3411                 const char *name;
3412                 unsigned int lru_mask;
3413         };
3414
3415         static const struct numa_stat stats[] = {
3416                 { "total", LRU_ALL },
3417                 { "file", LRU_ALL_FILE },
3418                 { "anon", LRU_ALL_ANON },
3419                 { "unevictable", BIT(LRU_UNEVICTABLE) },
3420         };
3421         const struct numa_stat *stat;
3422         int nid;
3423         unsigned long nr;
3424         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3425
3426         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3427                 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3428                 seq_printf(m, "%s=%lu", stat->name, nr);
3429                 for_each_node_state(nid, N_MEMORY) {
3430                         nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3431                                                           stat->lru_mask);
3432                         seq_printf(m, " N%d=%lu", nid, nr);
3433                 }
3434                 seq_putc(m, '\n');
3435         }
3436
3437         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3438                 struct mem_cgroup *iter;
3439
3440                 nr = 0;
3441                 for_each_mem_cgroup_tree(iter, memcg)
3442                         nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3443                 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3444                 for_each_node_state(nid, N_MEMORY) {
3445                         nr = 0;
3446                         for_each_mem_cgroup_tree(iter, memcg)
3447                                 nr += mem_cgroup_node_nr_lru_pages(
3448                                         iter, nid, stat->lru_mask);
3449                         seq_printf(m, " N%d=%lu", nid, nr);
3450                 }
3451                 seq_putc(m, '\n');
3452         }
3453
3454         return 0;
3455 }
3456 #endif /* CONFIG_NUMA */
3457
3458 static int memcg_stat_show(struct seq_file *m, void *v)
3459 {
3460         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3461         unsigned long memory, memsw;
3462         struct mem_cgroup *mi;
3463         unsigned int i;
3464
3465         BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
3466                      MEM_CGROUP_STAT_NSTATS);
3467         BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
3468                      MEM_CGROUP_EVENTS_NSTATS);
3469         BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3470
3471         for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3472                 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
3473                         continue;
3474                 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
3475                            mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
3476         }
3477
3478         for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
3479                 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
3480                            mem_cgroup_read_events(memcg, i));
3481
3482         for (i = 0; i < NR_LRU_LISTS; i++)
3483                 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3484                            mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
3485
3486         /* Hierarchical information */
3487         memory = memsw = PAGE_COUNTER_MAX;
3488         for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3489                 memory = min(memory, mi->memory.limit);
3490                 memsw = min(memsw, mi->memsw.limit);
3491         }
3492         seq_printf(m, "hierarchical_memory_limit %llu\n",
3493                    (u64)memory * PAGE_SIZE);
3494         if (do_swap_account)
3495                 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3496                            (u64)memsw * PAGE_SIZE);
3497
3498         for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
3499                 long long val = 0;
3500
3501                 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
3502                         continue;
3503                 for_each_mem_cgroup_tree(mi, memcg)
3504                         val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
3505                 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
3506         }
3507
3508         for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
3509                 unsigned long long val = 0;
3510
3511                 for_each_mem_cgroup_tree(mi, memcg)
3512                         val += mem_cgroup_read_events(mi, i);
3513                 seq_printf(m, "total_%s %llu\n",
3514                            mem_cgroup_events_names[i], val);
3515         }
3516
3517         for (i = 0; i < NR_LRU_LISTS; i++) {
3518                 unsigned long long val = 0;
3519
3520                 for_each_mem_cgroup_tree(mi, memcg)
3521                         val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
3522                 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
3523         }
3524
3525 #ifdef CONFIG_DEBUG_VM
3526         {
3527                 int nid, zid;
3528                 struct mem_cgroup_per_zone *mz;
3529                 struct zone_reclaim_stat *rstat;
3530                 unsigned long recent_rotated[2] = {0, 0};
3531                 unsigned long recent_scanned[2] = {0, 0};
3532
3533                 for_each_online_node(nid)
3534                         for (zid = 0; zid < MAX_NR_ZONES; zid++) {
3535                                 mz = &memcg->nodeinfo[nid]->zoneinfo[zid];
3536                                 rstat = &mz->lruvec.reclaim_stat;
3537
3538                                 recent_rotated[0] += rstat->recent_rotated[0];
3539                                 recent_rotated[1] += rstat->recent_rotated[1];
3540                                 recent_scanned[0] += rstat->recent_scanned[0];
3541                                 recent_scanned[1] += rstat->recent_scanned[1];
3542                         }
3543                 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3544                 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3545                 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3546                 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3547         }
3548 #endif
3549
3550         return 0;
3551 }
3552
3553 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3554                                       struct cftype *cft)
3555 {
3556         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3557
3558         return mem_cgroup_swappiness(memcg);
3559 }
3560
3561 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3562                                        struct cftype *cft, u64 val)
3563 {
3564         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3565
3566         if (val > 100)
3567                 return -EINVAL;
3568
3569         if (css->parent)
3570                 memcg->swappiness = val;
3571         else
3572                 vm_swappiness = val;
3573
3574         return 0;
3575 }
3576
3577 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3578 {
3579         struct mem_cgroup_threshold_ary *t;
3580         unsigned long usage;
3581         int i;
3582
3583         rcu_read_lock();
3584         if (!swap)
3585                 t = rcu_dereference(memcg->thresholds.primary);
3586         else
3587                 t = rcu_dereference(memcg->memsw_thresholds.primary);
3588
3589         if (!t)
3590                 goto unlock;
3591
3592         usage = mem_cgroup_usage(memcg, swap);
3593
3594         /*
3595          * current_threshold points to threshold just below or equal to usage.
3596          * If it's not true, a threshold was crossed after last
3597          * call of __mem_cgroup_threshold().
3598          */
3599         i = t->current_threshold;
3600
3601         /*
3602          * Iterate backward over array of thresholds starting from
3603          * current_threshold and check if a threshold is crossed.
3604          * If none of thresholds below usage is crossed, we read
3605          * only one element of the array here.
3606          */
3607         for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3608                 eventfd_signal(t->entries[i].eventfd, 1);
3609
3610         /* i = current_threshold + 1 */
3611         i++;
3612
3613         /*
3614          * Iterate forward over array of thresholds starting from
3615          * current_threshold+1 and check if a threshold is crossed.
3616          * If none of thresholds above usage is crossed, we read
3617          * only one element of the array here.
3618          */
3619         for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3620                 eventfd_signal(t->entries[i].eventfd, 1);
3621
3622         /* Update current_threshold */
3623         t->current_threshold = i - 1;
3624 unlock:
3625         rcu_read_unlock();
3626 }
3627
3628 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3629 {
3630         while (memcg) {
3631                 __mem_cgroup_threshold(memcg, false);
3632                 if (do_swap_account)
3633                         __mem_cgroup_threshold(memcg, true);
3634
3635                 memcg = parent_mem_cgroup(memcg);
3636         }
3637 }
3638
3639 static int compare_thresholds(const void *a, const void *b)
3640 {
3641         const struct mem_cgroup_threshold *_a = a;
3642         const struct mem_cgroup_threshold *_b = b;
3643
3644         if (_a->threshold > _b->threshold)
3645                 return 1;
3646
3647         if (_a->threshold < _b->threshold)
3648                 return -1;
3649
3650         return 0;
3651 }
3652
3653 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3654 {
3655         struct mem_cgroup_eventfd_list *ev;
3656
3657         spin_lock(&memcg_oom_lock);
3658
3659         list_for_each_entry(ev, &memcg->oom_notify, list)
3660                 eventfd_signal(ev->eventfd, 1);
3661
3662         spin_unlock(&memcg_oom_lock);
3663         return 0;
3664 }
3665
3666 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3667 {
3668         struct mem_cgroup *iter;
3669
3670         for_each_mem_cgroup_tree(iter, memcg)
3671                 mem_cgroup_oom_notify_cb(iter);
3672 }
3673
3674 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3675         struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3676 {
3677         struct mem_cgroup_thresholds *thresholds;
3678         struct mem_cgroup_threshold_ary *new;
3679         unsigned long threshold;
3680         unsigned long usage;
3681         int i, size, ret;
3682
3683         ret = page_counter_memparse(args, "-1", &threshold);
3684         if (ret)
3685                 return ret;
3686
3687         mutex_lock(&memcg->thresholds_lock);
3688
3689         if (type == _MEM) {
3690                 thresholds = &memcg->thresholds;
3691                 usage = mem_cgroup_usage(memcg, false);
3692         } else if (type == _MEMSWAP) {
3693                 thresholds = &memcg->memsw_thresholds;
3694                 usage = mem_cgroup_usage(memcg, true);
3695         } else
3696                 BUG();
3697
3698         /* Check if a threshold crossed before adding a new one */
3699         if (thresholds->primary)
3700                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3701
3702         size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3703
3704         /* Allocate memory for new array of thresholds */
3705         new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3706                         GFP_KERNEL);
3707         if (!new) {
3708                 ret = -ENOMEM;
3709                 goto unlock;
3710         }
3711         new->size = size;
3712
3713         /* Copy thresholds (if any) to new array */
3714         if (thresholds->primary) {
3715                 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3716                                 sizeof(struct mem_cgroup_threshold));
3717         }
3718
3719         /* Add new threshold */
3720         new->entries[size - 1].eventfd = eventfd;
3721         new->entries[size - 1].threshold = threshold;
3722
3723         /* Sort thresholds. Registering of new threshold isn't time-critical */
3724         sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3725                         compare_thresholds, NULL);
3726
3727         /* Find current threshold */
3728         new->current_threshold = -1;
3729         for (i = 0; i < size; i++) {
3730                 if (new->entries[i].threshold <= usage) {
3731                         /*
3732                          * new->current_threshold will not be used until
3733                          * rcu_assign_pointer(), so it's safe to increment
3734                          * it here.
3735                          */
3736                         ++new->current_threshold;
3737                 } else
3738                         break;
3739         }
3740
3741         /* Free old spare buffer and save old primary buffer as spare */
3742         kfree(thresholds->spare);
3743         thresholds->spare = thresholds->primary;
3744
3745         rcu_assign_pointer(thresholds->primary, new);
3746
3747         /* To be sure that nobody uses thresholds */
3748         synchronize_rcu();
3749
3750 unlock:
3751         mutex_unlock(&memcg->thresholds_lock);
3752
3753         return ret;
3754 }
3755
3756 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3757         struct eventfd_ctx *eventfd, const char *args)
3758 {
3759         return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
3760 }
3761
3762 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
3763         struct eventfd_ctx *eventfd, const char *args)
3764 {
3765         return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
3766 }
3767
3768 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3769         struct eventfd_ctx *eventfd, enum res_type type)
3770 {
3771         struct mem_cgroup_thresholds *thresholds;
3772         struct mem_cgroup_threshold_ary *new;
3773         unsigned long usage;
3774         int i, j, size;
3775
3776         mutex_lock(&memcg->thresholds_lock);
3777
3778         if (type == _MEM) {
3779                 thresholds = &memcg->thresholds;
3780                 usage = mem_cgroup_usage(memcg, false);
3781         } else if (type == _MEMSWAP) {
3782                 thresholds = &memcg->memsw_thresholds;
3783                 usage = mem_cgroup_usage(memcg, true);
3784         } else
3785                 BUG();
3786
3787         if (!thresholds->primary)
3788                 goto unlock;
3789
3790         /* Check if a threshold crossed before removing */
3791         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3792
3793         /* Calculate new number of threshold */
3794         size = 0;
3795         for (i = 0; i < thresholds->primary->size; i++) {
3796                 if (thresholds->primary->entries[i].eventfd != eventfd)
3797                         size++;
3798         }
3799
3800         new = thresholds->spare;
3801
3802         /* Set thresholds array to NULL if we don't have thresholds */
3803         if (!size) {
3804                 kfree(new);
3805                 new = NULL;
3806                 goto swap_buffers;
3807         }
3808
3809         new->size = size;
3810
3811         /* Copy thresholds and find current threshold */
3812         new->current_threshold = -1;
3813         for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3814                 if (thresholds->primary->entries[i].eventfd == eventfd)
3815                         continue;
3816
3817                 new->entries[j] = thresholds->primary->entries[i];
3818                 if (new->entries[j].threshold <= usage) {
3819                         /*
3820                          * new->current_threshold will not be used
3821                          * until rcu_assign_pointer(), so it's safe to increment
3822                          * it here.
3823                          */
3824                         ++new->current_threshold;
3825                 }
3826                 j++;
3827         }
3828
3829 swap_buffers:
3830         /* Swap primary and spare array */
3831         thresholds->spare = thresholds->primary;
3832         /* If all events are unregistered, free the spare array */
3833         if (!new) {
3834                 kfree(thresholds->spare);
3835                 thresholds->spare = NULL;
3836         }
3837
3838         rcu_assign_pointer(thresholds->primary, new);
3839
3840         /* To be sure that nobody uses thresholds */
3841         synchronize_rcu();
3842 unlock:
3843         mutex_unlock(&memcg->thresholds_lock);
3844 }
3845
3846 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3847         struct eventfd_ctx *eventfd)
3848 {
3849         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
3850 }
3851
3852 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3853         struct eventfd_ctx *eventfd)
3854 {
3855         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
3856 }
3857
3858 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3859         struct eventfd_ctx *eventfd, const char *args)
3860 {
3861         struct mem_cgroup_eventfd_list *event;
3862
3863         event = kmalloc(sizeof(*event), GFP_KERNEL);
3864         if (!event)
3865                 return -ENOMEM;
3866
3867         spin_lock(&memcg_oom_lock);
3868
3869         event->eventfd = eventfd;
3870         list_add(&event->list, &memcg->oom_notify);
3871
3872         /* already in OOM ? */
3873         if (atomic_read(&memcg->under_oom))
3874                 eventfd_signal(eventfd, 1);
3875         spin_unlock(&memcg_oom_lock);
3876
3877         return 0;
3878 }
3879
3880 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
3881         struct eventfd_ctx *eventfd)
3882 {
3883         struct mem_cgroup_eventfd_list *ev, *tmp;
3884
3885         spin_lock(&memcg_oom_lock);
3886
3887         list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
3888                 if (ev->eventfd == eventfd) {
3889                         list_del(&ev->list);
3890                         kfree(ev);
3891                 }
3892         }
3893
3894         spin_unlock(&memcg_oom_lock);
3895 }
3896
3897 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3898 {
3899         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3900
3901         seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3902         seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
3903         return 0;
3904 }
3905
3906 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3907         struct cftype *cft, u64 val)
3908 {
3909         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3910
3911         /* cannot set to root cgroup and only 0 and 1 are allowed */
3912         if (!css->parent || !((val == 0) || (val == 1)))
3913                 return -EINVAL;
3914
3915         memcg->oom_kill_disable = val;
3916         if (!val)
3917                 memcg_oom_recover(memcg);
3918
3919         return 0;
3920 }
3921
3922 #ifdef CONFIG_MEMCG_KMEM
/*
 * Kernel-memory setup for @memcg: run memcg_propagate_kmem() and, when that
 * succeeds, mem_cgroup_sockets_init().  Returns the first non-zero result,
 * or the result of the sockets initialisation.
 */
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
        int err = memcg_propagate_kmem(memcg);

        if (err != 0)
                return err;

        err = mem_cgroup_sockets_init(memcg, ss);
        return err;
}
3933
3934 static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
3935 {
3936         struct cgroup_subsys_state *css;
3937         struct mem_cgroup *parent, *child;
3938         int kmemcg_id;
3939
3940         if (!memcg->kmem_acct_active)
3941                 return;
3942
3943         /*
3944          * Clear the 'active' flag before clearing memcg_caches arrays entries.
3945          * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
3946          * guarantees no cache will be created for this cgroup after we are
3947          * done (see memcg_create_kmem_cache()).
3948          */
3949         memcg->kmem_acct_active = false;
3950
3951         memcg_deactivate_kmem_caches(memcg);
3952
3953         kmemcg_id = memcg->kmemcg_id;
3954         BUG_ON(kmemcg_id < 0);
3955
3956         parent = parent_mem_cgroup(memcg);
3957         if (!parent)
3958                 parent = root_mem_cgroup;
3959
3960         /*
3961          * Change kmemcg_id of this cgroup and all its descendants to the
3962          * parent's id, and then move all entries from this cgroup's list_lrus
3963          * to ones of the parent. After we have finished, all list_lrus
3964          * corresponding to this cgroup are guaranteed to remain empty. The
3965          * ordering is imposed by list_lru_node->lock taken by
3966          * memcg_drain_all_list_lrus().
3967          */
3968         css_for_each_descendant_pre(css, &memcg->css) {
3969                 child = mem_cgroup_from_css(css);
3970                 BUG_ON(child->kmemcg_id != kmemcg_id);
3971                 child->kmemcg_id = parent->kmemcg_id;
3972                 if (!memcg->use_hierarchy)
3973                         break;
3974         }
3975         memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
3976
3977         memcg_free_cache_id(kmemcg_id);
3978 }
3979
3980 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
3981 {
3982         if (memcg->kmem_acct_activated) {
3983                 memcg_destroy_kmem_caches(memcg);
3984                 static_key_slow_dec(&memcg_kmem_enabled_key);
3985                 WARN_ON(page_counter_read(&memcg->kmem));
3986         }
3987         mem_cgroup_sockets_destroy(memcg);
3988 }
3989 #else
/* !CONFIG_MEMCG_KMEM: nothing to initialise, always succeeds. */
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
        return 0;
}
3994
/* !CONFIG_MEMCG_KMEM: no kmem state to deactivate. */
static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
{
}
3998
/* !CONFIG_MEMCG_KMEM: no kmem state to destroy. */
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
}
4002 #endif
4003
4004 /*
4005  * DO NOT USE IN NEW FILES.
4006  *
4007  * "cgroup.event_control" implementation.
4008  *
4009  * This is way over-engineered.  It tries to support fully configurable
4010  * events for each user.  Such level of flexibility is completely
4011  * unnecessary especially in the light of the planned unified hierarchy.
4012  *
4013  * Please deprecate this and replace with something simpler if at all
4014  * possible.
4015  */
4016
4017 /*
4018  * Unregister event and free resources.
4019  *
4020  * Gets called from workqueue.
4021  */
4022 static void memcg_event_remove(struct work_struct *work)
4023 {
4024         struct mem_cgroup_event *event =
4025                 container_of(work, struct mem_cgroup_event, remove);
4026         struct mem_cgroup *memcg = event->memcg;
4027
4028         remove_wait_queue(event->wqh, &event->wait);
4029
4030         event->unregister_event(memcg, event->eventfd);
4031
4032         /* Notify userspace the event is going away. */
4033         eventfd_signal(event->eventfd, 1);
4034
4035         eventfd_ctx_put(event->eventfd);
4036         kfree(event);
4037         css_put(&memcg->css);
4038 }
4039
4040 /*
4041  * Gets called on POLLHUP on eventfd when user closes it.
4042  *
4043  * Called with wqh->lock held and interrupts disabled.
4044  */
4045 static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
4046                             int sync, void *key)
4047 {
4048         struct mem_cgroup_event *event =
4049                 container_of(wait, struct mem_cgroup_event, wait);
4050         struct mem_cgroup *memcg = event->memcg;
4051         unsigned long flags = (unsigned long)key;
4052
4053         if (flags & POLLHUP) {
4054                 /*
4055                  * If the event has been detached at cgroup removal, we
4056                  * can simply return knowing the other side will cleanup
4057                  * for us.
4058                  *
4059                  * We can't race against event freeing since the other
4060                  * side will require wqh->lock via remove_wait_queue(),
4061                  * which we hold.
4062                  */
4063                 spin_lock(&memcg->event_list_lock);
4064                 if (!list_empty(&event->list)) {
4065                         list_del_init(&event->list);
4066                         /*
4067                          * We are in atomic context, but cgroup_event_remove()
4068                          * may sleep, so we have to call it in workqueue.
4069                          */
4070                         schedule_work(&event->remove);
4071                 }
4072                 spin_unlock(&memcg->event_list_lock);
4073         }
4074
4075         return 0;
4076 }
4077
4078 static void memcg_event_ptable_queue_proc(struct file *file,
4079                 wait_queue_head_t *wqh, poll_table *pt)
4080 {
4081         struct mem_cgroup_event *event =
4082                 container_of(pt, struct mem_cgroup_event, pt);
4083
4084         event->wqh = wqh;
4085         add_wait_queue(wqh, &event->wait);
4086 }
4087
4088 /*
4089  * DO NOT USE IN NEW FILES.
4090  *
4091  * Parse input and register new cgroup event handler.
4092  *
4093  * Input must be in format '<event_fd> <control_fd> <args>'.
4094  * Interpretation of args is defined by control file implementation.
4095  */
4096 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4097                                          char *buf, size_t nbytes, loff_t off)
4098 {
4099         struct cgroup_subsys_state *css = of_css(of);
4100         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4101         struct mem_cgroup_event *event;
4102         struct cgroup_subsys_state *cfile_css;
4103         unsigned int efd, cfd;
4104         struct fd efile;
4105         struct fd cfile;
4106         const char *name;
4107         char *endp;
4108         int ret;
4109
4110         buf = strstrip(buf);
4111
4112         efd = simple_strtoul(buf, &endp, 10);
4113         if (*endp != ' ')
4114                 return -EINVAL;
4115         buf = endp + 1;
4116
4117         cfd = simple_strtoul(buf, &endp, 10);
4118         if ((*endp != ' ') && (*endp != '\0'))
4119                 return -EINVAL;
4120         buf = endp + 1;
4121
4122         event = kzalloc(sizeof(*event), GFP_KERNEL);
4123         if (!event)
4124                 return -ENOMEM;
4125
4126         event->memcg = memcg;
4127         INIT_LIST_HEAD(&event->list);
4128         init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4129         init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4130         INIT_WORK(&event->remove, memcg_event_remove);
4131
4132         efile = fdget(efd);
4133         if (!efile.file) {
4134                 ret = -EBADF;
4135                 goto out_kfree;
4136         }
4137
4138         event->eventfd = eventfd_ctx_fileget(efile.file);
4139         if (IS_ERR(event->eventfd)) {
4140                 ret = PTR_ERR(event->eventfd);
4141                 goto out_put_efile;
4142         }
4143
4144         cfile = fdget(cfd);
4145         if (!cfile.file) {
4146                 ret = -EBADF;
4147                 goto out_put_eventfd;
4148         }
4149
4150         /* the process need read permission on control file */
4151         /* AV: shouldn't we check that it's been opened for read instead? */
4152         ret = inode_permission(file_inode(cfile.file), MAY_READ);
4153         if (ret < 0)
4154                 goto out_put_cfile;
4155
4156         /*
4157          * Determine the event callbacks and set them in @event.  This used
4158          * to be done via struct cftype but cgroup core no longer knows
4159          * about these events.  The following is crude but the whole thing
4160          * is for compatibility anyway.
4161          *
4162          * DO NOT ADD NEW FILES.
4163          */
4164         name = cfile.file->f_path.dentry->d_name.name;
4165
4166         if (!strcmp(name, "memory.usage_in_bytes")) {
4167                 event->register_event = mem_cgroup_usage_register_event;
4168                 event->unregister_event = mem_cgroup_usage_unregister_event;
4169         } else if (!strcmp(name, "memory.oom_control")) {
4170                 event->register_event = mem_cgroup_oom_register_event;
4171                 event->unregister_event = mem_cgroup_oom_unregister_event;
4172         } else if (!strcmp(name, "memory.pressure_level")) {
4173                 event->register_event = vmpressure_register_event;
4174                 event->unregister_event = vmpressure_unregister_event;
4175         } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4176                 event->register_event = memsw_cgroup_usage_register_event;
4177                 event->unregister_event = memsw_cgroup_usage_unregister_event;
4178         } else {
4179                 ret = -EINVAL;
4180                 goto out_put_cfile;
4181         }
4182
4183         /*
4184          * Verify @cfile should belong to @css.  Also, remaining events are
4185          * automatically removed on cgroup destruction but the removal is
4186          * asynchronous, so take an extra ref on @css.
4187          */
4188         cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4189                                                &memory_cgrp_subsys);
4190         ret = -EINVAL;
4191         if (IS_ERR(cfile_css))
4192                 goto out_put_cfile;
4193         if (cfile_css != css) {
4194                 css_put(cfile_css);
4195                 goto out_put_cfile;
4196         }
4197
4198         ret = event->register_event(memcg, event->eventfd, buf);
4199         if (ret)
4200                 goto out_put_css;
4201
4202         efile.file->f_op->poll(efile.file, &event->pt);
4203
4204         spin_lock(&memcg->event_list_lock);
4205         list_add(&event->list, &memcg->event_list);
4206         spin_unlock(&memcg->event_list_lock);
4207
4208         fdput(cfile);
4209         fdput(efile);
4210
4211         return nbytes;
4212
4213 out_put_css:
4214         css_put(css);
4215 out_put_cfile:
4216         fdput(cfile);
4217 out_put_eventfd:
4218         eventfd_ctx_put(event->eventfd);
4219 out_put_efile:
4220         fdput(efile);
4221 out_kfree:
4222         kfree(event);
4223
4224         return ret;
4225 }
4226
/*
 * Table of control files for the memory controller.  Each entry names a
 * file and wires up the read/write/seq handlers that serve it; .private
 * carries a MEMFILE_PRIVATE(type, attribute) cookie for the handlers that
 * are shared between several files.  The table ends with an empty entry.
 */
static struct cftype mem_cgroup_legacy_files[] = {
        {
                .name = "usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "max_usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "limit_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "soft_limit_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "failcnt",
                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "stat",
                .seq_show = memcg_stat_show,
        },
        {
                .name = "force_empty",
                .write = mem_cgroup_force_empty_write,
        },
        {
                .name = "use_hierarchy",
                .write_u64 = mem_cgroup_hierarchy_write,
                .read_u64 = mem_cgroup_hierarchy_read,
        },
        {
                /* Write-only file (S_IWUGO), registered without a prefix. */
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
                .flags = CFTYPE_NO_PREFIX,
                .mode = S_IWUGO,
        },
        {
                .name = "swappiness",
                .read_u64 = mem_cgroup_swappiness_read,
                .write_u64 = mem_cgroup_swappiness_write,
        },
        {
                .name = "move_charge_at_immigrate",
                .read_u64 = mem_cgroup_move_charge_read,
                .write_u64 = mem_cgroup_move_charge_write,
        },
        {
                .name = "oom_control",
                .seq_show = mem_cgroup_oom_control_read,
                .write_u64 = mem_cgroup_oom_control_write,
                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
        },
        {
                /*
                 * No read or write handler is set for this file here.
                 * NOTE(review): presumably it is only used as a target of
                 * cgroup.event_control registrations - confirm.
                 */
                .name = "pressure_level",
        },
#ifdef CONFIG_NUMA
        {
                .name = "numa_stat",
                .seq_show = memcg_numa_stat_show,
        },
#endif
#ifdef CONFIG_MEMCG_KMEM
        /* The kmem.* files mirror the memory files above using _KMEM. */
        {
                .name = "kmem.limit_in_bytes",
                .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.usage_in_bytes",
                .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.failcnt",
                .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.max_usage_in_bytes",
                .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
#ifdef CONFIG_SLABINFO
        {
                .name = "kmem.slabinfo",
                .seq_start = slab_start,
                .seq_next = slab_next,
                .seq_stop = slab_stop,
                .seq_show = memcg_slab_show,
        },
#endif
#endif
        { },    /* terminate */
};
4337
4338 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4339 {
4340         struct mem_cgroup_per_node *pn;
4341         struct mem_cgroup_per_zone *mz;
4342         int zone, tmp = node;
4343         /*
4344          * This routine is called against possible nodes.
4345          * But it's BUG to call kmalloc() against offline node.
4346          *
4347          * TODO: this routine can waste much memory for nodes which will
4348          *       never be onlined. It's better to use memory hotplug callback
4349          *       function.
4350          */
4351         if (!node_state(node, N_NORMAL_MEMORY))
4352                 tmp = -1;
4353         pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4354         if (!pn)
4355                 return 1;
4356
4357         for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4358                 mz = &pn->zoneinfo[zone];
4359                 lruvec_init(&mz->lruvec);
4360                 mz->usage_in_excess = 0;
4361                 mz->on_tree = false;
4362                 mz->memcg = memcg;
4363         }
4364         memcg->nodeinfo[node] = pn;
4365         return 0;
4366 }
4367
4368 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4369 {
4370         kfree(memcg->nodeinfo[node]);
4371 }
4372
4373 static struct mem_cgroup *mem_cgroup_alloc(void)
4374 {
4375         struct mem_cgroup *memcg;
4376         size_t size;
4377
4378         size = sizeof(struct mem_cgroup);
4379         size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4380
4381         memcg = kzalloc(size, GFP_KERNEL);
4382         if (!memcg)
4383                 return NULL;
4384
4385         memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4386         if (!memcg->stat)
4387                 goto out_free;
4388         spin_lock_init(&memcg->pcp_counter_lock);
4389         return memcg;
4390
4391 out_free:
4392         kfree(memcg);
4393         return NULL;
4394 }
4395
4396 /*
 * When a mem_cgroup is destroyed, references from swap_cgroup can remain.
 * (scanning all at force_empty is too costly...)
 *
 * Instead of clearing all references at force_empty, we remember
 * the number of references from swap_cgroup and free the mem_cgroup
 * when it goes down to 0.
4403  *
4404  * Removal of cgroup itself succeeds regardless of refs from swap.
4405  */
4406
4407 static void __mem_cgroup_free(struct mem_cgroup *memcg)
4408 {
4409         int node;
4410
4411         mem_cgroup_remove_from_trees(memcg);
4412
4413         for_each_node(node)
4414                 free_mem_cgroup_per_zone_info(memcg, node);
4415
4416         free_percpu(memcg->stat);
4417         kfree(memcg);
4418 }
4419
4420 /*
4421  * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
4422  */
4423 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4424 {
4425         if (!memcg->memory.parent)
4426                 return NULL;
4427         return mem_cgroup_from_counter(memcg->memory.parent, memory);
4428 }
4429 EXPORT_SYMBOL(parent_mem_cgroup);
4430
4431 static struct cgroup_subsys_state * __ref
4432 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4433 {
4434         struct mem_cgroup *memcg;
4435         long error = -ENOMEM;
4436         int node;
4437
4438         memcg = mem_cgroup_alloc();
4439         if (!memcg)
4440                 return ERR_PTR(error);
4441
4442         for_each_node(node)
4443                 if (alloc_mem_cgroup_per_zone_info(memcg, node))
4444                         goto free_out;
4445
4446         /* root ? */
4447         if (parent_css == NULL) {
4448                 root_mem_cgroup = memcg;
4449                 page_counter_init(&memcg->memory, NULL);
4450                 memcg->high = PAGE_COUNTER_MAX;
4451                 memcg->soft_limit = PAGE_COUNTER_MAX;
4452                 page_counter_init(&memcg->memsw, NULL);
4453                 page_counter_init(&memcg->kmem, NULL);
4454         }
4455
4456         memcg->last_scanned_node = MAX_NUMNODES;
4457         INIT_LIST_HEAD(&memcg->oom_notify);
4458         memcg->move_charge_at_immigrate = 0;
4459         mutex_init(&memcg->thresholds_lock);
4460         spin_lock_init(&memcg->move_lock);
4461         vmpressure_init(&memcg->vmpressure);
4462         INIT_LIST_HEAD(&memcg->event_list);
4463         spin_lock_init(&memcg->event_list_lock);
4464 #ifdef CONFIG_MEMCG_KMEM
4465         memcg->kmemcg_id = -1;
4466 #endif
4467
4468         return &memcg->css;
4469
4470 free_out:
4471         __mem_cgroup_free(memcg);
4472         return ERR_PTR(error);
4473 }
4474
4475 static int
4476 mem_cgroup_css_online(struct cgroup_subsys_state *css)
4477 {
4478         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4479         struct mem_cgroup *parent = mem_cgroup_from_css(css->parent);
4480         int ret;
4481
4482         if (css->id > MEM_CGROUP_ID_MAX)
4483                 return -ENOSPC;
4484
4485         if (!parent)
4486                 return 0;
4487
4488         mutex_lock(&memcg_create_mutex);
4489
4490         memcg->use_hierarchy = parent->use_hierarchy;
4491         memcg->oom_kill_disable = parent->oom_kill_disable;
4492         memcg->swappiness = mem_cgroup_swappiness(parent);
4493
4494         if (parent->use_hierarchy) {
4495                 page_counter_init(&memcg->memory, &parent->memory);
4496                 memcg->high = PAGE_COUNTER_MAX;
4497                 memcg->soft_limit = PAGE_COUNTER_MAX;
4498                 page_counter_init(&memcg->memsw, &parent->memsw);
4499                 page_counter_init(&memcg->kmem, &parent->kmem);
4500
4501                 /*
4502                  * No need to take a reference to the parent because cgroup
4503                  * core guarantees its existence.
4504                  */
4505         } else {
4506                 page_counter_init(&memcg->memory, NULL);
4507                 memcg->high = PAGE_COUNTER_MAX;
4508                 memcg->soft_limit = PAGE_COUNTER_MAX;
4509                 page_counter_init(&memcg->memsw, NULL);
4510                 page_counter_init(&memcg->kmem, NULL);
4511                 /*
4512                  * Deeper hierachy with use_hierarchy == false doesn't make
4513                  * much sense so let cgroup subsystem know about this
4514                  * unfortunate state in our controller.
4515                  */
4516                 if (parent != root_mem_cgroup)
4517                         memory_cgrp_subsys.broken_hierarchy = true;
4518         }
4519         mutex_unlock(&memcg_create_mutex);
4520
4521         ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
4522         if (ret)
4523                 return ret;
4524
4525         /*
4526          * Make sure the memcg is initialized: mem_cgroup_iter()
4527          * orders reading memcg->initialized against its callers
4528          * reading the memcg members.
4529          */
4530         smp_store_release(&memcg->initialized, 1);
4531
4532         return 0;
4533 }
4534
4535 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4536 {
4537         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4538         struct mem_cgroup_event *event, *tmp;
4539
4540         /*
4541          * Unregister events and notify userspace.
4542          * Notify userspace about cgroup removing only after rmdir of cgroup
4543          * directory to avoid race between userspace and kernelspace.
4544          */
4545         spin_lock(&memcg->event_list_lock);
4546         list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
4547                 list_del_init(&event->list);
4548                 schedule_work(&event->remove);
4549         }
4550         spin_unlock(&memcg->event_list_lock);
4551
4552         vmpressure_cleanup(&memcg->vmpressure);
4553
4554         memcg_deactivate_kmem(memcg);
4555 }
4556
/* Destroy the kmem state of the memcg behind @css, then free the memcg. */
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
        struct mem_cgroup *memcg;

        memcg = mem_cgroup_from_css(css);

        memcg_destroy_kmem(memcg);
        __mem_cgroup_free(memcg);
}
4564
4565 /**
4566  * mem_cgroup_css_reset - reset the states of a mem_cgroup
4567  * @css: the target css
4568  *
4569  * Reset the states of the mem_cgroup associated with @css.  This is
4570  * invoked when the userland requests disabling on the default hierarchy
4571  * but the memcg is pinned through dependency.  The memcg should stop
4572  * applying policies and should revert to the vanilla state as it may be
4573  * made visible again.
4574  *
4575  * The current implementation only resets the essential configurations.
4576  * This needs to be expanded to cover all the visible parts.
4577  */
4578 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4579 {
4580         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4581
4582         mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
4583         mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
4584         memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
4585         memcg->low = 0;
4586         memcg->high = PAGE_COUNTER_MAX;
4587         memcg->soft_limit = PAGE_COUNTER_MAX;
4588 }
4589
4590 #ifdef CONFIG_MMU
4591 /* Handlers for move charge at task migration. */
4592 static int mem_cgroup_do_precharge(unsigned long count)
4593 {
4594         int ret;
4595
4596         /* Try a single bulk charge without reclaim first */
4597         ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
4598         if (!ret) {
4599                 mc.precharge += count;
4600                 return ret;
4601         }
4602         if (ret == -EINTR) {
4603                 cancel_charge(root_mem_cgroup, count);
4604                 return ret;
4605         }
4606
4607         /* Try charges one by one with reclaim */
4608         while (count--) {
4609                 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
4610                 /*
4611                  * In case of failure, any residual charges against
4612                  * mc.to will be dropped by mem_cgroup_clear_mc()
4613                  * later on.  However, cancel any charges that are
4614                  * bypassed to root right away or they'll be lost.
4615                  */
4616                 if (ret == -EINTR)
4617                         cancel_charge(root_mem_cgroup, 1);
4618                 if (ret)
4619                         return ret;
4620                 mc.precharge++;
4621                 cond_resched();
4622         }
4623         return 0;
4624 }
4625
4626 /**
4627  * get_mctgt_type - get target type of moving charge
4628  * @vma: the vma the pte to be checked belongs
4629  * @addr: the address corresponding to the pte to be checked
4630  * @ptent: the pte to be checked
4631  * @target: the pointer the target page or swap ent will be stored(can be NULL)
4632  *
4633  * Returns
4634  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
4635  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4636  *     move charge. if @target is not NULL, the page is stored in target->page
4637  *     with extra refcnt got(Callers should handle it).
4638  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4639  *     target for charge migration. if @target is not NULL, the entry is stored
4640  *     in target->ent.
4641  *
4642  * Called with pte lock held.
4643  */
/*
 * Result slot filled in by get_mctgt_type()/get_mctgt_type_thp(): @page is
 * set when MC_TARGET_PAGE is returned, @ent when MC_TARGET_SWAP is
 * returned.  Only one member is meaningful at a time.
 */
union mc_target {
        struct page     *page;
        swp_entry_t     ent;
};
4648
/* Classification of a pte/pmd with respect to charge moving. */
enum mc_target_type {
        MC_TARGET_NONE = 0,     /* not a target for move charge */
        MC_TARGET_PAGE,         /* a page charged to mc.from */
        MC_TARGET_SWAP,         /* a swap entry recorded against mc.from */
};
4654
4655 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4656                                                 unsigned long addr, pte_t ptent)
4657 {
4658         struct page *page = vm_normal_page(vma, addr, ptent);
4659
4660         if (!page || !page_mapped(page))
4661                 return NULL;
4662         if (PageAnon(page)) {
4663                 if (!(mc.flags & MOVE_ANON))
4664                         return NULL;
4665         } else {
4666                 if (!(mc.flags & MOVE_FILE))
4667                         return NULL;
4668         }
4669         if (!get_page_unless_zero(page))
4670                 return NULL;
4671
4672         return page;
4673 }
4674
4675 #ifdef CONFIG_SWAP
4676 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4677                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
4678 {
4679         struct page *page = NULL;
4680         swp_entry_t ent = pte_to_swp_entry(ptent);
4681
4682         if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4683                 return NULL;
4684         /*
4685          * Because lookup_swap_cache() updates some statistics counter,
4686          * we call find_get_page() with swapper_space directly.
4687          */
4688         page = find_get_page(swap_address_space(ent), ent.val);
4689         if (do_swap_account)
4690                 entry->val = ent.val;
4691
4692         return page;
4693 }
4694 #else
4695 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4696                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
4697 {
4698         return NULL;
4699 }
4700 #endif
4701
4702 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4703                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
4704 {
4705         struct page *page = NULL;
4706         struct address_space *mapping;
4707         pgoff_t pgoff;
4708
4709         if (!vma->vm_file) /* anonymous vma */
4710                 return NULL;
4711         if (!(mc.flags & MOVE_FILE))
4712                 return NULL;
4713
4714         mapping = vma->vm_file->f_mapping;
4715         pgoff = linear_page_index(vma, addr);
4716
4717         /* page is moved even if it's not RSS of this task(page-faulted). */
4718 #ifdef CONFIG_SWAP
4719         /* shmem/tmpfs may report page out on swap: account for that too. */
4720         if (shmem_mapping(mapping)) {
4721                 page = find_get_entry(mapping, pgoff);
4722                 if (radix_tree_exceptional_entry(page)) {
4723                         swp_entry_t swp = radix_to_swp_entry(page);
4724                         if (do_swap_account)
4725                                 *entry = swp;
4726                         page = find_get_page(swap_address_space(swp), swp.val);
4727                 }
4728         } else
4729                 page = find_get_page(mapping, pgoff);
4730 #else
4731         page = find_get_page(mapping, pgoff);
4732 #endif
4733         return page;
4734 }
4735
/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @nr_pages: number of regular pages (>1 for huge pages)
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm following.
 * - page is not on LRU (isolate_page() is useful.)
 * - compound_lock is held when nr_pages > 1
 *
 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
 * from old cgroup.
 *
 * Returns 0 on success, -EBUSY if @nr_pages > 1 but @page is not a
 * transparent huge page or if the page lock could not be taken without
 * blocking, and -EINVAL if @page is not currently accounted to @from.
 */
static int mem_cgroup_move_account(struct page *page,
                                   unsigned int nr_pages,
                                   struct mem_cgroup *from,
                                   struct mem_cgroup *to)
{
        unsigned long flags;
        int ret;

        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
        /*
         * The page is isolated from LRU. So, collapse function
         * will not handle this page. But page splitting can happen.
         * Do this check under compound_page_lock(). The caller should
         * hold it.
         */
        ret = -EBUSY;
        if (nr_pages > 1 && !PageTransHuge(page))
                goto out;

        /*
         * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
         * of its source page while we change it: page migration takes
         * both pages off the LRU, but page cache replacement doesn't.
         */
        if (!trylock_page(page))
                goto out;

        ret = -EINVAL;
        if (page->mem_cgroup != from)
                goto out_unlock;

        /*
         * The per-state counters and page->mem_cgroup are switched over
         * together under from->move_lock with interrupts disabled.
         */
        spin_lock_irqsave(&from->move_lock, flags);

        /* Mapped file pages: transfer the FILE_MAPPED count. */
        if (!PageAnon(page) && page_mapped(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
                __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
        }

        /* Pages under writeback: transfer the WRITEBACK count. */
        if (PageWriteback(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
                               nr_pages);
                __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
                               nr_pages);
        }

        /*
         * It is safe to change page->mem_cgroup here because the page
         * is referenced, charged, and isolated - we can't race with
         * uncharging, charging, migration, or LRU putback.
         */

        /* caller should have done css_get */
        page->mem_cgroup = to;
        spin_unlock_irqrestore(&from->move_lock, flags);

        ret = 0;

        /*
         * Update the charge statistics and run the event checks for both
         * groups, inside the event_lock local lock section.
         */
        local_lock_irq(event_lock);
        mem_cgroup_charge_statistics(to, page, nr_pages);
        memcg_check_events(to, page);
        mem_cgroup_charge_statistics(from, page, -nr_pages);
        memcg_check_events(from, page);
        local_unlock_irq(event_lock);
out_unlock:
        unlock_page(page);
out:
        return ret;
}
4821
4822 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4823                 unsigned long addr, pte_t ptent, union mc_target *target)
4824 {
4825         struct page *page = NULL;
4826         enum mc_target_type ret = MC_TARGET_NONE;
4827         swp_entry_t ent = { .val = 0 };
4828
4829         if (pte_present(ptent))
4830                 page = mc_handle_present_pte(vma, addr, ptent);
4831         else if (is_swap_pte(ptent))
4832                 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4833         else if (pte_none(ptent))
4834                 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4835
4836         if (!page && !ent.val)
4837                 return ret;
4838         if (page) {
4839                 /*
4840                  * Do only loose check w/o serialization.
4841                  * mem_cgroup_move_account() checks the page is valid or
4842                  * not under LRU exclusion.
4843                  */
4844                 if (page->mem_cgroup == mc.from) {
4845                         ret = MC_TARGET_PAGE;
4846                         if (target)
4847                                 target->page = page;
4848                 }
4849                 if (!ret || !target)
4850                         put_page(page);
4851         }
4852         /* There is a swap entry and a page doesn't exist or isn't charged */
4853         if (ent.val && !ret &&
4854             mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
4855                 ret = MC_TARGET_SWAP;
4856                 if (target)
4857                         target->ent = ent;
4858         }
4859         return ret;
4860 }
4861
4862 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4863 /*
4864  * We don't consider swapping or file mapped pages because THP does not
4865  * support them for now.
4866  * Caller should make sure that pmd_trans_huge(pmd) is true.
4867  */
4868 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4869                 unsigned long addr, pmd_t pmd, union mc_target *target)
4870 {
4871         struct page *page = NULL;
4872         enum mc_target_type ret = MC_TARGET_NONE;
4873
4874         page = pmd_page(pmd);
4875         VM_BUG_ON_PAGE(!page || !PageHead(page), page);
4876         if (!(mc.flags & MOVE_ANON))
4877                 return ret;
4878         if (page->mem_cgroup == mc.from) {
4879                 ret = MC_TARGET_PAGE;
4880                 if (target) {
4881                         get_page(page);
4882                         target->page = page;
4883                 }
4884         }
4885         return ret;
4886 }
4887 #else
4888 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4889                 unsigned long addr, pmd_t pmd, union mc_target *target)
4890 {
4891         return MC_TARGET_NONE;
4892 }
4893 #endif
4894
4895 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4896                                         unsigned long addr, unsigned long end,
4897                                         struct mm_walk *walk)
4898 {
4899         struct vm_area_struct *vma = walk->vma;
4900         pte_t *pte;
4901         spinlock_t *ptl;
4902
4903         if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
4904                 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
4905                         mc.precharge += HPAGE_PMD_NR;
4906                 spin_unlock(ptl);
4907                 return 0;
4908         }
4909
4910         if (pmd_trans_unstable(pmd))
4911                 return 0;
4912         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4913         for (; addr != end; pte++, addr += PAGE_SIZE)
4914                 if (get_mctgt_type(vma, addr, *pte, NULL))
4915                         mc.precharge++; /* increment precharge temporarily */
4916         pte_unmap_unlock(pte - 1, ptl);
4917         cond_resched();
4918
4919         return 0;
4920 }
4921
4922 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4923 {
4924         unsigned long precharge;
4925
4926         struct mm_walk mem_cgroup_count_precharge_walk = {
4927                 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4928                 .mm = mm,
4929         };
4930         down_read(&mm->mmap_sem);
4931         walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
4932         up_read(&mm->mmap_sem);
4933
4934         precharge = mc.precharge;
4935         mc.precharge = 0;
4936
4937         return precharge;
4938 }
4939
4940 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4941 {
4942         unsigned long precharge = mem_cgroup_count_precharge(mm);
4943
4944         VM_BUG_ON(mc.moving_task);
4945         mc.moving_task = current;
4946         return mem_cgroup_do_precharge(precharge);
4947 }
4948
4949 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4950 static void __mem_cgroup_clear_mc(void)
4951 {
4952         struct mem_cgroup *from = mc.from;
4953         struct mem_cgroup *to = mc.to;
4954
4955         /* we must uncharge all the leftover precharges from mc.to */
4956         if (mc.precharge) {
4957                 cancel_charge(mc.to, mc.precharge);
4958                 mc.precharge = 0;
4959         }
4960         /*
4961          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4962          * we must uncharge here.
4963          */
4964         if (mc.moved_charge) {
4965                 cancel_charge(mc.from, mc.moved_charge);
4966                 mc.moved_charge = 0;
4967         }
4968         /* we must fixup refcnts and charges */
4969         if (mc.moved_swap) {
4970                 /* uncharge swap account from the old cgroup */
4971                 if (!mem_cgroup_is_root(mc.from))
4972                         page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
4973
4974                 /*
4975                  * we charged both to->memory and to->memsw, so we
4976                  * should uncharge to->memory.
4977                  */
4978                 if (!mem_cgroup_is_root(mc.to))
4979                         page_counter_uncharge(&mc.to->memory, mc.moved_swap);
4980
4981                 css_put_many(&mc.from->css, mc.moved_swap);
4982
4983                 /* we've already done css_get(mc.to) */
4984                 mc.moved_swap = 0;
4985         }
4986         memcg_oom_recover(from);
4987         memcg_oom_recover(to);
4988         wake_up_all(&mc.waitq);
4989 }
4990
/*
 * End a charge-moving session: clear mc.moving_task, undo the leftover
 * charges via __mem_cgroup_clear_mc() and reset mc.from/mc.to under
 * mc.lock.
 */
static void mem_cgroup_clear_mc(void)
{
        /*
         * we must clear moving_task before waking up waiters at the end of
         * task migration.
         */
        mc.moving_task = NULL;
        __mem_cgroup_clear_mc();
        /* mc.from and mc.to are only changed while holding mc.lock. */
        spin_lock(&mc.lock);
        mc.from = NULL;
        mc.to = NULL;
        spin_unlock(&mc.lock);
}
5004
5005 static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5006                                  struct cgroup_taskset *tset)
5007 {
5008         struct task_struct *p = cgroup_taskset_first(tset);
5009         int ret = 0;
5010         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5011         unsigned long move_flags;
5012
5013         /*
5014          * We are now commited to this value whatever it is. Changes in this
5015          * tunable will only affect upcoming migrations, not the current one.
5016          * So we need to save it, and keep it going.
5017          */
5018         move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5019         if (move_flags) {
5020                 struct mm_struct *mm;
5021                 struct mem_cgroup *from = mem_cgroup_from_task(p);
5022
5023                 VM_BUG_ON(from == memcg);
5024
5025                 mm = get_task_mm(p);
5026                 if (!mm)
5027                         return 0;
5028                 /* We move charges only when we move a owner of the mm */
5029                 if (mm->owner == p) {
5030                         VM_BUG_ON(mc.from);
5031                         VM_BUG_ON(mc.to);
5032                         VM_BUG_ON(mc.precharge);
5033                         VM_BUG_ON(mc.moved_charge);
5034                         VM_BUG_ON(mc.moved_swap);
5035
5036                         spin_lock(&mc.lock);
5037                         mc.from = from;
5038                         mc.to = memcg;
5039                         mc.flags = move_flags;
5040                         spin_unlock(&mc.lock);
5041                         /* We set mc.moving_task later */
5042
5043                         ret = mem_cgroup_precharge_mc(mm);
5044                         if (ret)
5045                                 mem_cgroup_clear_mc();
5046                 }
5047                 mmput(mm);
5048         }
5049         return ret;
5050 }
5051
5052 static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
5053                                      struct cgroup_taskset *tset)
5054 {
5055         if (mc.to)
5056                 mem_cgroup_clear_mc();
5057 }
5058
5059 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5060                                 unsigned long addr, unsigned long end,
5061                                 struct mm_walk *walk)
5062 {
5063         int ret = 0;
5064         struct vm_area_struct *vma = walk->vma;
5065         pte_t *pte;
5066         spinlock_t *ptl;
5067         enum mc_target_type target_type;
5068         union mc_target target;
5069         struct page *page;
5070
5071         /*
5072          * We don't take compound_lock() here but no race with splitting thp
5073          * happens because:
5074          *  - if pmd_trans_huge_lock() returns 1, the relevant thp is not
5075          *    under splitting, which means there's no concurrent thp split,
5076          *  - if another thread runs into split_huge_page() just after we
5077          *    entered this if-block, the thread must wait for page table lock
5078          *    to be unlocked in __split_huge_page_splitting(), where the main
5079          *    part of thp split is not executed yet.
5080          */
5081         if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
5082                 if (mc.precharge < HPAGE_PMD_NR) {
5083                         spin_unlock(ptl);
5084                         return 0;
5085                 }
5086                 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5087                 if (target_type == MC_TARGET_PAGE) {
5088                         page = target.page;
5089                         if (!isolate_lru_page(page)) {
5090                                 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5091                                                              mc.from, mc.to)) {
5092                                         mc.precharge -= HPAGE_PMD_NR;
5093                                         mc.moved_charge += HPAGE_PMD_NR;
5094                                 }
5095                                 putback_lru_page(page);
5096                         }
5097                         put_page(page);
5098                 }
5099                 spin_unlock(ptl);
5100                 return 0;
5101         }
5102
5103         if (pmd_trans_unstable(pmd))
5104                 return 0;
5105 retry:
5106         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5107         for (; addr != end; addr += PAGE_SIZE) {
5108                 pte_t ptent = *(pte++);
5109                 swp_entry_t ent;
5110
5111                 if (!mc.precharge)
5112                         break;
5113
5114                 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5115                 case MC_TARGET_PAGE:
5116                         page = target.page;
5117                         if (isolate_lru_page(page))
5118                                 goto put;
5119                         if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
5120                                 mc.precharge--;
5121                                 /* we uncharge from mc.from later. */
5122                                 mc.moved_charge++;
5123                         }
5124                         putback_lru_page(page);
5125 put:                    /* get_mctgt_type() gets the page */
5126                         put_page(page);
5127                         break;
5128                 case MC_TARGET_SWAP:
5129                         ent = target.ent;
5130                         if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5131                                 mc.precharge--;
5132                                 /* we fixup refcnts and charges later. */
5133                                 mc.moved_swap++;
5134                         }
5135                         break;
5136                 default:
5137                         break;
5138                 }
5139         }
5140         pte_unmap_unlock(pte - 1, ptl);
5141         cond_resched();
5142
5143         if (addr != end) {
5144                 /*
5145                  * We have consumed all precharges we got in can_attach().
5146                  * We try charge one by one, but don't do any additional
5147                  * charges to mc.to if we have failed in charge once in attach()
5148                  * phase.
5149                  */
5150                 ret = mem_cgroup_do_precharge(1);
5151                 if (!ret)
5152                         goto retry;
5153         }
5154
5155         return ret;
5156 }
5157
5158 static void mem_cgroup_move_charge(struct mm_struct *mm)
5159 {
5160         struct mm_walk mem_cgroup_move_charge_walk = {
5161                 .pmd_entry = mem_cgroup_move_charge_pte_range,
5162                 .mm = mm,
5163         };
5164
5165         lru_add_drain_all();
5166         /*
5167          * Signal mem_cgroup_begin_page_stat() to take the memcg's
5168          * move_lock while we're moving its pages to another memcg.
5169          * Then wait for already started RCU-only updates to finish.
5170          */
5171         atomic_inc(&mc.from->moving_account);
5172         synchronize_rcu();
5173 retry:
5174         if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5175                 /*
5176                  * Someone who are holding the mmap_sem might be waiting in
5177                  * waitq. So we cancel all extra charges, wake up all waiters,
5178                  * and retry. Because we cancel precharges, we might not be able
5179                  * to move enough charges, but moving charge is a best-effort
5180                  * feature anyway, so it wouldn't be a big problem.
5181                  */
5182                 __mem_cgroup_clear_mc();
5183                 cond_resched();
5184                 goto retry;
5185         }
5186         /*
5187          * When we have consumed all precharges and failed in doing
5188          * additional charge, the page walk just aborts.
5189          */
5190         walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
5191         up_read(&mm->mmap_sem);
5192         atomic_dec(&mc.from->moving_account);
5193 }
5194
5195 static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
5196                                  struct cgroup_taskset *tset)
5197 {
5198         struct task_struct *p = cgroup_taskset_first(tset);
5199         struct mm_struct *mm = get_task_mm(p);
5200
5201         if (mm) {
5202                 if (mc.to)
5203                         mem_cgroup_move_charge(mm);
5204                 mmput(mm);
5205         }
5206         if (mc.to)
5207                 mem_cgroup_clear_mc();
5208 }
5209 #else   /* !CONFIG_MMU */
/* !CONFIG_MMU stub for ->can_attach(): does nothing and always returns 0. */
static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
                                 struct cgroup_taskset *tset)
{
        return 0;
}
/* !CONFIG_MMU stub for ->cancel_attach(): intentionally empty. */
static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
                                     struct cgroup_taskset *tset)
{
}
/* !CONFIG_MMU stub for ->attach(): intentionally empty. */
static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
                                 struct cgroup_taskset *tset)
{
}
5223 #endif
5224
5225 /*
5226  * Cgroup retains root cgroups across [un]mount cycles making it necessary
5227  * to verify whether we're attached to the default hierarchy on each mount
5228  * attempt.
5229  */
5230 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5231 {
5232         /*
5233          * use_hierarchy is forced on the default hierarchy.  cgroup core
5234          * guarantees that @root doesn't have any children, so turning it
5235          * on for the root memcg is enough.
5236          */
5237         if (cgroup_on_dfl(root_css->cgroup))
5238                 root_mem_cgroup->use_hierarchy = true;
5239         else
5240                 root_mem_cgroup->use_hierarchy = false;
5241 }
5242
5243 static u64 memory_current_read(struct cgroup_subsys_state *css,
5244                                struct cftype *cft)
5245 {
5246         return mem_cgroup_usage(mem_cgroup_from_css(css), false);
5247 }
5248
5249 static int memory_low_show(struct seq_file *m, void *v)
5250 {
5251         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5252         unsigned long low = READ_ONCE(memcg->low);
5253
5254         if (low == PAGE_COUNTER_MAX)
5255                 seq_puts(m, "max\n");
5256         else
5257                 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5258
5259         return 0;
5260 }
5261
5262 static ssize_t memory_low_write(struct kernfs_open_file *of,
5263                                 char *buf, size_t nbytes, loff_t off)
5264 {
5265         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5266         unsigned long low;
5267         int err;
5268
5269         buf = strstrip(buf);
5270         err = page_counter_memparse(buf, "max", &low);
5271         if (err)
5272                 return err;
5273
5274         memcg->low = low;
5275
5276         return nbytes;
5277 }
5278
5279 static int memory_high_show(struct seq_file *m, void *v)
5280 {
5281         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5282         unsigned long high = READ_ONCE(memcg->high);
5283
5284         if (high == PAGE_COUNTER_MAX)
5285                 seq_puts(m, "max\n");
5286         else
5287                 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5288
5289         return 0;
5290 }
5291
5292 static ssize_t memory_high_write(struct kernfs_open_file *of,
5293                                  char *buf, size_t nbytes, loff_t off)
5294 {
5295         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5296         unsigned long high;
5297         int err;
5298
5299         buf = strstrip(buf);
5300         err = page_counter_memparse(buf, "max", &high);
5301         if (err)
5302                 return err;
5303
5304         memcg->high = high;
5305
5306         return nbytes;
5307 }
5308
5309 static int memory_max_show(struct seq_file *m, void *v)
5310 {
5311         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5312         unsigned long max = READ_ONCE(memcg->memory.limit);
5313
5314         if (max == PAGE_COUNTER_MAX)
5315                 seq_puts(m, "max\n");
5316         else
5317                 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5318
5319         return 0;
5320 }
5321
5322 static ssize_t memory_max_write(struct kernfs_open_file *of,
5323                                 char *buf, size_t nbytes, loff_t off)
5324 {
5325         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5326         unsigned long max;
5327         int err;
5328
5329         buf = strstrip(buf);
5330         err = page_counter_memparse(buf, "max", &max);
5331         if (err)
5332                 return err;
5333
5334         err = mem_cgroup_resize_limit(memcg, max);
5335         if (err)
5336                 return err;
5337
5338         return nbytes;
5339 }
5340
5341 static int memory_events_show(struct seq_file *m, void *v)
5342 {
5343         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5344
5345         seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
5346         seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
5347         seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
5348         seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
5349
5350         return 0;
5351 }
5352
5353 static struct cftype memory_files[] = {
5354         {
5355                 .name = "current",
5356                 .read_u64 = memory_current_read,
5357         },
5358         {
5359                 .name = "low",
5360                 .flags = CFTYPE_NOT_ON_ROOT,
5361                 .seq_show = memory_low_show,
5362                 .write = memory_low_write,
5363         },
5364         {
5365                 .name = "high",
5366                 .flags = CFTYPE_NOT_ON_ROOT,
5367                 .seq_show = memory_high_show,
5368                 .write = memory_high_write,
5369         },
5370         {
5371                 .name = "max",
5372                 .flags = CFTYPE_NOT_ON_ROOT,
5373                 .seq_show = memory_max_show,
5374                 .write = memory_max_write,
5375         },
5376         {
5377                 .name = "events",
5378                 .flags = CFTYPE_NOT_ON_ROOT,
5379                 .seq_show = memory_events_show,
5380         },
5381         { }     /* terminate */
5382 };
5383
5384 struct cgroup_subsys memory_cgrp_subsys = {
5385         .css_alloc = mem_cgroup_css_alloc,
5386         .css_online = mem_cgroup_css_online,
5387         .css_offline = mem_cgroup_css_offline,
5388         .css_free = mem_cgroup_css_free,
5389         .css_reset = mem_cgroup_css_reset,
5390         .can_attach = mem_cgroup_can_attach,
5391         .cancel_attach = mem_cgroup_cancel_attach,
5392         .attach = mem_cgroup_move_task,
5393         .bind = mem_cgroup_bind,
5394         .dfl_cftypes = memory_files,
5395         .legacy_cftypes = mem_cgroup_legacy_files,
5396         .early_init = 0,
5397 };
5398
/**
 * mem_cgroup_events - count memory events against a cgroup
 * @memcg: the memory cgroup
 * @idx: the event index
 * @nr: the number of events to account for
 *
 * Adds @nr to the per-cpu event counter selected by @idx in
 * @memcg->stat, on the CPU this happens to run on.
 */
void mem_cgroup_events(struct mem_cgroup *memcg,
                       enum mem_cgroup_events_index idx,
                       unsigned int nr)
{
        this_cpu_add(memcg->stat->events[idx], nr);
}
5411
5412 /**
5413  * mem_cgroup_low - check if memory consumption is below the normal range
5414  * @root: the highest ancestor to consider
5415  * @memcg: the memory cgroup to check
5416  *
5417  * Returns %true if memory consumption of @memcg, and that of all
5418  * configurable ancestors up to @root, is below the normal range.
5419  */
5420 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5421 {
5422         if (mem_cgroup_disabled())
5423                 return false;
5424
5425         /*
5426          * The toplevel group doesn't have a configurable range, so
5427          * it's never low when looked at directly, and it is not
5428          * considered an ancestor when assessing the hierarchy.
5429          */
5430
5431         if (memcg == root_mem_cgroup)
5432                 return false;
5433
5434         if (page_counter_read(&memcg->memory) >= memcg->low)
5435                 return false;
5436
5437         while (memcg != root) {
5438                 memcg = parent_mem_cgroup(memcg);
5439
5440                 if (memcg == root_mem_cgroup)
5441                         break;
5442
5443                 if (page_counter_read(&memcg->memory) >= memcg->low)
5444                         return false;
5445         }
5446         return true;
5447 }
5448
5449 /**
5450  * mem_cgroup_try_charge - try charging a page
5451  * @page: page to charge
5452  * @mm: mm context of the victim
5453  * @gfp_mask: reclaim mode
5454  * @memcgp: charged memcg return
5455  *
5456  * Try to charge @page to the memcg that @mm belongs to, reclaiming
5457  * pages according to @gfp_mask if necessary.
5458  *
5459  * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5460  * Otherwise, an error code is returned.
5461  *
5462  * After page->mapping has been set up, the caller must finalize the
5463  * charge with mem_cgroup_commit_charge().  Or abort the transaction
5464  * with mem_cgroup_cancel_charge() in case page instantiation fails.
5465  */
5466 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5467                           gfp_t gfp_mask, struct mem_cgroup **memcgp)
5468 {
5469         struct mem_cgroup *memcg = NULL;
5470         unsigned int nr_pages = 1;
5471         int ret = 0;
5472
5473         if (mem_cgroup_disabled())
5474                 goto out;
5475
5476         if (PageSwapCache(page)) {
5477                 /*
5478                  * Every swap fault against a single page tries to charge the
5479                  * page, bail as early as possible.  shmem_unuse() encounters
5480                  * already charged pages, too.  The USED bit is protected by
5481                  * the page lock, which serializes swap cache removal, which
5482                  * in turn serializes uncharging.
5483                  */
5484                 if (page->mem_cgroup)
5485                         goto out;
5486         }
5487
5488         if (PageTransHuge(page)) {
5489                 nr_pages <<= compound_order(page);
5490                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5491         }
5492
5493         if (do_swap_account && PageSwapCache(page))
5494                 memcg = try_get_mem_cgroup_from_page(page);
5495         if (!memcg)
5496                 memcg = get_mem_cgroup_from_mm(mm);
5497
5498         ret = try_charge(memcg, gfp_mask, nr_pages);
5499
5500         css_put(&memcg->css);
5501
5502         if (ret == -EINTR) {
5503                 memcg = root_mem_cgroup;
5504                 ret = 0;
5505         }
5506 out:
5507         *memcgp = memcg;
5508         return ret;
5509 }
5510
5511 /**
5512  * mem_cgroup_commit_charge - commit a page charge
5513  * @page: page to charge
5514  * @memcg: memcg to charge the page to
5515  * @lrucare: page might be on LRU already
5516  *
5517  * Finalize a charge transaction started by mem_cgroup_try_charge(),
5518  * after page->mapping has been set up.  This must happen atomically
5519  * as part of the page instantiation, i.e. under the page table lock
5520  * for anonymous pages, under the page lock for page and swap cache.
5521  *
5522  * In addition, the page must not be on the LRU during the commit, to
5523  * prevent racing with task migration.  If it might be, use @lrucare.
5524  *
5525  * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
5526  */
5527 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5528                               bool lrucare)
5529 {
5530         unsigned int nr_pages = 1;
5531
5532         VM_BUG_ON_PAGE(!page->mapping, page);
5533         VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
5534
5535         if (mem_cgroup_disabled())
5536                 return;
5537         /*
5538          * Swap faults will attempt to charge the same page multiple
5539          * times.  But reuse_swap_page() might have removed the page
5540          * from swapcache already, so we can't check PageSwapCache().
5541          */
5542         if (!memcg)
5543                 return;
5544
5545         commit_charge(page, memcg, lrucare);
5546
5547         if (PageTransHuge(page)) {
5548                 nr_pages <<= compound_order(page);
5549                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5550         }
5551
5552         local_lock_irq(event_lock);
5553         mem_cgroup_charge_statistics(memcg, page, nr_pages);
5554         memcg_check_events(memcg, page);
5555         local_unlock_irq(event_lock);
5556
5557         if (do_swap_account && PageSwapCache(page)) {
5558                 swp_entry_t entry = { .val = page_private(page) };
5559                 /*
5560                  * The swap entry might not get freed for a long time,
5561                  * let's not wait for it.  The page already received a
5562                  * memory+swap charge, drop the swap entry duplicate.
5563                  */
5564                 mem_cgroup_uncharge_swap(entry);
5565         }
5566 }
5567
/**
 * mem_cgroup_cancel_charge - cancel a page charge
 * @page: page to charge
 * @memcg: memcg to charge the page to
 *
 * Cancel a charge transaction started by mem_cgroup_try_charge().
 */
void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
{
        unsigned int nr_pages;

        /*
         * Swap faults will attempt to charge the same page multiple
         * times.  But reuse_swap_page() might have removed the page
         * from swapcache already, so PageSwapCache() can't be checked;
         * a NULL @memcg is what marks such a page.
         */
        if (mem_cgroup_disabled() || !memcg)
                return;

        nr_pages = 1;
        if (PageTransHuge(page)) {
                nr_pages = 1U << compound_order(page);
                VM_BUG_ON_PAGE(!PageTransHuge(page), page);
        }

        cancel_charge(memcg, nr_pages);
}
5596
5597 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
5598                            unsigned long nr_anon, unsigned long nr_file,
5599                            unsigned long nr_huge, struct page *dummy_page)
5600 {
5601         unsigned long nr_pages = nr_anon + nr_file;
5602         unsigned long flags;
5603
5604         if (!mem_cgroup_is_root(memcg)) {
5605                 page_counter_uncharge(&memcg->memory, nr_pages);
5606                 if (do_swap_account)
5607                         page_counter_uncharge(&memcg->memsw, nr_pages);
5608                 memcg_oom_recover(memcg);
5609         }
5610
5611         local_lock_irqsave(event_lock, flags);
5612         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
5613         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
5614         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
5615         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
5616         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
5617         memcg_check_events(memcg, dummy_page);
5618         local_unlock_irqrestore(event_lock, flags);
5619
5620         if (!mem_cgroup_is_root(memcg))
5621                 css_put_many(&memcg->css, nr_pages);
5622 }
5623
5624 static void uncharge_list(struct list_head *page_list)
5625 {
5626         struct mem_cgroup *memcg = NULL;
5627         unsigned long nr_anon = 0;
5628         unsigned long nr_file = 0;
5629         unsigned long nr_huge = 0;
5630         unsigned long pgpgout = 0;
5631         struct list_head *next;
5632         struct page *page;
5633
5634         next = page_list->next;
5635         do {
5636                 unsigned int nr_pages = 1;
5637
5638                 page = list_entry(next, struct page, lru);
5639                 next = page->lru.next;
5640
5641                 VM_BUG_ON_PAGE(PageLRU(page), page);
5642                 VM_BUG_ON_PAGE(page_count(page), page);
5643
5644                 if (!page->mem_cgroup)
5645                         continue;
5646
5647                 /*
5648                  * Nobody should be changing or seriously looking at
5649                  * page->mem_cgroup at this point, we have fully
5650                  * exclusive access to the page.
5651                  */
5652
5653                 if (memcg != page->mem_cgroup) {
5654                         if (memcg) {
5655                                 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5656                                                nr_huge, page);
5657                                 pgpgout = nr_anon = nr_file = nr_huge = 0;
5658                         }
5659                         memcg = page->mem_cgroup;
5660                 }
5661
5662                 if (PageTransHuge(page)) {
5663                         nr_pages <<= compound_order(page);
5664                         VM_BUG_ON_PAGE(!PageTransHuge(page), page);
5665                         nr_huge += nr_pages;
5666                 }
5667
5668                 if (PageAnon(page))
5669                         nr_anon += nr_pages;
5670                 else
5671                         nr_file += nr_pages;
5672
5673                 page->mem_cgroup = NULL;
5674
5675                 pgpgout++;
5676         } while (next != page_list);
5677
5678         if (memcg)
5679                 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5680                                nr_huge, page);
5681 }
5682
5683 /**
5684  * mem_cgroup_uncharge - uncharge a page
5685  * @page: page to uncharge
5686  *
5687  * Uncharge a page previously charged with mem_cgroup_try_charge() and
5688  * mem_cgroup_commit_charge().
5689  */
5690 void mem_cgroup_uncharge(struct page *page)
5691 {
5692         if (mem_cgroup_disabled())
5693                 return;
5694
5695         /* Don't touch page->lru of any random page, pre-check: */
5696         if (!page->mem_cgroup)
5697                 return;
5698
5699         INIT_LIST_HEAD(&page->lru);
5700         uncharge_list(&page->lru);
5701 }
5702
/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
 */
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
        /* uncharge_list() must not be handed an empty list. */
        if (mem_cgroup_disabled() || list_empty(page_list))
                return;

        uncharge_list(page_list);
}
5718
5719 /**
5720  * mem_cgroup_migrate - migrate a charge to another page
5721  * @oldpage: currently charged page
5722  * @newpage: page to transfer the charge to
5723  * @lrucare: either or both pages might be on the LRU already
5724  *
5725  * Migrate the charge from @oldpage to @newpage.
5726  *
5727  * Both pages must be locked, @newpage->mapping must be set up.
5728  */
5729 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
5730                         bool lrucare)
5731 {
5732         struct mem_cgroup *memcg;
5733         int isolated;
5734
5735         VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
5736         VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
5737         VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
5738         VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
5739         VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
5740         VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
5741                        newpage);
5742
5743         if (mem_cgroup_disabled())
5744                 return;
5745
5746         /* Page cache replacement: new page already charged? */
5747         if (newpage->mem_cgroup)
5748                 return;
5749
5750         /*
5751          * Swapcache readahead pages can get migrated before being
5752          * charged, and migration from compaction can happen to an
5753          * uncharged page when the PFN walker finds a page that
5754          * reclaim just put back on the LRU but has not released yet.
5755          */
5756         memcg = oldpage->mem_cgroup;
5757         if (!memcg)
5758                 return;
5759
5760         if (lrucare)
5761                 lock_page_lru(oldpage, &isolated);
5762
5763         oldpage->mem_cgroup = NULL;
5764
5765         if (lrucare)
5766                 unlock_page_lru(oldpage, isolated);
5767
5768         commit_charge(newpage, memcg, lrucare);
5769 }
5770
5771 /*
5772  * subsys_initcall() for memory controller.
5773  *
5774  * Some parts like hotcpu_notifier() have to be initialized from this context
5775  * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
5776  * everything that doesn't depend on a specific mem_cgroup structure should
5777  * be initialized from here.
5778  */
5779 static int __init mem_cgroup_init(void)
5780 {
5781         int cpu, node;
5782
5783         hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5784
5785         for_each_possible_cpu(cpu)
5786                 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5787                           drain_local_stock);
5788
5789         for_each_node(node) {
5790                 struct mem_cgroup_tree_per_node *rtpn;
5791                 int zone;
5792
5793                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5794                                     node_online(node) ? node : NUMA_NO_NODE);
5795
5796                 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5797                         struct mem_cgroup_tree_per_zone *rtpz;
5798
5799                         rtpz = &rtpn->rb_tree_per_zone[zone];
5800                         rtpz->rb_root = RB_ROOT;
5801                         spin_lock_init(&rtpz->lock);
5802                 }
5803                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5804         }
5805
5806         return 0;
5807 }
5808 subsys_initcall(mem_cgroup_init);
5809
5810 #ifdef CONFIG_MEMCG_SWAP
5811 /**
5812  * mem_cgroup_swapout - transfer a memsw charge to swap
5813  * @page: page whose memsw charge to transfer
5814  * @entry: swap entry to move the charge to
5815  *
5816  * Transfer the memsw charge of @page to @entry.
5817  */
5818 void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5819 {
5820         struct mem_cgroup *memcg;
5821         unsigned short oldid;
5822         unsigned long flags;
5823
5824         VM_BUG_ON_PAGE(PageLRU(page), page);
5825         VM_BUG_ON_PAGE(page_count(page), page);
5826
5827         if (!do_swap_account)
5828                 return;
5829
5830         memcg = page->mem_cgroup;
5831
5832         /* Readahead page, never charged */
5833         if (!memcg)
5834                 return;
5835
5836         oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5837         VM_BUG_ON_PAGE(oldid, page);
5838         mem_cgroup_swap_statistics(memcg, true);
5839
5840         page->mem_cgroup = NULL;
5841
5842         if (!mem_cgroup_is_root(memcg))
5843                 page_counter_uncharge(&memcg->memory, 1);
5844
5845         local_lock_irqsave(event_lock, flags);
5846         /* Caller disabled preemption with mapping->tree_lock */
5847         mem_cgroup_charge_statistics(memcg, page, -1);
5848         memcg_check_events(memcg, page);
5849         local_unlock_irqrestore(event_lock, flags);
5850 }
5851
5852 /**
5853  * mem_cgroup_uncharge_swap - uncharge a swap entry
5854  * @entry: swap entry to uncharge
5855  *
5856  * Drop the memsw charge associated with @entry.
5857  */
5858 void mem_cgroup_uncharge_swap(swp_entry_t entry)
5859 {
5860         struct mem_cgroup *memcg;
5861         unsigned short id;
5862
5863         if (!do_swap_account)
5864                 return;
5865
5866         id = swap_cgroup_record(entry, 0);
5867         rcu_read_lock();
5868         memcg = mem_cgroup_from_id(id);
5869         if (memcg) {
5870                 if (!mem_cgroup_is_root(memcg))
5871                         page_counter_uncharge(&memcg->memsw, 1);
5872                 mem_cgroup_swap_statistics(memcg, false);
5873                 css_put(&memcg->css);
5874         }
5875         rcu_read_unlock();
5876 }
5877
5878 /* for remember boot option*/
5879 #ifdef CONFIG_MEMCG_SWAP_ENABLED
5880 static int really_do_swap_account __initdata = 1;
5881 #else
5882 static int really_do_swap_account __initdata;
5883 #endif
5884
5885 static int __init enable_swap_account(char *s)
5886 {
5887         if (!strcmp(s, "1"))
5888                 really_do_swap_account = 1;
5889         else if (!strcmp(s, "0"))
5890                 really_do_swap_account = 0;
5891         return 1;
5892 }
5893 __setup("swapaccount=", enable_swap_account);
5894
5895 static struct cftype memsw_cgroup_files[] = {
5896         {
5897                 .name = "memsw.usage_in_bytes",
5898                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5899                 .read_u64 = mem_cgroup_read_u64,
5900         },
5901         {
5902                 .name = "memsw.max_usage_in_bytes",
5903                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5904                 .write = mem_cgroup_reset,
5905                 .read_u64 = mem_cgroup_read_u64,
5906         },
5907         {
5908                 .name = "memsw.limit_in_bytes",
5909                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5910                 .write = mem_cgroup_write,
5911                 .read_u64 = mem_cgroup_read_u64,
5912         },
5913         {
5914                 .name = "memsw.failcnt",
5915                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5916                 .write = mem_cgroup_reset,
5917                 .read_u64 = mem_cgroup_read_u64,
5918         },
5919         { },    /* terminate */
5920 };
5921
5922 static int __init mem_cgroup_swap_init(void)
5923 {
5924         if (!mem_cgroup_disabled() && really_do_swap_account) {
5925                 do_swap_account = 1;
5926                 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
5927                                                   memsw_cgroup_files));
5928         }
5929         return 0;
5930 }
5931 subsys_initcall(mem_cgroup_swap_init);
5932
5933 #endif /* CONFIG_MEMCG_SWAP */