Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / mm / page_counter.c
diff --git a/kernel/mm/page_counter.c b/kernel/mm/page_counter.c
new file mode 100644 (file)
index 0000000..11b4bed
--- /dev/null
@@ -0,0 +1,193 @@
+/*
+ * Lockless hierarchical page accounting & limiting
+ *
+ * Copyright (C) 2014 Red Hat, Inc., Johannes Weiner
+ */
+
+#include <linux/page_counter.h>
+#include <linux/atomic.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/bug.h>
+#include <asm/page.h>
+
+/**
+ * page_counter_cancel - take pages out of the local counter
+ * @counter: counter to uncharge; its ancestors are not touched
+ * @nr_pages: number of pages to cancel
+ */
+void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
+{
+       long new = atomic_long_sub_return(nr_pages, &counter->count);
+
+       /* Underflow?  Reset to 0 so that later limit checks can still pass. */
+       if (WARN_ON_ONCE(new < 0))
+               atomic_long_set(&counter->count, 0);
+}
+
+/**
+ * page_counter_charge - charge pages to a counter and all its ancestors
+ * @counter: counter to start charging at
+ * @nr_pages: number of pages to charge
+ *
+ * NOTE: The charge is unconditional.  Configured limits are not looked
+ * at here, so a counter may end up above its limit.  Only the count
+ * and the watermark are updated.
+ */
+void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
+{
+       struct page_counter *level = counter;
+
+       while (level) {
+               long usage = atomic_long_add_return(nr_pages, &level->count);
+
+               /* Unlocked update: a slightly stale watermark is fine. */
+               if (usage > level->watermark)
+                       level->watermark = usage;
+
+               level = level->parent;
+       }
+}
+
+/**
+ * page_counter_try_charge - try to hierarchically charge pages
+ * @counter: counter to start charging at
+ * @nr_pages: number of pages to charge
+ * @fail: set to the first counter that hit its limit; only written on failure
+ *
+ * Returns 0 on success, or -ENOMEM and @fail if the counter or one of
+ * its ancestors has hit its configured limit.  On failure, every charge
+ * made by this call is taken back before returning.
+ */
+int page_counter_try_charge(struct page_counter *counter,
+                           unsigned long nr_pages,
+                           struct page_counter **fail)
+{
+       struct page_counter *c;
+
+       for (c = counter; c; c = c->parent) {
+               long new;
+               /*
+                * Charge speculatively to avoid an expensive CAS.  If
+                * a bigger charge fails, it might falsely lock out a
+                * racing smaller charge and send it into reclaim
+                * early, but the error is limited to the difference
+                * between the two sizes, which is less than 2M/4M in
+                * case of a THP locking out a regular page charge.
+                *
+                * The atomic_long_add_return() implies a full memory
+                * barrier between incrementing the count and reading
+                * the limit.  When racing with page_counter_limit(),
+                * we either see the new limit or the setter sees the
+                * counter has changed and retries.
+                */
+               new = atomic_long_add_return(nr_pages, &c->count);
+               if (new > c->limit) {
+                       /* Over the limit: undo the speculative charge. */
+                       atomic_long_sub(nr_pages, &c->count);
+                       /* Racy, but an inexact failcnt is acceptable. */
+                       c->failcnt++;
+                       *fail = c;
+                       goto failed;
+               }
+               /* As with failcnt, an inexact watermark is acceptable. */
+               if (new > c->watermark)
+                       c->watermark = new;
+       }
+       return 0;
+
+failed:
+       /*
+        * Uncharge the levels from @counter up to, but excluding, *fail;
+        * *fail itself already gave its charge back in the loop above.
+        */
+       for (c = counter; c != *fail; c = c->parent)
+               page_counter_cancel(c, nr_pages);
+
+       return -ENOMEM;
+}
+
+/**
+ * page_counter_uncharge - give pages back to a counter and its ancestors
+ * @counter: counter to start uncharging at
+ * @nr_pages: number of pages to uncharge
+ */
+void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages)
+{
+       while (counter) {
+               page_counter_cancel(counter, nr_pages);
+               counter = counter->parent;
+       }
+}
+
+/**
+ * page_counter_limit - limit the number of pages allowed
+ * @counter: counter whose limit is changed
+ * @limit: new limit, in pages
+ *
+ * Returns 0 on success, -EBUSY if the current number of pages on the
+ * counter already exceeds the specified limit.
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+int page_counter_limit(struct page_counter *counter, unsigned long limit)
+{
+       for (;;) {
+               unsigned long old;
+               long count;
+
+               /*
+                * Update the limit while making sure that it's not
+                * below the concurrently-changing counter value.
+                *
+                * The xchg implies two full memory barriers before
+                * and after, so the read-swap-read is ordered and
+                * ensures coherency with page_counter_try_charge():
+                * that function modifies the count before checking
+                * the limit, so if it sees the old limit, we see the
+                * modified counter and retry.
+                */
+               count = atomic_long_read(&counter->count);
+
+               if (count > limit)
+                       return -EBUSY;
+
+               old = xchg(&counter->limit, limit);
+
+               if (atomic_long_read(&counter->count) <= count)
+                       return 0;
+               /* Count rose meanwhile: put the old limit back and retry. */
+               counter->limit = old;
+               cond_resched();
+       }
+}
+
+/**
+ * page_counter_memparse - parse a size string into a page count
+ * @buf: string to parse
+ * @max: keyword that selects the maximum possible value
+ * @nr_pages: where the resulting number of pages is stored
+ *
+ * Returns 0 with @nr_pages set (capped at %PAGE_COUNTER_MAX), or -EINVAL
+ * if memparse() did not consume all of @buf; @nr_pages is then untouched.
+ */
+int page_counter_memparse(const char *buf, const char *max,
+                         unsigned long *nr_pages)
+{
+       char *tail;
+       u64 pages;
+
+       if (strcmp(buf, max) != 0) {
+               pages = memparse(buf, &tail) / PAGE_SIZE;
+               if (*tail != '\0')
+                       return -EINVAL;
+               if (pages > (u64)PAGE_COUNTER_MAX)
+                       pages = PAGE_COUNTER_MAX;
+       } else {
+               pages = PAGE_COUNTER_MAX;
+       }
+
+       *nr_pages = pages;
+       return 0;
+}