These changes are the raw update to linux-4.4.6-rt14. Kernel sources
[kvmfornfv.git] / kernel / mm / page_alloc.c
index 41bd90d..d002418 100644 (file)
@@ -62,6 +62,7 @@
 #include <linux/sched/rt.h>
 #include <linux/locallock.h>
 #include <linux/page_owner.h>
+#include <linux/kthread.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -125,6 +126,24 @@ unsigned long dirty_balance_reserve __read_mostly;
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
+/*
+ * A cached value of the page's pageblock's migratetype, used when the page is
+ * put on a pcplist. Used to avoid the pageblock migratetype lookup when
+ * freeing from pcplists in most cases, at the cost of possibly becoming stale.
+ * Also the migratetype set in the page does not necessarily match the pcplist
+ * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
+ * other index - this ensures that it will be put on the correct CMA freelist.
+ */
+static inline int get_pcppage_migratetype(struct page *page)
+{
+       return page->index;
+}
+
+static inline void set_pcppage_migratetype(struct page *page, int migratetype)
+{
+       page->index = migratetype;
+}
+
 #ifdef CONFIG_PM_SLEEP
 /*
  * The following functions are used by the suspend/hibernate code to temporarily
@@ -151,19 +170,19 @@ void pm_restrict_gfp_mask(void)
        WARN_ON(!mutex_is_locked(&pm_mutex));
        WARN_ON(saved_gfp_mask);
        saved_gfp_mask = gfp_allowed_mask;
-       gfp_allowed_mask &= ~GFP_IOFS;
+       gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
 }
 
 bool pm_suspended_storage(void)
 {
-       if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS)
+       if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                return false;
        return true;
 }
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
-int pageblock_order __read_mostly;
+unsigned int pageblock_order __read_mostly;
 #endif
 
 static void __free_pages_ok(struct page *page, unsigned int order);
@@ -206,6 +225,18 @@ static char * const zone_names[MAX_NR_ZONES] = {
         "HighMem",
 #endif
         "Movable",
+#ifdef CONFIG_ZONE_DEVICE
+        "Device",
+#endif
+};
+
+static void free_compound_page(struct page *page);
+compound_page_dtor * const compound_page_dtors[] = {
+       NULL,
+       free_compound_page,
+#ifdef CONFIG_HUGETLB_PAGE
+       free_huge_page,
+#endif
 };
 
 int min_free_kbytes = 1024;
@@ -248,6 +279,75 @@ static DEFINE_LOCAL_IRQ_LOCK(pa_lock);
 
 int page_group_by_mobility_disabled __read_mostly;
 
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static inline void reset_deferred_meminit(pg_data_t *pgdat)
+{
+       pgdat->first_deferred_pfn = ULONG_MAX;
+}
+
+/* Returns true if the struct page for the pfn is uninitialised */
+static inline bool __meminit early_page_uninitialised(unsigned long pfn)
+{
+       if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn)
+               return true;
+
+       return false;
+}
+
+static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
+{
+       if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
+               return true;
+
+       return false;
+}
+
+/*
+ * Returns false when the remaining initialisation should be deferred until
+ * later in the boot cycle when it can be parallelised.
+ */
+static inline bool update_defer_init(pg_data_t *pgdat,
+                               unsigned long pfn, unsigned long zone_end,
+                               unsigned long *nr_initialised)
+{
+       /* Always populate low zones for address-constrained allocations */
+       if (zone_end < pgdat_end_pfn(pgdat))
+               return true;
+
+       /* Initialise at least 2G of the highest zone */
+       (*nr_initialised)++;
+       if (*nr_initialised > (2UL << (30 - PAGE_SHIFT)) &&
+           (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+               pgdat->first_deferred_pfn = pfn;
+               return false;
+       }
+
+       return true;
+}
+#else
+static inline void reset_deferred_meminit(pg_data_t *pgdat)
+{
+}
+
+static inline bool early_page_uninitialised(unsigned long pfn)
+{
+       return false;
+}
+
+static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
+{
+       return false;
+}
+
+static inline bool update_defer_init(pg_data_t *pgdat,
+                               unsigned long pfn, unsigned long zone_end,
+                               unsigned long *nr_initialised)
+{
+       return true;
+}
+#endif
+
+
 void set_pageblock_migratetype(struct page *page, int migratetype)
 {
        if (unlikely(page_group_by_mobility_disabled &&
@@ -358,15 +458,15 @@ out:
 /*
  * Higher-order pages are called "compound pages".  They are structured thusly:
  *
- * The first PAGE_SIZE page is called the "head page".
+ * The first PAGE_SIZE page is called the "head page" and has PG_head set.
  *
- * The remaining PAGE_SIZE pages are called "tail pages".
+ * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
+ * in bit 0 of page->compound_head. The rest of the bits point to the head page.
  *
- * All pages have PG_compound set.  All tail pages have their ->first_page
- * pointing at the head page.
+ * The first tail page's ->compound_dtor holds the offset in array of compound
+ * page destructors. See compound_page_dtors.
  *
- * The first tail page's ->lru.next holds the address of the compound page's
- * put_page() function.  Its ->lru.prev holds the order of allocation.
+ * The first tail page's ->compound_order holds the order of allocation.
  * This usage means that zero-order pages may not be compound.
  */
 
@@ -375,38 +475,21 @@ static void free_compound_page(struct page *page)
        __free_pages_ok(page, compound_order(page));
 }
 
-void prep_compound_page(struct page *page, unsigned long order)
+void prep_compound_page(struct page *page, unsigned int order)
 {
        int i;
        int nr_pages = 1 << order;
 
-       set_compound_page_dtor(page, free_compound_page);
+       set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
        set_compound_order(page, order);
        __SetPageHead(page);
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
                set_page_count(p, 0);
-               p->first_page = page;
-               /* Make sure p->first_page is always valid for PageTail() */
-               smp_wmb();
-               __SetPageTail(p);
+               set_compound_head(p, page);
        }
 }
 
-static inline void prep_zero_page(struct page *page, unsigned int order,
-                                                       gfp_t gfp_flags)
-{
-       int i;
-
-       /*
-        * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
-        * and __GFP_HIGHMEM from hard or soft interrupt context.
-        */
-       VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
-       for (i = 0; i < (1 << order); i++)
-               clear_highpage(page + i);
-}
-
 #ifdef CONFIG_DEBUG_PAGEALLOC
 unsigned int _debug_guardpage_minorder;
 bool _debug_pagealloc_enabled __read_mostly;
@@ -592,7 +675,7 @@ static inline void __free_one_page(struct page *page,
        unsigned long combined_idx;
        unsigned long uninitialized_var(buddy_idx);
        struct page *buddy;
-       int max_order = MAX_ORDER;
+       unsigned int max_order = MAX_ORDER;
 
        VM_BUG_ON(!zone_is_initialized(zone));
        VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
@@ -605,7 +688,7 @@ static inline void __free_one_page(struct page *page,
                 * pageblock. Without this, pageblock isolation
                 * could cause incorrect freepage accounting.
                 */
-               max_order = min(MAX_ORDER, pageblock_order + 1);
+               max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
        } else {
                __mod_zone_freepage_state(zone, 1 << order, migratetype);
        }
@@ -724,11 +807,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                /* must delete as __free_one_page list manipulates */
                list_del(&page->lru);
 
-               mt = get_freepage_migratetype(page);
+               mt = get_pcppage_migratetype(page);
+               /* MIGRATE_ISOLATE page should not go to pcplists */
+               VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
+               /* Pageblock could have been isolated meanwhile */
                if (unlikely(has_isolate_pageblock(zone)))
                        mt = get_pageblock_migratetype(page);
 
-               /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
                __free_one_page(page, page_to_pfn(page), zone, 0, mt);
                trace_mm_page_pcpu_drain(page, 0, mt);
                to_free--;
@@ -775,6 +860,7 @@ static void isolate_pcp_pages(int to_free, struct per_cpu_pages *src,
                do {
                        page = list_last_entry(list, struct page, lru);
                        list_del(&page->lru);
+
                        list_add(&page->lru, dst);
                } while (--to_free && --batch_free && !list_empty(list));
        }
@@ -803,17 +889,103 @@ static void free_one_page(struct zone *zone,
 
 static int free_tail_pages_check(struct page *head_page, struct page *page)
 {
-       if (!IS_ENABLED(CONFIG_DEBUG_VM))
-               return 0;
+       int ret = 1;
+
+       /*
+        * We rely page->lru.next never has bit 0 set, unless the page
+        * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+        */
+       BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+       if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+               ret = 0;
+               goto out;
+       }
        if (unlikely(!PageTail(page))) {
                bad_page(page, "PageTail not set", 0);
-               return 1;
+               goto out;
        }
-       if (unlikely(page->first_page != head_page)) {
-               bad_page(page, "first_page not consistent", 0);
-               return 1;
+       if (unlikely(compound_head(page) != head_page)) {
+               bad_page(page, "compound_head not consistent", 0);
+               goto out;
+       }
+       ret = 0;
+out:
+       clear_compound_head(page);
+       return ret;
+}
+
+static void __meminit __init_single_page(struct page *page, unsigned long pfn,
+                               unsigned long zone, int nid)
+{
+       set_page_links(page, zone, nid, pfn);
+       init_page_count(page);
+       page_mapcount_reset(page);
+       page_cpupid_reset_last(page);
+
+       INIT_LIST_HEAD(&page->lru);
+#ifdef WANT_PAGE_VIRTUAL
+       /* The shift won't overflow because ZONE_NORMAL is below 4G. */
+       if (!is_highmem_idx(zone))
+               set_page_address(page, __va(pfn << PAGE_SHIFT));
+#endif
+}
+
+static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
+                                       int nid)
+{
+       return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void init_reserved_page(unsigned long pfn)
+{
+       pg_data_t *pgdat;
+       int nid, zid;
+
+       if (!early_page_uninitialised(pfn))
+               return;
+
+       nid = early_pfn_to_nid(pfn);
+       pgdat = NODE_DATA(nid);
+
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+               struct zone *zone = &pgdat->node_zones[zid];
+
+               if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
+                       break;
+       }
+       __init_single_pfn(pfn, zid, nid);
+}
+#else
+static inline void init_reserved_page(unsigned long pfn)
+{
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+/*
+ * Initialised pages do not have PageReserved set. This function is
+ * called for each range allocated by the bootmem allocator and
+ * marks the pages PageReserved. The remaining valid pages are later
+ * sent to the buddy page allocator.
+ */
+void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
+{
+       unsigned long start_pfn = PFN_DOWN(start);
+       unsigned long end_pfn = PFN_UP(end);
+
+       for (; start_pfn < end_pfn; start_pfn++) {
+               if (pfn_valid(start_pfn)) {
+                       struct page *page = pfn_to_page(start_pfn);
+
+                       init_reserved_page(start_pfn);
+
+                       /* Avoid false-positive PageTail() */
+                       INIT_LIST_HEAD(&page->lru);
+
+                       SetPageReserved(page);
+               }
        }
-       return 0;
 }
 
 static bool free_pages_prepare(struct page *page, unsigned int order)
@@ -865,12 +1037,12 @@ static void __free_pages_ok(struct page *page, unsigned int order)
        migratetype = get_pfnblock_migratetype(page, pfn);
        local_lock_irqsave(pa_lock, flags);
        __count_vm_events(PGFREE, 1 << order);
-       set_freepage_migratetype(page, migratetype);
        free_one_page(page_zone(page), page, pfn, order, migratetype);
        local_unlock_irqrestore(pa_lock, flags);
 }
 
-void __init __free_pages_bootmem(struct page *page, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page,
+                                       unsigned long pfn, unsigned int order)
 {
        unsigned int nr_pages = 1 << order;
        struct page *p = page;
@@ -890,6 +1062,235 @@ void __init __free_pages_bootmem(struct page *page, unsigned int order)
        __free_pages(page, order);
 }
 
+#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
+       defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
+
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+
+int __meminit early_pfn_to_nid(unsigned long pfn)
+{
+       static DEFINE_SPINLOCK(early_pfn_lock);
+       int nid;
+
+       spin_lock(&early_pfn_lock);
+       nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
+       if (nid < 0)
+               nid = 0;
+       spin_unlock(&early_pfn_lock);
+
+       return nid;
+}
+#endif
+
+#ifdef CONFIG_NODES_SPAN_OTHER_NODES
+static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+                                       struct mminit_pfnnid_cache *state)
+{
+       int nid;
+
+       nid = __early_pfn_to_nid(pfn, state);
+       if (nid >= 0 && nid != node)
+               return false;
+       return true;
+}
+
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+       return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
+}
+
+#else
+
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+{
+       return true;
+}
+static inline bool __meminit meminit_pfn_in_nid(unsigned long pfn, int node,
+                                       struct mminit_pfnnid_cache *state)
+{
+       return true;
+}
+#endif
+
+
+void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
+                                                       unsigned int order)
+{
+       if (early_page_uninitialised(pfn))
+               return;
+       return __free_pages_boot_core(page, pfn, order);
+}
+
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+static void __init deferred_free_range(struct page *page,
+                                       unsigned long pfn, int nr_pages)
+{
+       int i;
+
+       if (!page)
+               return;
+
+       /* Free a large naturally-aligned chunk if possible */
+       if (nr_pages == MAX_ORDER_NR_PAGES &&
+           (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
+               set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+               __free_pages_boot_core(page, pfn, MAX_ORDER-1);
+               return;
+       }
+
+       for (i = 0; i < nr_pages; i++, page++, pfn++)
+               __free_pages_boot_core(page, pfn, 0);
+}
+
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+       if (atomic_dec_and_test(&pgdat_init_n_undone))
+               complete(&pgdat_init_all_done_comp);
+}
+
+/* Initialise remaining memory on a node */
+static int __init deferred_init_memmap(void *data)
+{
+       pg_data_t *pgdat = data;
+       int nid = pgdat->node_id;
+       struct mminit_pfnnid_cache nid_init_state = { };
+       unsigned long start = jiffies;
+       unsigned long nr_pages = 0;
+       unsigned long walk_start, walk_end;
+       int i, zid;
+       struct zone *zone;
+       unsigned long first_init_pfn = pgdat->first_deferred_pfn;
+       const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+       if (first_init_pfn == ULONG_MAX) {
+               pgdat_init_report_one_done();
+               return 0;
+       }
+
+       /* Bind memory initialisation thread to a local node if possible */
+       if (!cpumask_empty(cpumask))
+               set_cpus_allowed_ptr(current, cpumask);
+
+       /* Sanity check boundaries */
+       BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
+       BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+       pgdat->first_deferred_pfn = ULONG_MAX;
+
+       /* Only the highest zone is deferred so find it */
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+               zone = pgdat->node_zones + zid;
+               if (first_init_pfn < zone_end_pfn(zone))
+                       break;
+       }
+
+       for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
+               unsigned long pfn, end_pfn;
+               struct page *page = NULL;
+               struct page *free_base_page = NULL;
+               unsigned long free_base_pfn = 0;
+               int nr_to_free = 0;
+
+               end_pfn = min(walk_end, zone_end_pfn(zone));
+               pfn = first_init_pfn;
+               if (pfn < walk_start)
+                       pfn = walk_start;
+               if (pfn < zone->zone_start_pfn)
+                       pfn = zone->zone_start_pfn;
+
+               for (; pfn < end_pfn; pfn++) {
+                       if (!pfn_valid_within(pfn))
+                               goto free_range;
+
+                       /*
+                        * Ensure pfn_valid is checked every
+                        * MAX_ORDER_NR_PAGES for memory holes
+                        */
+                       if ((pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
+                               if (!pfn_valid(pfn)) {
+                                       page = NULL;
+                                       goto free_range;
+                               }
+                       }
+
+                       if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
+                               page = NULL;
+                               goto free_range;
+                       }
+
+                       /* Minimise pfn page lookups and scheduler checks */
+                       if (page && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) {
+                               page++;
+                       } else {
+                               nr_pages += nr_to_free;
+                               deferred_free_range(free_base_page,
+                                               free_base_pfn, nr_to_free);
+                               free_base_page = NULL;
+                               free_base_pfn = nr_to_free = 0;
+
+                               page = pfn_to_page(pfn);
+                               cond_resched();
+                       }
+
+                       if (page->flags) {
+                               VM_BUG_ON(page_zone(page) != zone);
+                               goto free_range;
+                       }
+
+                       __init_single_page(page, pfn, zid, nid);
+                       if (!free_base_page) {
+                               free_base_page = page;
+                               free_base_pfn = pfn;
+                               nr_to_free = 0;
+                       }
+                       nr_to_free++;
+
+                       /* Where possible, batch up pages for a single free */
+                       continue;
+free_range:
+                       /* Free the current block of pages to allocator */
+                       nr_pages += nr_to_free;
+                       deferred_free_range(free_base_page, free_base_pfn,
+                                                               nr_to_free);
+                       free_base_page = NULL;
+                       free_base_pfn = nr_to_free = 0;
+               }
+
+               first_init_pfn = max(end_pfn, first_init_pfn);
+       }
+
+       /* Sanity check that the next zone really is unpopulated */
+       WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
+
+       pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
+                                       jiffies_to_msecs(jiffies - start));
+
+       pgdat_init_report_one_done();
+       return 0;
+}
+
+void __init page_alloc_init_late(void)
+{
+       int nid;
+
+       /* There will be num_node_state(N_MEMORY) threads */
+       atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
+       for_each_node_state(nid, N_MEMORY) {
+               kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
+       }
+
+       /* Block until all are initialised */
+       wait_for_completion(&pgdat_init_all_done_comp);
+
+       /* Reinit limits that are based on free pages after the kernel is up */
+       files_maxfiles_init();
+}
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
 #ifdef CONFIG_CMA
 /* Free whole pageblock and set its migration type to MIGRATE_CMA. */
 void __init init_cma_reserved_pageblock(struct page *page)
@@ -979,6 +1380,10 @@ static inline int check_new_page(struct page *page)
                bad_reason = "non-NULL mapping";
        if (unlikely(atomic_read(&page->_count) != 0))
                bad_reason = "nonzero _count";
+       if (unlikely(page->flags & __PG_HWPOISON)) {
+               bad_reason = "HWPoisoned (hardware-corrupted)";
+               bad_flags = __PG_HWPOISON;
+       }
        if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
                bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
                bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
@@ -1013,7 +1418,8 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
        kasan_alloc_pages(page, order);
 
        if (gfp_flags & __GFP_ZERO)
-               prep_zero_page(page, order, gfp_flags);
+               for (i = 0; i < (1 << order); i++)
+                       clear_highpage(page + i);
 
        if (order && (gfp_flags & __GFP_COMP))
                prep_compound_page(page, order);
@@ -1058,7 +1464,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
                rmv_page_order(page);
                area->nr_free--;
                expand(zone, page, order, current_order, area, migratetype);
-               set_freepage_migratetype(page, migratetype);
+               set_pcppage_migratetype(page, migratetype);
                return page;
        }
 
@@ -1071,15 +1477,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
  * the free lists for the desirable migrate type are depleted
  */
 static int fallbacks[MIGRATE_TYPES][4] = {
-       [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
-       [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
-       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
+       [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
+       [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
+       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
 #ifdef CONFIG_CMA
-       [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
+       [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
 #endif
-       [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
 #ifdef CONFIG_MEMORY_ISOLATION
-       [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
+       [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
 #endif
 };
 
@@ -1104,7 +1509,7 @@ int move_freepages(struct zone *zone,
                          int migratetype)
 {
        struct page *page;
-       unsigned long order;
+       unsigned int order;
        int pages_moved = 0;
 
 #ifndef CONFIG_HOLES_IN_ZONE
@@ -1135,7 +1540,6 @@ int move_freepages(struct zone *zone,
                order = page_order(page);
                list_move(&page->lru,
                          &zone->free_area[order].free_list[migratetype]);
-               set_freepage_migratetype(page, migratetype);
                page += 1 << order;
                pages_moved += 1 << order;
        }
@@ -1218,7 +1622,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 static void steal_suitable_fallback(struct zone *zone, struct page *page,
                                                          int start_type)
 {
-       int current_order = page_order(page);
+       unsigned int current_order = page_order(page);
        int pages;
 
        /* Take ownership for orders >= pageblock_order */
@@ -1253,7 +1657,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
        *can_steal = false;
        for (i = 0;; i++) {
                fallback_mt = fallbacks[migratetype][i];
-               if (fallback_mt == MIGRATE_RESERVE)
+               if (fallback_mt == MIGRATE_TYPES)
                        break;
 
                if (list_empty(&area->free_list[fallback_mt]))
@@ -1272,6 +1676,101 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
        return -1;
 }
 
+/*
+ * Reserve a pageblock for exclusive use of high-order atomic allocations if
+ * there are no empty page blocks that contain a page with a suitable order
+ */
+static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
+                               unsigned int alloc_order)
+{
+       int mt;
+       unsigned long max_managed, flags;
+
+       /*
+        * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
+        * Check is race-prone but harmless.
+        */
+       max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+       if (zone->nr_reserved_highatomic >= max_managed)
+               return;
+
+       spin_lock_irqsave(&zone->lock, flags);
+
+       /* Recheck the nr_reserved_highatomic limit under the lock */
+       if (zone->nr_reserved_highatomic >= max_managed)
+               goto out_unlock;
+
+       /* Yoink! */
+       mt = get_pageblock_migratetype(page);
+       if (mt != MIGRATE_HIGHATOMIC &&
+                       !is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+               zone->nr_reserved_highatomic += pageblock_nr_pages;
+               set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
+               move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+       }
+
+out_unlock:
+       spin_unlock_irqrestore(&zone->lock, flags);
+}
+
+/*
+ * Used when an allocation is about to fail under memory pressure. This
+ * potentially hurts the reliability of high-order allocations when under
+ * intense memory pressure but failed atomic allocations should be easier
+ * to recover from than an OOM.
+ */
+static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
+{
+       struct zonelist *zonelist = ac->zonelist;
+       unsigned long flags;
+       struct zoneref *z;
+       struct zone *zone;
+       struct page *page;
+       int order;
+
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+                                                               ac->nodemask) {
+               /* Preserve at least one pageblock */
+               if (zone->nr_reserved_highatomic <= pageblock_nr_pages)
+                       continue;
+
+               spin_lock_irqsave(&zone->lock, flags);
+               for (order = 0; order < MAX_ORDER; order++) {
+                       struct free_area *area = &(zone->free_area[order]);
+
+                       if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+                               continue;
+
+                       page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next,
+                                               struct page, lru);
+
+                       /*
+                        * It should never happen but changes to locking could
+                        * inadvertently allow a per-cpu drain to add pages
+                        * to MIGRATE_HIGHATOMIC while unreserving so be safe
+                        * and watch for underflows.
+                        */
+                       zone->nr_reserved_highatomic -= min(pageblock_nr_pages,
+                               zone->nr_reserved_highatomic);
+
+                       /*
+                        * Convert to ac->migratetype and avoid the normal
+                        * pageblock stealing heuristics. Minimally, the caller
+                        * is doing the work and needs the pages. More
+                        * importantly, if the block was always converted to
+                        * MIGRATE_UNMOVABLE or another type then the number
+                        * of pageblocks that cannot be completely freed
+                        * may increase.
+                        */
+                       set_pageblock_migratetype(page, ac->migratetype);
+                       move_freepages_block(zone, page, ac->migratetype);
+                       spin_unlock_irqrestore(&zone->lock, flags);
+                       return;
+               }
+               spin_unlock_irqrestore(&zone->lock, flags);
+       }
+}
+
 /* Remove an element from the buddy allocator from the fallback list */
 static inline struct page *
 __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
@@ -1305,14 +1804,13 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
                expand(zone, page, order, current_order, area,
                                        start_migratetype);
                /*
-                * The freepage_migratetype may differ from pageblock's
+                * The pcppage_migratetype may differ from pageblock's
                 * migratetype depending on the decisions in
-                * try_to_steal_freepages(). This is OK as long as it
-                * does not differ for MIGRATE_CMA pageblocks. For CMA
-                * we need to make sure unallocated pages flushed from
-                * pcp lists are returned to the correct freelist.
+                * find_suitable_fallback(). This is OK as long as it does not
+                * differ for MIGRATE_CMA pageblocks. Those can be used as
+                * fallback only via special __rmqueue_cma_fallback() function
                 */
-               set_freepage_migratetype(page, start_migratetype);
+               set_pcppage_migratetype(page, start_migratetype);
 
                trace_mm_page_alloc_extfrag(page, order, current_order,
                        start_migratetype, fallback_mt);
@@ -1328,29 +1826,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
  * Call me with the zone->lock already held.
  */
 static struct page *__rmqueue(struct zone *zone, unsigned int order,
-                                               int migratetype)
+                               int migratetype, gfp_t gfp_flags)
 {
        struct page *page;
 
-retry_reserve:
        page = __rmqueue_smallest(zone, order, migratetype);
-
-       if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
+       if (unlikely(!page)) {
                if (migratetype == MIGRATE_MOVABLE)
                        page = __rmqueue_cma_fallback(zone, order);
 
                if (!page)
                        page = __rmqueue_fallback(zone, order, migratetype);
-
-               /*
-                * Use MIGRATE_RESERVE rather than fail an allocation. goto
-                * is used because __rmqueue_smallest is an inline function
-                * and we want just one call site
-                */
-               if (!page) {
-                       migratetype = MIGRATE_RESERVE;
-                       goto retry_reserve;
-               }
        }
 
        trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@ -1370,7 +1856,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 
        spin_lock(&zone->lock);
        for (i = 0; i < count; ++i) {
-               struct page *page = __rmqueue(zone, order, migratetype);
+               struct page *page = __rmqueue(zone, order, migratetype, 0);
                if (unlikely(page == NULL))
                        break;
 
@@ -1388,7 +1874,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
                else
                        list_add_tail(&page->lru, list);
                list = &page->lru;
-               if (is_migrate_cma(get_freepage_migratetype(page)))
+               if (is_migrate_cma(get_pcppage_migratetype(page)))
                        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
                                              -(1 << order));
        }
@@ -1601,7 +2087,7 @@ void free_hot_cold_page(struct page *page, bool cold)
                return;
 
        migratetype = get_pfnblock_migratetype(page, pfn);
-       set_freepage_migratetype(page, migratetype);
+       set_pcppage_migratetype(page, migratetype);
        local_lock_irqsave(pa_lock, flags);
        __count_vm_event(PGFREE);
 
@@ -1665,6 +2151,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
 void split_page(struct page *page, unsigned int order)
 {
        int i;
+       gfp_t gfp_mask;
 
        VM_BUG_ON_PAGE(PageCompound(page), page);
        VM_BUG_ON_PAGE(!page_count(page), page);
@@ -1678,10 +2165,11 @@ void split_page(struct page *page, unsigned int order)
                split_page(virt_to_page(page[0].shadow), order);
 #endif
 
-       set_page_owner(page, 0, 0);
+       gfp_mask = get_page_owner_gfp(page);
+       set_page_owner(page, 0, gfp_mask);
        for (i = 1; i < (1 << order); i++) {
                set_page_refcounted(page + i);
-               set_page_owner(page + i, 0, 0);
+               set_page_owner(page + i, 0, gfp_mask);
        }
 }
 EXPORT_SYMBOL_GPL(split_page);
@@ -1711,6 +2199,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
        zone->free_area[order].nr_free--;
        rmv_page_order(page);
 
+       set_page_owner(page, order, __GFP_MOVABLE);
+
        /* Set the pageblock if the isolated page is at least a pageblock */
        if (order >= pageblock_order - 1) {
                struct page *endpage = page + (1 << order) - 1;
@@ -1722,7 +2212,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
                }
        }
 
-       set_page_owner(page, order, 0);
+
        return 1UL << order;
 }
 
@@ -1759,7 +2249,7 @@ int split_free_page(struct page *page)
 static inline
 struct page *buffered_rmqueue(struct zone *preferred_zone,
                        struct zone *zone, unsigned int order,
-                       gfp_t gfp_flags, int migratetype)
+                       gfp_t gfp_flags, int alloc_flags, int migratetype)
 {
        unsigned long flags;
        struct page *page;
@@ -1802,13 +2292,21 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                        WARN_ON_ONCE(order > 1);
                }
                local_spin_lock_irqsave(pa_lock, &zone->lock, flags);
-               page = __rmqueue(zone, order, migratetype);
+
+               page = NULL;
+               if (alloc_flags & ALLOC_HARDER) {
+                       page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+                       if (page)
+                               trace_mm_page_alloc_zone_locked(page, order, migratetype);
+               }
+               if (!page)
+                       page = __rmqueue(zone, order, migratetype, gfp_flags);
                if (!page) {
                        spin_unlock(&zone->lock);
                        goto failed;
                }
                __mod_zone_freepage_state(zone, -(1 << order),
-                                         get_freepage_migratetype(page));
+                                         get_pcppage_migratetype(page));
                spin_unlock(&zone->lock);
        }
 
@@ -1834,13 +2332,13 @@ failed:
 static struct {
        struct fault_attr attr;
 
-       u32 ignore_gfp_highmem;
-       u32 ignore_gfp_wait;
+       bool ignore_gfp_highmem;
+       bool ignore_gfp_reclaim;
        u32 min_order;
 } fail_page_alloc = {
        .attr = FAULT_ATTR_INITIALIZER,
-       .ignore_gfp_wait = 1,
-       .ignore_gfp_highmem = 1,
+       .ignore_gfp_reclaim = true,
+       .ignore_gfp_highmem = true,
        .min_order = 1,
 };
 
@@ -1858,7 +2356,8 @@ static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
                return false;
        if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
                return false;
-       if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+       if (fail_page_alloc.ignore_gfp_reclaim &&
+                       (gfp_mask & __GFP_DIRECT_RECLAIM))
                return false;
 
        return should_fail(&fail_page_alloc.attr, 1 << order);
@@ -1877,7 +2376,7 @@ static int __init fail_page_alloc_debugfs(void)
                return PTR_ERR(dir);
 
        if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
-                               &fail_page_alloc.ignore_gfp_wait))
+                               &fail_page_alloc.ignore_gfp_reclaim))
                goto fail;
        if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
                                &fail_page_alloc.ignore_gfp_highmem))
@@ -1907,180 +2406,99 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
 
 /*
- * Return true if free pages are above 'mark'. This takes into account the order
- * of the allocation.
+ * Return true if free base pages are above 'mark'. For high-order checks it
+ * will return true if the order-0 watermark is reached and there is at least
+ * one free page of a suitable size. Checking now avoids taking the zone lock
+ * to check in the allocation paths if no pages are free.
  */
 static bool __zone_watermark_ok(struct zone *z, unsigned int order,
                        unsigned long mark, int classzone_idx, int alloc_flags,
                        long free_pages)
 {
-       /* free_pages may go negative - that's OK */
        long min = mark;
        int o;
-       long free_cma = 0;
+       const int alloc_harder = (alloc_flags & ALLOC_HARDER);
 
+       /* free_pages may go negative - that's OK */
        free_pages -= (1 << order) - 1;
+
        if (alloc_flags & ALLOC_HIGH)
                min -= min / 2;
-       if (alloc_flags & ALLOC_HARDER)
+
+       /*
+        * If the caller does not have rights to ALLOC_HARDER then subtract
+        * the high-atomic reserves. This will over-estimate the size of the
+        * atomic reserve but it avoids a search.
+        */
+       if (likely(!alloc_harder))
+               free_pages -= z->nr_reserved_highatomic;
+       else
                min -= min / 4;
+
 #ifdef CONFIG_CMA
        /* If allocation can't use CMA areas don't use free CMA pages */
        if (!(alloc_flags & ALLOC_CMA))
-               free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
+               free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
 #endif
 
-       if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
+       /*
+        * Check watermarks for an order-0 allocation request. If these
+        * are not met, then a high-order request also cannot go ahead
+        * even if a suitable page happened to be free.
+        */
+       if (free_pages <= min + z->lowmem_reserve[classzone_idx])
                return false;
-       for (o = 0; o < order; o++) {
-               /* At the next order, this order's pages become unavailable */
-               free_pages -= z->free_area[o].nr_free << o;
-
-               /* Require fewer higher order pages to be free */
-               min >>= 1;
 
-               if (free_pages <= min)
-                       return false;
-       }
-       return true;
-}
-
-bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
-                     int classzone_idx, int alloc_flags)
-{
-       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
-                                       zone_page_state(z, NR_FREE_PAGES));
-}
+       /* If this is an order-0 request then the watermark is fine */
+       if (!order)
+               return true;
 
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
-                       unsigned long mark, int classzone_idx, int alloc_flags)
-{
-       long free_pages = zone_page_state(z, NR_FREE_PAGES);
+       /* For a high-order request, check at least one suitable page is free */
+       for (o = order; o < MAX_ORDER; o++) {
+               struct free_area *area = &z->free_area[o];
+               int mt;
 
-       if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
-               free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
-       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
-                                                               free_pages);
-}
+               if (!area->nr_free)
+                       continue;
 
-#ifdef CONFIG_NUMA
-/*
- * zlc_setup - Setup for "zonelist cache".  Uses cached zone data to
- * skip over zones that are not allowed by the cpuset, or that have
- * been recently (in last second) found to be nearly full.  See further
- * comments in mmzone.h.  Reduces cache footprint of zonelist scans
- * that have to skip over a lot of full or unallowed zones.
- *
- * If the zonelist cache is present in the passed zonelist, then
- * returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_states[N_MEMORY].)
- *
- * If the zonelist cache is not available for this zonelist, does
- * nothing and returns NULL.
- *
- * If the fullzones BITMAP in the zonelist cache is stale (more than
- * a second since last zap'd) then we zap it out (clear its bits.)
- *
- * We hold off even calling zlc_setup, until after we've checked the
- * first zone in the zonelist, on the theory that most allocations will
- * be satisfied from that first zone, so best to examine that zone as
- * quickly as we can.
- */
-static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
-{
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       nodemask_t *allowednodes;       /* zonelist_cache approximation */
+               if (alloc_harder)
+                       return true;
 
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return NULL;
+               for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
+                       if (!list_empty(&area->free_list[mt]))
+                               return true;
+               }
 
-       if (time_after(jiffies, zlc->last_full_zap + HZ)) {
-               bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-               zlc->last_full_zap = jiffies;
+#ifdef CONFIG_CMA
+               if ((alloc_flags & ALLOC_CMA) &&
+                   !list_empty(&area->free_list[MIGRATE_CMA])) {
+                       return true;
+               }
+#endif
        }
-
-       allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
-                                       &cpuset_current_mems_allowed :
-                                       &node_states[N_MEMORY];
-       return allowednodes;
-}
-
-/*
- * Given 'z' scanning a zonelist, run a couple of quick checks to see
- * if it is worth looking at further for free memory:
- *  1) Check that the zone isn't thought to be full (doesn't have its
- *     bit set in the zonelist_cache fullzones BITMAP).
- *  2) Check that the zones node (obtained from the zonelist_cache
- *     z_to_n[] mapping) is allowed in the passed in allowednodes mask.
- * Return true (non-zero) if zone is worth looking at further, or
- * else return false (zero) if it is not.
- *
- * This check -ignores- the distinction between various watermarks,
- * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ...  If a zone is
- * found to be full for any variation of these watermarks, it will
- * be considered full for up to one second by all requests, unless
- * we are so low on memory on all allowed nodes that we are forced
- * into the second scan of the zonelist.
- *
- * In the second scan we ignore this zonelist cache and exactly
- * apply the watermarks to all zones, even it is slower to do so.
- * We are low on memory in the second scan, and should leave no stone
- * unturned looking for a free page.
- */
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
-                                               nodemask_t *allowednodes)
-{
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       int i;                          /* index of *z in zonelist zones */
-       int n;                          /* node that zone *z is on */
-
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return 1;
-
-       i = z - zonelist->_zonerefs;
-       n = zlc->z_to_n[i];
-
-       /* This zone is worth trying if it is allowed but not full */
-       return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+       return false;
 }
 
-/*
- * Given 'z' scanning a zonelist, set the corresponding bit in
- * zlc->fullzones, so that subsequent attempts to allocate a page
- * from that zone don't waste time re-examining it.
- */
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
+bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+                     int classzone_idx, int alloc_flags)
 {
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
-       int i;                          /* index of *z in zonelist zones */
-
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return;
-
-       i = z - zonelist->_zonerefs;
-
-       set_bit(i, zlc->fullzones);
+       return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+                                       zone_page_state(z, NR_FREE_PAGES));
 }
 
-/*
- * clear all zones full, called after direct reclaim makes progress so that
- * a zone that was recently full is not skipped over for up to a second
- */
-static void zlc_clear_zones_full(struct zonelist *zonelist)
+bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
+                       unsigned long mark, int classzone_idx)
 {
-       struct zonelist_cache *zlc;     /* cached zonelist speedup info */
+       long free_pages = zone_page_state(z, NR_FREE_PAGES);
 
-       zlc = zonelist->zlcache_ptr;
-       if (!zlc)
-               return;
+       if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
+               free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
-       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+       return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
+                                                               free_pages);
 }
 
+#ifdef CONFIG_NUMA
 static bool zone_local(struct zone *local_zone, struct zone *zone)
 {
        return local_zone->node == zone->node;
@@ -2091,28 +2509,7 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <
                                RECLAIM_DISTANCE;
 }
-
 #else  /* CONFIG_NUMA */
-
-static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
-{
-       return NULL;
-}
-
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
-                               nodemask_t *allowednodes)
-{
-       return 1;
-}
-
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
-{
-}
-
-static void zlc_clear_zones_full(struct zonelist *zonelist)
-{
-}
-
 static bool zone_local(struct zone *local_zone, struct zone *zone)
 {
        return true;
@@ -2122,7 +2519,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
 {
        return true;
 }
-
 #endif /* CONFIG_NUMA */
 
 static void reset_alloc_batches(struct zone *preferred_zone)
@@ -2149,11 +2545,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
        struct zoneref *z;
        struct page *page = NULL;
        struct zone *zone;
-       nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
-       int zlc_active = 0;             /* set if using zonelist_cache */
-       int did_zlc_setup = 0;          /* just call zlc_setup() one time */
-       bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) &&
-                               (gfp_mask & __GFP_WRITE);
        int nr_fair_skipped = 0;
        bool zonelist_rescan;
 
@@ -2168,9 +2559,6 @@ zonelist_scan:
                                                                ac->nodemask) {
                unsigned long mark;
 
-               if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
-                       !zlc_zone_worth_trying(zonelist, z, allowednodes))
-                               continue;
                if (cpusets_enabled() &&
                        (alloc_flags & ALLOC_CPUSET) &&
                        !cpuset_zone_allowed(zone, gfp_mask))
@@ -2208,14 +2596,14 @@ zonelist_scan:
                 *
                 * XXX: For now, allow allocations to potentially
                 * exceed the per-zone dirty limit in the slowpath
-                * (ALLOC_WMARK_LOW unset) before going into reclaim,
+                * (spread_dirty_pages unset) before going into reclaim,
                 * which is important when on a NUMA setup the allowed
                 * zones are together not big enough to reach the
                 * global limit.  The proper fix for these situations
                 * will require awareness of zones in the
                 * dirty-throttling and the flusher threads.
                 */
-               if (consider_zone_dirty && !zone_dirty_ok(zone))
+               if (ac->spread_dirty_pages && !zone_dirty_ok(zone))
                        continue;
 
                mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
@@ -2228,28 +2616,8 @@ zonelist_scan:
                        if (alloc_flags & ALLOC_NO_WATERMARKS)
                                goto try_this_zone;
 
-                       if (IS_ENABLED(CONFIG_NUMA) &&
-                                       !did_zlc_setup && nr_online_nodes > 1) {
-                               /*
-                                * we do zlc_setup if there are multiple nodes
-                                * and before considering the first zone allowed
-                                * by the cpuset.
-                                */
-                               allowednodes = zlc_setup(zonelist, alloc_flags);
-                               zlc_active = 1;
-                               did_zlc_setup = 1;
-                       }
-
                        if (zone_reclaim_mode == 0 ||
                            !zone_allows_reclaim(ac->preferred_zone, zone))
-                               goto this_zone_full;
-
-                       /*
-                        * As we may have just activated ZLC, check if the first
-                        * eligible zone has failed zone_reclaim recently.
-                        */
-                       if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
-                               !zlc_zone_worth_trying(zonelist, z, allowednodes))
                                continue;
 
                        ret = zone_reclaim(zone, gfp_mask, order);
@@ -2266,34 +2634,26 @@ zonelist_scan:
                                                ac->classzone_idx, alloc_flags))
                                        goto try_this_zone;
 
-                               /*
-                                * Failed to reclaim enough to meet watermark.
-                                * Only mark the zone full if checking the min
-                                * watermark or if we failed to reclaim just
-                                * 1<<order pages or else the page allocator
-                                * fastpath will prematurely mark zones full
-                                * when the watermark is between the low and
-                                * min watermarks.
-                                */
-                               if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
-                                   ret == ZONE_RECLAIM_SOME)
-                                       goto this_zone_full;
-
                                continue;
                        }
                }
 
 try_this_zone:
                page = buffered_rmqueue(ac->preferred_zone, zone, order,
-                                               gfp_mask, ac->migratetype);
+                               gfp_mask, alloc_flags, ac->migratetype);
                if (page) {
                        if (prep_new_page(page, order, gfp_mask, alloc_flags))
                                goto try_this_zone;
+
+                       /*
+                        * If this is a high-order atomic allocation then check
+                        * if the pageblock should be reserved for the future
+                        */
+                       if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
+                               reserve_highatomic_pageblock(page, zone, order);
+
                        return page;
                }
-this_zone_full:
-               if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
-                       zlc_mark_zone_full(zonelist, z);
        }
 
        /*
@@ -2314,12 +2674,6 @@ this_zone_full:
                        zonelist_rescan = true;
        }
 
-       if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) {
-               /* Disable zlc cache for second zonelist scan */
-               zlc_active = 0;
-               zonelist_rescan = true;
-       }
-
        if (zonelist_rescan)
                goto zonelist_scan;
 
@@ -2344,7 +2698,7 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
                DEFAULT_RATELIMIT_INTERVAL,
                DEFAULT_RATELIMIT_BURST);
 
-void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
+void warn_alloc_failed(gfp_t gfp_mask, unsigned int order, const char *fmt, ...)
 {
        unsigned int filter = SHOW_MEM_FILTER_NODES;
 
@@ -2361,7 +2715,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                if (test_thread_flag(TIF_MEMDIE) ||
                    (current->flags & (PF_MEMALLOC | PF_EXITING)))
                        filter &= ~SHOW_MEM_FILTER_NODES;
-       if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
+       if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
                filter &= ~SHOW_MEM_FILTER_NODES;
 
        if (fmt) {
@@ -2378,7 +2732,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                va_end(args);
        }
 
-       pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+       pr_warn("%s: page allocation failure: order:%u, mode:0x%x\n",
                current->comm, order, gfp_mask);
 
        dump_stack();
@@ -2386,61 +2740,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
                show_mem(filter);
 }
 
-static inline int
-should_alloc_retry(gfp_t gfp_mask, unsigned int order,
-                               unsigned long did_some_progress,
-                               unsigned long pages_reclaimed)
-{
-       /* Do not loop if specifically requested */
-       if (gfp_mask & __GFP_NORETRY)
-               return 0;
-
-       /* Always retry if specifically requested */
-       if (gfp_mask & __GFP_NOFAIL)
-               return 1;
-
-       /*
-        * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim
-        * making forward progress without invoking OOM. Suspend also disables
-        * storage devices so kswapd will not help. Bail if we are suspending.
-        */
-       if (!did_some_progress && pm_suspended_storage())
-               return 0;
-
-       /*
-        * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
-        * means __GFP_NOFAIL, but that may not be true in other
-        * implementations.
-        */
-       if (order <= PAGE_ALLOC_COSTLY_ORDER)
-               return 1;
-
-       /*
-        * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
-        * specified, then we retry until we no longer reclaim any pages
-        * (above), or we've reclaimed an order of pages at least as
-        * large as the allocation's order. In both cases, if the
-        * allocation still fails, we stop retrying.
-        */
-       if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
-               return 1;
-
-       return 0;
-}
-
 static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        const struct alloc_context *ac, unsigned long *did_some_progress)
 {
+       struct oom_control oc = {
+               .zonelist = ac->zonelist,
+               .nodemask = ac->nodemask,
+               .gfp_mask = gfp_mask,
+               .order = order,
+       };
        struct page *page;
 
        *did_some_progress = 0;
 
        /*
-        * Acquire the per-zone oom lock for each zone.  If that
-        * fails, somebody else is making progress for us.
+        * Acquire the oom lock.  If that fails, somebody else is
+        * making progress for us.
         */
-       if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
+       if (!mutex_trylock(&oom_lock)) {
                *did_some_progress = 1;
                schedule_timeout_uninterruptible(1);
                return NULL;
@@ -2466,26 +2784,27 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                /* The OOM killer does not needlessly kill tasks for lowmem */
                if (ac->high_zoneidx < ZONE_NORMAL)
                        goto out;
-               /* The OOM killer does not compensate for light reclaim */
+               /* The OOM killer does not compensate for IO-less reclaim */
                if (!(gfp_mask & __GFP_FS)) {
                        /*
                         * XXX: Page reclaim didn't yield anything,
                         * and the OOM killer can't be invoked, but
-                        * keep looping as per should_alloc_retry().
+                        * keep looping as per tradition.
                         */
                        *did_some_progress = 1;
                        goto out;
                }
+               if (pm_suspended_storage())
+                       goto out;
                /* The OOM killer may not free memory on a specific node */
                if (gfp_mask & __GFP_THISNODE)
                        goto out;
        }
        /* Exhausted what can be done so it's blamo time */
-       if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false)
-                       || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+       if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
                *did_some_progress = 1;
 out:
-       oom_zonelist_unlock(ac->zonelist, gfp_mask);
+       mutex_unlock(&oom_lock);
        return page;
 }
 
@@ -2599,19 +2918,17 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
        if (unlikely(!(*did_some_progress)))
                return NULL;
 
-       /* After successful reclaim, reconsider all zones for allocation */
-       if (IS_ENABLED(CONFIG_NUMA))
-               zlc_clear_zones_full(ac->zonelist);
-
 retry:
        page = get_page_from_freelist(gfp_mask, order,
                                        alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
 
        /*
         * If an allocation failed after direct reclaim, it could be because
-        * pages are pinned on the per-cpu lists. Drain them and try again
+        * pages are pinned on the per-cpu lists or in high alloc reserves.
+        * Shrink them and try again.
         */
        if (!page && !drained) {
+               unreserve_highatomic_pageblock(ac);
                drain_all_pages(NULL);
                drained = true;
                goto retry;
@@ -2656,7 +2973,6 @@ static inline int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
        int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
-       const bool atomic = !(gfp_mask & (__GFP_WAIT | __GFP_NO_KSWAPD));
 
        /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
        BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -2665,11 +2981,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
         * The caller may dip into page reserves a bit more if the caller
         * cannot run direct reclaim, or if the caller has realtime scheduling
         * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (atomic == true) and ALLOC_HIGH (__GFP_HIGH).
+        * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
         */
        alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
 
-       if (atomic) {
+       if (gfp_mask & __GFP_ATOMIC) {
                /*
                 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
                 * if it can't schedule.
@@ -2706,11 +3022,16 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
        return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
 }
 
+static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
+{
+       return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                                                struct alloc_context *ac)
 {
-       const gfp_t wait = gfp_mask & __GFP_WAIT;
+       bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
        struct page *page = NULL;
        int alloc_flags;
        unsigned long pages_reclaimed = 0;
@@ -2730,16 +3051,24 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
                return NULL;
        }
 
+       /*
+        * We also sanity check to catch abuse of atomic reserves being used by
+        * callers that are not in atomic context.
+        */
+       if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
+                               (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
+               gfp_mask &= ~__GFP_ATOMIC;
+
        /*
         * If this allocation cannot block and it is for a specific node, then
         * fail early.  There's no need to wakeup kswapd or retry for a
         * speculative node-specific allocation.
         */
-       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !wait)
+       if (IS_ENABLED(CONFIG_NUMA) && (gfp_mask & __GFP_THISNODE) && !can_direct_reclaim)
                goto nopage;
 
 retry:
-       if (!(gfp_mask & __GFP_NO_KSWAPD))
+       if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                wake_all_kswapds(order, ac);
 
        /*
@@ -2782,8 +3111,8 @@ retry:
                }
        }
 
-       /* Atomic allocations - we can't balance anything */
-       if (!wait) {
+       /* Caller is not willing to reclaim, we can't balance anything */
+       if (!can_direct_reclaim) {
                /*
                 * All existing users of the deprecated __GFP_NOFAIL are
                 * blockable, so warn of any new users that actually allow this
@@ -2813,7 +3142,7 @@ retry:
                goto got_pg;
 
        /* Checks for THP-specific high-order allocations */
-       if ((gfp_mask & GFP_TRANSHUGE) == GFP_TRANSHUGE) {
+       if (is_thp_gfp_mask(gfp_mask)) {
                /*
                 * If compaction is deferred for high-order allocations, it is
                 * because sync compaction recently failed. If this is the case
@@ -2848,8 +3177,7 @@ retry:
         * fault, so use asynchronous memory compaction for THP unless it is
         * khugepaged trying to collapse.
         */
-       if ((gfp_mask & GFP_TRANSHUGE) != GFP_TRANSHUGE ||
-                                               (current->flags & PF_KTHREAD))
+       if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
                migration_mode = MIGRATE_SYNC_LIGHT;
 
        /* Try direct reclaim and then allocating */
@@ -2858,40 +3186,40 @@ retry:
        if (page)
                goto got_pg;
 
-       /* Check if we should retry the allocation */
+       /* Do not loop if specifically requested */
+       if (gfp_mask & __GFP_NORETRY)
+               goto noretry;
+
+       /* Keep reclaiming pages as long as there is reasonable progress */
        pages_reclaimed += did_some_progress;
-       if (should_alloc_retry(gfp_mask, order, did_some_progress,
-                                               pages_reclaimed)) {
-               /*
-                * If we fail to make progress by freeing individual
-                * pages, but the allocation wants us to keep going,
-                * start OOM killing tasks.
-                */
-               if (!did_some_progress) {
-                       page = __alloc_pages_may_oom(gfp_mask, order, ac,
-                                                       &did_some_progress);
-                       if (page)
-                               goto got_pg;
-                       if (!did_some_progress)
-                               goto nopage;
-               }
+       if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
+           ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
                /* Wait for some write requests to complete then retry */
                wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
                goto retry;
-       } else {
-               /*
-                * High-order allocations do not necessarily loop after
-                * direct reclaim and reclaim/compaction depends on compaction
-                * being called after reclaim so call directly if necessary
-                */
-               page = __alloc_pages_direct_compact(gfp_mask, order,
-                                       alloc_flags, ac, migration_mode,
-                                       &contended_compaction,
-                                       &deferred_compaction);
-               if (page)
-                       goto got_pg;
        }
 
+       /* Reclaim has failed us, start killing things */
+       page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+       if (page)
+               goto got_pg;
+
+       /* Retry as long as the OOM killer is making progress */
+       if (did_some_progress)
+               goto retry;
+
+noretry:
+       /*
+        * High-order allocations do not necessarily loop after
+        * direct reclaim and reclaim/compaction depends on compaction
+        * being called after reclaim so call directly if necessary
+        */
+       page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
+                                           ac, migration_mode,
+                                           &contended_compaction,
+                                           &deferred_compaction);
+       if (page)
+               goto got_pg;
 nopage:
        warn_alloc_failed(gfp_mask, order, NULL);
 got_pg:
@@ -2920,7 +3248,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 
        lockdep_trace_alloc(gfp_mask);
 
-       might_sleep_if(gfp_mask & __GFP_WAIT);
+       might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
        if (should_fail_alloc_page(gfp_mask, order))
                return NULL;
@@ -2941,6 +3269,10 @@ retry_cpuset:
 
        /* We set it here, as __alloc_pages_slowpath might have changed it */
        ac.zonelist = zonelist;
+
+       /* Dirty zone balancing only done in the fast path */
+       ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
+
        /* The preferred zone is used for statistics later */
        preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
                                ac.nodemask ? : &cpuset_current_mems_allowed,
@@ -2959,6 +3291,7 @@ retry_cpuset:
                 * complete.
                 */
                alloc_mask = memalloc_noio_flags(gfp_mask);
+               ac.spread_dirty_pages = false;
 
                page = __alloc_pages_slowpath(alloc_mask, order, &ac);
        }
@@ -3030,6 +3363,104 @@ void free_pages(unsigned long addr, unsigned int order)
 
 EXPORT_SYMBOL(free_pages);
 
+/*
+ * Page Fragment:
+ *  An arbitrary-length arbitrary-offset area of memory which resides
+ *  within a 0 or higher order page.  Multiple fragments within that page
+ *  are individually refcounted, in the page's reference counter.
+ *
+ * The page_frag functions below provide a simple allocation framework for
+ * page fragments.  This is used by the network stack and network device
+ * drivers to provide a backing region of memory for use as either an
+ * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ */
+static struct page *__page_frag_refill(struct page_frag_cache *nc,
+                                      gfp_t gfp_mask)
+{
+       struct page *page = NULL;
+       gfp_t gfp = gfp_mask;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+       gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
+                   __GFP_NOMEMALLOC;
+       page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
+                               PAGE_FRAG_CACHE_MAX_ORDER);
+       nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+#endif
+       if (unlikely(!page))
+               page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+
+       nc->va = page ? page_address(page) : NULL;
+
+       return page;
+}
+
+void *__alloc_page_frag(struct page_frag_cache *nc,
+                       unsigned int fragsz, gfp_t gfp_mask)
+{
+       unsigned int size = PAGE_SIZE;
+       struct page *page;
+       int offset;
+
+       if (unlikely(!nc->va)) {
+refill:
+               page = __page_frag_refill(nc, gfp_mask);
+               if (!page)
+                       return NULL;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+               /* if size can vary use size else just use PAGE_SIZE */
+               size = nc->size;
+#endif
+               /* Even if we own the page, we do not use atomic_set().
+                * This would break get_page_unless_zero() users.
+                */
+               atomic_add(size - 1, &page->_count);
+
+               /* reset page count bias and offset to start of new frag */
+               nc->pfmemalloc = page_is_pfmemalloc(page);
+               nc->pagecnt_bias = size;
+               nc->offset = size;
+       }
+
+       offset = nc->offset - fragsz;
+       if (unlikely(offset < 0)) {
+               page = virt_to_page(nc->va);
+
+               if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
+                       goto refill;
+
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+               /* if size can vary use size else just use PAGE_SIZE */
+               size = nc->size;
+#endif
+               /* OK, page count is 0, we can safely set it */
+               atomic_set(&page->_count, size);
+
+               /* reset page count bias and offset to start of new frag */
+               nc->pagecnt_bias = size;
+               offset = size - fragsz;
+       }
+
+       nc->pagecnt_bias--;
+       nc->offset = offset;
+
+       return nc->va + offset;
+}
+EXPORT_SYMBOL(__alloc_page_frag);
+
+/*
+ * Frees a page fragment allocated out of either a compound or order 0 page.
+ */
+void __free_page_frag(void *addr)
+{
+       struct page *page = virt_to_head_page(addr);
+
+       if (unlikely(put_page_testzero(page)))
+               __free_pages_ok(page, compound_order(page));
+}
+EXPORT_SYMBOL(__free_page_frag);
+
 /*
  * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
  * of the current memory cgroup.
@@ -3040,24 +3471,24 @@ EXPORT_SYMBOL(free_pages);
 struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
 {
        struct page *page;
-       struct mem_cgroup *memcg = NULL;
 
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
        page = alloc_pages(gfp_mask, order);
-       memcg_kmem_commit_charge(page, memcg, order);
+       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+               __free_pages(page, order);
+               page = NULL;
+       }
        return page;
 }
 
 struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
 {
        struct page *page;
-       struct mem_cgroup *memcg = NULL;
 
-       if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-               return NULL;
        page = alloc_pages_node(nid, gfp_mask, order);
-       memcg_kmem_commit_charge(page, memcg, order);
+       if (page && memcg_kmem_charge(page, gfp_mask, order) != 0) {
+               __free_pages(page, order);
+               page = NULL;
+       }
        return page;
 }
 
@@ -3067,7 +3498,7 @@ struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
  */
 void __free_kmem_pages(struct page *page, unsigned int order)
 {
-       memcg_kmem_uncharge_pages(page, order);
+       memcg_kmem_uncharge(page, order);
        __free_pages(page, order);
 }
 
@@ -3079,7 +3510,8 @@ void free_kmem_pages(unsigned long addr, unsigned int order)
        }
 }
 
-static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
+static void *make_alloc_exact(unsigned long addr, unsigned int order,
+               size_t size)
 {
        if (addr) {
                unsigned long alloc_end = addr + (PAGE_SIZE << order);
@@ -3126,12 +3558,10 @@ EXPORT_SYMBOL(alloc_pages_exact);
  *
  * Like alloc_pages_exact(), but try to allocate on node nid first before falling
  * back.
- * Note this is not alloc_pages_exact_node() which allocates on a specific node,
- * but is not exact.
  */
 void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
 {
-       unsigned order = get_order(size);
+       unsigned int order = get_order(size);
        struct page *p = alloc_pages_node(nid, gfp_mask, order);
        if (!p)
                return NULL;
@@ -3278,9 +3708,9 @@ static void show_migration_types(unsigned char type)
 {
        static const char types[MIGRATE_TYPES] = {
                [MIGRATE_UNMOVABLE]     = 'U',
-               [MIGRATE_RECLAIMABLE]   = 'E',
                [MIGRATE_MOVABLE]       = 'M',
-               [MIGRATE_RESERVE]       = 'R',
+               [MIGRATE_RECLAIMABLE]   = 'E',
+               [MIGRATE_HIGHATOMIC]    = 'H',
 #ifdef CONFIG_CMA
                [MIGRATE_CMA]           = 'C',
 #endif
@@ -3433,7 +3863,8 @@ void show_free_areas(unsigned int filter)
        }
 
        for_each_populated_zone(zone) {
-               unsigned long nr[MAX_ORDER], flags, order, total = 0;
+               unsigned int order;
+               unsigned long nr[MAX_ORDER], flags, total = 0;
                unsigned char types[MAX_ORDER];
 
                if (skip_free_areas_node(filter, zone_to_nid(zone)))
@@ -3782,7 +4213,7 @@ static void build_zonelists(pg_data_t *pgdat)
        nodemask_t used_mask;
        int local_node, prev_node;
        struct zonelist *zonelist;
-       int order = current_zonelist_order;
+       unsigned int order = current_zonelist_order;
 
        /* initialize zonelists */
        for (i = 0; i < MAX_ZONELISTS; i++) {
@@ -3826,20 +4257,6 @@ static void build_zonelists(pg_data_t *pgdat)
        build_thisnode_zonelists(pgdat);
 }
 
-/* Construct the zonelist performance cache - see further mmzone.h */
-static void build_zonelist_cache(pg_data_t *pgdat)
-{
-       struct zonelist *zonelist;
-       struct zonelist_cache *zlc;
-       struct zoneref *z;
-
-       zonelist = &pgdat->node_zonelists[0];
-       zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
-       bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-       for (z = zonelist->_zonerefs; z->zone; z++)
-               zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
-}
-
 #ifdef CONFIG_HAVE_MEMORYLESS_NODES
 /*
  * Return node id of node used for "local" allocations.
@@ -3900,12 +4317,6 @@ static void build_zonelists(pg_data_t *pgdat)
        zonelist->_zonerefs[j].zone_idx = 0;
 }
 
-/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
-static void build_zonelist_cache(pg_data_t *pgdat)
-{
-       pgdat->node_zonelists[0].zlcache_ptr = NULL;
-}
-
 #endif /* CONFIG_NUMA */
 
 /*
@@ -3946,14 +4357,12 @@ static int __build_all_zonelists(void *data)
 
        if (self && !node_online(self->node_id)) {
                build_zonelists(self);
-               build_zonelist_cache(self);
        }
 
        for_each_online_node(nid) {
                pg_data_t *pgdat = NODE_DATA(nid);
 
                build_zonelists(pgdat);
-               build_zonelist_cache(pgdat);
        }
 
        /*
@@ -4112,117 +4521,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
        return ffz(~size);
 }
 
-/*
- * Check if a pageblock contains reserved pages
- */
-static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
-{
-       unsigned long pfn;
-
-       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
-               if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
-                       return 1;
-       }
-       return 0;
-}
-
-/*
- * Mark a number of pageblocks as MIGRATE_RESERVE. The number
- * of blocks reserved is based on min_wmark_pages(zone). The memory within
- * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
- * higher will lead to a bigger reserve which will get freed as contiguous
- * blocks as reclaim kicks in
- */
-static void setup_zone_migrate_reserve(struct zone *zone)
-{
-       unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
-       struct page *page;
-       unsigned long block_migratetype;
-       int reserve;
-       int old_reserve;
-
-       /*
-        * Get the start pfn, end pfn and the number of blocks to reserve
-        * We have to be careful to be aligned to pageblock_nr_pages to
-        * make sure that we always check pfn_valid for the first page in
-        * the block.
-        */
-       start_pfn = zone->zone_start_pfn;
-       end_pfn = zone_end_pfn(zone);
-       start_pfn = roundup(start_pfn, pageblock_nr_pages);
-       reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
-                                                       pageblock_order;
-
-       /*
-        * Reserve blocks are generally in place to help high-order atomic
-        * allocations that are short-lived. A min_free_kbytes value that
-        * would result in more than 2 reserve blocks for atomic allocations
-        * is assumed to be in place to help anti-fragmentation for the
-        * future allocation of hugepages at runtime.
-        */
-       reserve = min(2, reserve);
-       old_reserve = zone->nr_migrate_reserve_block;
-
-       /* When memory hot-add, we almost always need to do nothing */
-       if (reserve == old_reserve)
-               return;
-       zone->nr_migrate_reserve_block = reserve;
-
-       for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
-               if (!pfn_valid(pfn))
-                       continue;
-               page = pfn_to_page(pfn);
-
-               /* Watch out for overlapping nodes */
-               if (page_to_nid(page) != zone_to_nid(zone))
-                       continue;
-
-               block_migratetype = get_pageblock_migratetype(page);
-
-               /* Only test what is necessary when the reserves are not met */
-               if (reserve > 0) {
-                       /*
-                        * Blocks with reserved pages will never free, skip
-                        * them.
-                        */
-                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
-                       if (pageblock_is_reserved(pfn, block_end_pfn))
-                               continue;
-
-                       /* If this block is reserved, account for it */
-                       if (block_migratetype == MIGRATE_RESERVE) {
-                               reserve--;
-                               continue;
-                       }
-
-                       /* Suitable for reserving if this block is movable */
-                       if (block_migratetype == MIGRATE_MOVABLE) {
-                               set_pageblock_migratetype(page,
-                                                       MIGRATE_RESERVE);
-                               move_freepages_block(zone, page,
-                                                       MIGRATE_RESERVE);
-                               reserve--;
-                               continue;
-                       }
-               } else if (!old_reserve) {
-                       /*
-                        * At boot time we don't need to scan the whole zone
-                        * for turning off MIGRATE_RESERVE.
-                        */
-                       break;
-               }
-
-               /*
-                * If the reserve is met and this is a previous reserved block,
-                * take it back
-                */
-               if (block_migratetype == MIGRATE_RESERVE) {
-                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-                       move_freepages_block(zone, page, MIGRATE_MOVABLE);
-               }
-       }
-}
-
 /*
  * Initially all pages are reserved - free ones are freed
  * up by free_all_bootmem() once the early boot process is
@@ -4231,15 +4529,16 @@ static void setup_zone_migrate_reserve(struct zone *zone)
 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn, enum memmap_context context)
 {
-       struct page *page;
+       pg_data_t *pgdat = NODE_DATA(nid);
        unsigned long end_pfn = start_pfn + size;
        unsigned long pfn;
        struct zone *z;
+       unsigned long nr_initialised = 0;
 
        if (highest_memmap_pfn < end_pfn - 1)
                highest_memmap_pfn = end_pfn - 1;
 
-       z = &NODE_DATA(nid)->node_zones[zone];
+       z = &pgdat->node_zones[zone];
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                /*
                 * There can be holes in boot-time mem_map[]s
@@ -4251,39 +4550,31 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                                continue;
                        if (!early_pfn_in_nid(pfn, nid))
                                continue;
+                       if (!update_defer_init(pgdat, pfn, end_pfn,
+                                               &nr_initialised))
+                               break;
                }
-               page = pfn_to_page(pfn);
-               set_page_links(page, zone, nid, pfn);
-               mminit_verify_page_links(page, zone, nid, pfn);
-               init_page_count(page);
-               page_mapcount_reset(page);
-               page_cpupid_reset_last(page);
-               SetPageReserved(page);
+
                /*
                 * Mark the block movable so that blocks are reserved for
                 * movable at startup. This will force kernel allocations
                 * to reserve their blocks rather than leaking throughout
                 * the address space during boot when many long-lived
-                * kernel allocations are made. Later some blocks near
-                * the start are marked MIGRATE_RESERVE by
-                * setup_zone_migrate_reserve()
+                * kernel allocations are made.
                 *
                 * bitmap is created for zone's valid pfn range. but memmap
                 * can be created for invalid pages (for alignment)
                 * check here not to call set_pageblock_migratetype() against
                 * pfn out of zone.
                 */
-               if ((z->zone_start_pfn <= pfn)
-                   && (pfn < zone_end_pfn(z))
-                   && !(pfn & (pageblock_nr_pages - 1)))
-                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+               if (!(pfn & (pageblock_nr_pages - 1))) {
+                       struct page *page = pfn_to_page(pfn);
 
-               INIT_LIST_HEAD(&page->lru);
-#ifdef WANT_PAGE_VIRTUAL
-               /* The shift won't overflow because ZONE_NORMAL is below 4G. */
-               if (!is_highmem_idx(zone))
-                       set_page_address(page, __va(pfn << PAGE_SHIFT));
-#endif
+                       __init_single_page(page, pfn, zone, nid);
+                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+               } else {
+                       __init_single_pfn(pfn, zone, nid);
+               }
        }
 }
 
@@ -4516,8 +4807,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
 
 int __meminit init_currently_empty_zone(struct zone *zone,
                                        unsigned long zone_start_pfn,
-                                       unsigned long size,
-                                       enum memmap_context context)
+                                       unsigned long size)
 {
        struct pglist_data *pgdat = zone->zone_pgdat;
        int ret;
@@ -4541,57 +4831,30 @@ int __meminit init_currently_empty_zone(struct zone *zone,
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+
 /*
  * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
  */
-int __meminit __early_pfn_to_nid(unsigned long pfn)
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+                                       struct mminit_pfnnid_cache *state)
 {
        unsigned long start_pfn, end_pfn;
        int nid;
-       /*
-        * NOTE: The following SMP-unsafe globals are only used early in boot
-        * when the kernel is running single-threaded.
-        */
-       static unsigned long __meminitdata last_start_pfn, last_end_pfn;
-       static int __meminitdata last_nid;
 
-       if (last_start_pfn <= pfn && pfn < last_end_pfn)
-               return last_nid;
+       if (state->last_start <= pfn && pfn < state->last_end)
+               return state->last_nid;
 
        nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
        if (nid != -1) {
-               last_start_pfn = start_pfn;
-               last_end_pfn = end_pfn;
-               last_nid = nid;
+               state->last_start = start_pfn;
+               state->last_end = end_pfn;
+               state->last_nid = nid;
        }
 
        return nid;
 }
 #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
 
-int __meminit early_pfn_to_nid(unsigned long pfn)
-{
-       int nid;
-
-       nid = __early_pfn_to_nid(pfn);
-       if (nid >= 0)
-               return nid;
-       /* just returns 0 */
-       return 0;
-}
-
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
-       int nid;
-
-       nid = __early_pfn_to_nid(pfn);
-       if (nid >= 0 && nid != node)
-               return false;
-       return true;
-}
-#endif
-
 /**
  * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
  * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
@@ -4731,6 +4994,10 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 {
        unsigned long zone_start_pfn, zone_end_pfn;
 
+       /* When hotadd a new node from cpu_up(), the node should be empty */
+       if (!node_start_pfn && !node_end_pfn)
+               return 0;
+
        /* Get the start and end of the zone */
        zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
        zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
@@ -4794,6 +5061,10 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
        unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
        unsigned long zone_start_pfn, zone_end_pfn;
 
+       /* When hotadd a new node from cpu_up(), the node should be empty */
+       if (!node_start_pfn && !node_end_pfn)
+               return 0;
+
        zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
        zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
 
@@ -4833,22 +5104,28 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
                                                unsigned long *zones_size,
                                                unsigned long *zholes_size)
 {
-       unsigned long realtotalpages, totalpages = 0;
+       unsigned long realtotalpages = 0, totalpages = 0;
        enum zone_type i;
 
-       for (i = 0; i < MAX_NR_ZONES; i++)
-               totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
-                                                        node_start_pfn,
-                                                        node_end_pfn,
-                                                        zones_size);
-       pgdat->node_spanned_pages = totalpages;
-
-       realtotalpages = totalpages;
-       for (i = 0; i < MAX_NR_ZONES; i++)
-               realtotalpages -=
-                       zone_absent_pages_in_node(pgdat->node_id, i,
+       for (i = 0; i < MAX_NR_ZONES; i++) {
+               struct zone *zone = pgdat->node_zones + i;
+               unsigned long size, real_size;
+
+               size = zone_spanned_pages_in_node(pgdat->node_id, i,
+                                                 node_start_pfn,
+                                                 node_end_pfn,
+                                                 zones_size);
+               real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
                                                  node_start_pfn, node_end_pfn,
                                                  zholes_size);
+               zone->spanned_pages = size;
+               zone->present_pages = real_size;
+
+               totalpages += size;
+               realtotalpages += real_size;
+       }
+
+       pgdat->node_spanned_pages = totalpages;
        pgdat->node_present_pages = realtotalpages;
        printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
                                                        realtotalpages);
@@ -4957,9 +5234,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
  *
  * NOTE: pgdat should get zeroed by caller.
  */
-static void __paginginit free_area_init_core(struct pglist_data *pgdat,
-               unsigned long node_start_pfn, unsigned long node_end_pfn,
-               unsigned long *zones_size, unsigned long *zholes_size)
+static void __paginginit free_area_init_core(struct pglist_data *pgdat)
 {
        enum zone_type j;
        int nid = pgdat->node_id;
@@ -4980,12 +5255,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                struct zone *zone = pgdat->node_zones + j;
                unsigned long size, realsize, freesize, memmap_pages;
 
-               size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
-                                                 node_end_pfn, zones_size);
-               realsize = freesize = size - zone_absent_pages_in_node(nid, j,
-                                                               node_start_pfn,
-                                                               node_end_pfn,
-                                                               zholes_size);
+               size = zone->spanned_pages;
+               realsize = freesize = zone->present_pages;
 
                /*
                 * Adjust freesize so that it accounts for how much memory
@@ -5020,8 +5291,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                        nr_kernel_pages -= memmap_pages;
                nr_all_pages += freesize;
 
-               zone->spanned_pages = size;
-               zone->present_pages = realsize;
                /*
                 * Set an approximate value for lowmem here, it will be adjusted
                 * when the bootmem allocator frees pages into the buddy system.
@@ -5050,8 +5319,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
                set_pageblock_order();
                setup_usemap(pgdat, zone, zone_start_pfn, size);
-               ret = init_currently_empty_zone(zone, zone_start_pfn,
-                                               size, MEMMAP_EARLY);
+               ret = init_currently_empty_zone(zone, zone_start_pfn, size);
                BUG_ON(ret);
                memmap_init(size, nid, j, zone_start_pfn);
                zone_start_pfn += size;
@@ -5060,14 +5328,19 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
 static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
 {
+       unsigned long __maybe_unused start = 0;
+       unsigned long __maybe_unused offset = 0;
+
        /* Skip empty nodes */
        if (!pgdat->node_spanned_pages)
                return;
 
 #ifdef CONFIG_FLAT_NODE_MEM_MAP
+       start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
+       offset = pgdat->node_start_pfn - start;
        /* ia64 gets its own node_mem_map, before this, without bootmem */
        if (!pgdat->node_mem_map) {
-               unsigned long size, start, end;
+               unsigned long size, end;
                struct page *map;
 
                /*
@@ -5075,7 +5348,6 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                 * aligned but the node_mem_map endpoints must be in order
                 * for the buddy allocator to function correctly.
                 */
-               start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
                end = pgdat_end_pfn(pgdat);
                end = ALIGN(end, MAX_ORDER_NR_PAGES);
                size =  (end - start) * sizeof(struct page);
@@ -5083,7 +5355,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
                if (!map)
                        map = memblock_virt_alloc_node_nopanic(size,
                                                               pgdat->node_id);
-               pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
+               pgdat->node_mem_map = map + offset;
        }
 #ifndef CONFIG_NEED_MULTIPLE_NODES
        /*
@@ -5091,9 +5363,9 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
         */
        if (pgdat == NODE_DATA(0)) {
                mem_map = NODE_DATA(0)->node_mem_map;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
                if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
-                       mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
+                       mem_map -= offset;
 #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
        }
 #endif
@@ -5110,12 +5382,14 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        /* pg_data_t should be reset to zero when it's allocated */
        WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
 
+       reset_deferred_meminit(pgdat);
        pgdat->node_id = nid;
        pgdat->node_start_pfn = node_start_pfn;
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
        get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
        pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
-               (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
+               (u64)start_pfn << PAGE_SHIFT,
+               end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
 #endif
        calculate_node_totalpages(pgdat, start_pfn, end_pfn,
                                  zones_size, zholes_size);
@@ -5127,8 +5401,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
                (unsigned long)pgdat->node_mem_map);
 #endif
 
-       free_area_init_core(pgdat, start_pfn, end_pfn,
-                           zones_size, zholes_size);
+       free_area_init_core(pgdat);
 }
 
 #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5139,11 +5412,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
  */
 void __init setup_nr_node_ids(void)
 {
-       unsigned int node;
-       unsigned int highest = 0;
+       unsigned int highest;
 
-       for_each_node_mask(node, node_possible_map)
-               highest = node;
+       highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
        nr_node_ids = highest + 1;
 }
 #endif
@@ -5306,13 +5577,17 @@ static void __init find_zone_movable_pfns_for_nodes(void)
                 */
                required_movablecore =
                        roundup(required_movablecore, MAX_ORDER_NR_PAGES);
+               required_movablecore = min(totalpages, required_movablecore);
                corepages = totalpages - required_movablecore;
 
                required_kernelcore = max(required_kernelcore, corepages);
        }
 
-       /* If kernelcore was not specified, there is no ZONE_MOVABLE */
-       if (!required_kernelcore)
+       /*
+        * If kernelcore was not specified or kernelcore size is larger
+        * than totalpages, there is no ZONE_MOVABLE.
+        */
+       if (!required_kernelcore || required_kernelcore >= totalpages)
                goto out;
 
        /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
@@ -5664,7 +5939,7 @@ void __init mem_init_print_info(const char *str)
  * set_dma_reserve - set the specified number of pages reserved in the first zone
  * @new_dma_reserve: The number of pages to mark reserved
  *
- * The per-cpu batchsize and zone watermarks are determined by present_pages.
+ * The per-cpu batchsize and zone watermarks are determined by managed_pages.
  * In the DMA zone, a significant percentage may be consumed by kernel image
  * and other unfreeable allocations which can skew the watermarks badly. This
  * function may optionally be used to account for unfreeable pages in the
@@ -5718,7 +5993,7 @@ void __init page_alloc_init(void)
 }
 
 /*
- * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio
+ * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
  *     or min_free_kbytes changes.
  */
 static void calculate_totalreserve_pages(void)
@@ -5762,7 +6037,7 @@ static void calculate_totalreserve_pages(void)
 
 /*
  * setup_per_zone_lowmem_reserve - called whenever
- *     sysctl_lower_zone_reserve_ratio changes.  Ensures that each zone
+ *     sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
  *     has a correct pages reserved value, so an adequate number of
  *     pages are left in the zone after a successful __alloc_pages().
  */
@@ -5848,7 +6123,6 @@ static void __setup_per_zone_wmarks(void)
                        high_wmark_pages(zone) - low_wmark_pages(zone) -
                        atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 
-               setup_zone_migrate_reserve(zone);
                spin_unlock_irqrestore(&zone->lock, flags);
        }
 
@@ -6078,9 +6352,9 @@ out:
        return ret;
 }
 
+#ifdef CONFIG_NUMA
 int hashdist = HASHDIST_DEFAULT;
 
-#ifdef CONFIG_NUMA
 static int __init set_hashdist(char *str)
 {
        if (!str)
@@ -6470,7 +6744,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
                       unsigned migratetype)
 {
        unsigned long outer_start, outer_end;
-       int ret = 0, order;
+       unsigned int order;
+       int ret = 0;
 
        struct compact_control cc = {
                .nr_migratepages = 0,