These changes are the raw update to linux-4.4.6-rt14. Kernel sources

[kvmfornfv.git] / kernel / mm / hugetlb.c
diff --git a/kernel/mm/hugetlb.c b/kernel/mm/hugetlb.c

index 8c4c1f9..ef6963b 100644 (file)
--- a/kernel/mm/hugetlb.c
+++ b/kernel/mm/hugetlb.c
@@ -64,7 +64,7 @@ DEFINE_SPINLOCK(hugetlb_lock);
   * prevent spurious OOMs when the hugepage pool is fully utilized.
   */
  static int num_fault_mutexes;
-static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
+struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
  
  /* Forward declaration */
  static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -217,8 +217,20 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
   * Region tracking -- allows tracking of reservations and instantiated pages
   *                    across the pages in a mapping.
   *
- * The region data structures are embedded into a resv_map and
- * protected by a resv_map's lock
+ * The region data structures are embedded into a resv_map and protected
+ * by a resv_map's lock.  The set of regions within the resv_map represent
+ * reservations for huge pages, or huge pages that have already been
+ * instantiated within the map.  The from and to elements are huge page
+ * indices into the associated mapping.  from indicates the starting index
+ * of the region.  to represents the first index past the end of the region.
+ *
+ * For example, a file region structure with from == 0 and to == 4 represents
+ * four huge pages in a mapping.  It is important to note that the to element
+ * represents the first element past the end of the region. This is used in
+ * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
+ *
+ * Interval notation of the form [from, to) will be used to indicate that
+ * the endpoint from is inclusive and to is exclusive.
   */
  struct file_region {
         struct list_head link;
@@ -226,10 +238,25 @@ struct file_region {
         long to;
  };
  
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map.  In the normal case, existing regions will be expanded
+ * to accommodate the specified range.  Sufficient regions should
+ * exist for expansion due to the previous call to region_chg
+ * with the same range.  However, it is possible that region_del
+ * could have been called after region_chg and modified the map
+ * in such a way that no region exists to be expanded.  In this
+ * case, pull a region descriptor from the cache associated with
+ * the map and use that for the new range.
+ *
+ * Return the number of new huge pages added to the map.  This
+ * number is greater than or equal to zero.
+ */
  static long region_add(struct resv_map *resv, long f, long t)
  {
         struct list_head *head = &resv->regions;
         struct file_region *rg, *nrg, *trg;
+       long add = 0;
  
         spin_lock(&resv->lock);
         /* Locate the region we are either in or before. */
@@ -237,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
                 if (f <= rg->to)
                         break;
  
+       /*
+        * If no region exists which can be expanded to include the
+        * specified range, the list must have been modified by an
+        * interleaving call to region_del().  Pull a region descriptor
+        * from the cache and use it for this range.
+        */
+       if (&rg->link == head || t < rg->from) {
+               VM_BUG_ON(resv->region_cache_count <= 0);
+
+               resv->region_cache_count--;
+               nrg = list_first_entry(&resv->region_cache, struct file_region,
+                                       link);
+               list_del(&nrg->link);
+
+               nrg->from = f;
+               nrg->to = t;
+               list_add(&nrg->link, rg->link.prev);
+
+               add += t - f;
+               goto out_locked;
+       }
+
         /* Round our left edge to the current segment if it encloses us. */
         if (f > rg->from)
                 f = rg->from;
@@ -255,16 +304,50 @@ static long region_add(struct resv_map *resv, long f, long t)
                 if (rg->to > t)
                         t = rg->to;
                 if (rg != nrg) {
+                       /* Decrement return value by the deleted range.
+                        * Another range will span this area so that by
+                        * end of routine add will be >= zero
+                        */
+                       add -= (rg->to - rg->from);
                         list_del(&rg->link);
                         kfree(rg);
                 }
         }
+
+       add += (nrg->from - f);         /* Added to beginning of region */
         nrg->from = f;
+       add += t - nrg->to;             /* Added to end of region */
         nrg->to = t;
+
+out_locked:
+       resv->adds_in_progress--;
         spin_unlock(&resv->lock);
-       return 0;
+       VM_BUG_ON(add < 0);
+       return add;
  }
  
+/*
+ * Examine the existing reserve map and determine how many
+ * huge pages in the specified range [f, t) are NOT currently
+ * represented.  This routine is called before a subsequent
+ * call to region_add that will actually modify the reserve
+ * map to add the specified range [f, t).  region_chg does
+ * not change the number of huge pages represented by the
+ * map.  However, if the existing regions in the map can not
+ * be expanded to represent the new range, a new file_region
+ * structure is added to the map as a placeholder.  This is
+ * so that the subsequent region_add call will have all the
+ * regions it needs and will not fail.
+ *
+ * Upon entry, region_chg will also examine the cache of region descriptors
+ * associated with the map.  If there are not enough descriptors cached, one
+ * will be allocated for the in progress add operation.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t).  This number is greater or equal to
+ * zero.  -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and can not be allocated.
+ */
  static long region_chg(struct resv_map *resv, long f, long t)
  {
         struct list_head *head = &resv->regions;
@@ -273,6 +356,33 @@ static long region_chg(struct resv_map *resv, long f, long t)
  
  retry:
         spin_lock(&resv->lock);
+retry_locked:
+       resv->adds_in_progress++;
+
+       /*
+        * Check for sufficient descriptors in the cache to accommodate
+        * the number of in progress add operations.
+        */
+       if (resv->adds_in_progress > resv->region_cache_count) {
+               struct file_region *trg;
+
+               VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
+               /* Must drop lock to allocate a new descriptor. */
+               resv->adds_in_progress--;
+               spin_unlock(&resv->lock);
+
+               trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+               if (!trg) {
+                       kfree(nrg);
+                       return -ENOMEM;
+               }
+
+               spin_lock(&resv->lock);
+               list_add(&trg->link, &resv->region_cache);
+               resv->region_cache_count++;
+               goto retry_locked;
+       }
+
         /* Locate the region we are before or in. */
         list_for_each_entry(rg, head, link)
                 if (f <= rg->to)
@@ -283,6 +393,7 @@ retry:
          * size such that we can guarantee to record the reservation. */
         if (&rg->link == head || t < rg->from) {
                 if (!nrg) {
+                       resv->adds_in_progress--;
                         spin_unlock(&resv->lock);
                         nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
                         if (!nrg)
@@ -331,41 +442,146 @@ out_nrg:
         return chg;
  }
  
-static long region_truncate(struct resv_map *resv, long end)
+/*
+ * Abort the in progress add operation.  The adds_in_progress field
+ * of the resv_map keeps track of the operations in progress between
+ * calls to region_chg and region_add.  Operations are sometimes
+ * aborted after the call to region_chg.  In such cases, region_abort
+ * is called to decrement the adds_in_progress counter.
+ *
+ * NOTE: The range arguments [f, t) are not needed or used in this
+ * routine.  They are kept to make reading the calling code easier as
+ * arguments will match the associated region_chg call.
+ */
+static void region_abort(struct resv_map *resv, long f, long t)
+{
+       spin_lock(&resv->lock);
+       VM_BUG_ON(!resv->region_cache_count);
+       resv->adds_in_progress--;
+       spin_unlock(&resv->lock);
+}
+
+/*
+ * Delete the specified range [f, t) from the reserve map.  If the
+ * t parameter is LONG_MAX, this indicates that ALL regions after f
+ * should be deleted.  Locate the regions which intersect [f, t)
+ * and either trim, delete or split the existing regions.
+ *
+ * Returns the number of huge pages deleted from the reserve map.
+ * In the normal case, the return value is zero or more.  In the
+ * case where a region must be split, a new region descriptor must
+ * be allocated.  If the allocation fails, -ENOMEM will be returned.
+ * NOTE: If the parameter t == LONG_MAX, then we will never split
+ * a region and possibly return -ENOMEM.  Callers specifying
+ * t == LONG_MAX do not need to check for -ENOMEM error.
+ */
+static long region_del(struct resv_map *resv, long f, long t)
  {
         struct list_head *head = &resv->regions;
         struct file_region *rg, *trg;
-       long chg = 0;
+       struct file_region *nrg = NULL;
+       long del = 0;
  
+retry:
         spin_lock(&resv->lock);
-       /* Locate the region we are either in or before. */
-       list_for_each_entry(rg, head, link)
-               if (end <= rg->to)
+       list_for_each_entry_safe(rg, trg, head, link) {
+               /*
+                * Skip regions before the range to be deleted.  file_region
+                * ranges are normally of the form [from, to).  However, there
+                * may be a "placeholder" entry in the map which is of the form
+                * (from, to) with from == to.  Check for placeholder entries
+                * at the beginning of the range to be deleted.
+                */
+               if (rg->to <= f && (rg->to != rg->from || rg->to != f))
+                       continue;
+
+               if (rg->from >= t)
                         break;
-       if (&rg->link == head)
-               goto out;
  
-       /* If we are in the middle of a region then adjust it. */
-       if (end > rg->from) {
-               chg = rg->to - end;
-               rg->to = end;
-               rg = list_entry(rg->link.next, typeof(*rg), link);
-       }
+               if (f > rg->from && t < rg->to) { /* Must split region */
+                       /*
+                        * Check for an entry in the cache before dropping
+                        * lock and attempting allocation.
+                        */
+                       if (!nrg &&
+                           resv->region_cache_count > resv->adds_in_progress) {
+                               nrg = list_first_entry(&resv->region_cache,
+                                                       struct file_region,
+                                                       link);
+                               list_del(&nrg->link);
+                               resv->region_cache_count--;
+                       }
  
-       /* Drop any remaining regions. */
-       list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-               if (&rg->link == head)
+                       if (!nrg) {
+                               spin_unlock(&resv->lock);
+                               nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+                               if (!nrg)
+                                       return -ENOMEM;
+                               goto retry;
+                       }
+
+                       del += t - f;
+
+                       /* New entry for end of split region */
+                       nrg->from = t;
+                       nrg->to = rg->to;
+                       INIT_LIST_HEAD(&nrg->link);
+
+                       /* Original entry is trimmed */
+                       rg->to = f;
+
+                       list_add(&nrg->link, &rg->link);
+                       nrg = NULL;
                         break;
-               chg += rg->to - rg->from;
-               list_del(&rg->link);
-               kfree(rg);
+               }
+
+               if (f <= rg->from && t >= rg->to) { /* Remove entire region */
+                       del += rg->to - rg->from;
+                       list_del(&rg->link);
+                       kfree(rg);
+                       continue;
+               }
+
+               if (f <= rg->from) {    /* Trim beginning of region */
+                       del += t - rg->from;
+                       rg->from = t;
+               } else {                /* Trim end of region */
+                       del += rg->to - f;
+                       rg->to = f;
+               }
         }
  
-out:
         spin_unlock(&resv->lock);
-       return chg;
+       kfree(nrg);
+       return del;
  }
  
+/*
+ * A rare out of memory error was encountered which prevented removal of
+ * the reserve map region for a page.  The huge page itself was freed
+ * and removed from the page cache.  This routine will adjust the subpool
+ * usage count, and the global reserve count if needed.  By incrementing
+ * these counts, the reserve map entry which could not be deleted will
+ * appear as a "reserved" entry instead of simply dangling with incorrect
+ * counts.
+ */
+void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+{
+       struct hugepage_subpool *spool = subpool_inode(inode);
+       long rsv_adjust;
+
+       rsv_adjust = hugepage_subpool_get_pages(spool, 1);
+       if (restore_reserve && rsv_adjust) {
+               struct hstate *h = hstate_inode(inode);
+
+               hugetlb_acct_memory(h, 1);
+       }
+}
+
+/*
+ * Count and return the number of huge pages in the reserve map
+ * that intersect with the range [f, t).
+ */
  static long region_count(struct resv_map *resv, long f, long t)
  {
         struct list_head *head = &resv->regions;
@@ -482,22 +698,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
  struct resv_map *resv_map_alloc(void)
  {
         struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
-       if (!resv_map)
+       struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
+
+       if (!resv_map || !rg) {
+               kfree(resv_map);
+               kfree(rg);
                 return NULL;
+       }
  
         kref_init(&resv_map->refs);
         spin_lock_init(&resv_map->lock);
         INIT_LIST_HEAD(&resv_map->regions);
  
+       resv_map->adds_in_progress = 0;
+
+       INIT_LIST_HEAD(&resv_map->region_cache);
+       list_add(&rg->link, &resv_map->region_cache);
+       resv_map->region_cache_count = 1;
+
         return resv_map;
  }
  
  void resv_map_release(struct kref *ref)
  {
         struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+       struct list_head *head = &resv_map->region_cache;
+       struct file_region *rg, *trg;
  
         /* Clear out any active regions before we release the map. */
-       region_truncate(resv_map, 0);
+       region_del(resv_map, 0, LONG_MAX);
+
+       /* ... and any entries left in the cache */
+       list_for_each_entry_safe(rg, trg, head, link) {
+               list_del(&rg->link);
+               kfree(rg);
+       }
+
+       VM_BUG_ON(resv_map->adds_in_progress);
+
         kfree(resv_map);
  }
  
@@ -554,7 +792,7 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
  }
  
  /* Returns true if the VMA has associated reserve pages */
-static int vma_has_reserves(struct vm_area_struct *vma, long chg)
+static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
  {
         if (vma->vm_flags & VM_NORESERVE) {
                 /*
@@ -567,23 +805,34 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg)
                  * properly, so add work-around here.
                  */
                 if (vma->vm_flags & VM_MAYSHARE && chg == 0)
-                       return 1;
+                       return true;
                 else
-                       return 0;
+                       return false;
         }
  
         /* Shared mappings always use reserves */
-       if (vma->vm_flags & VM_MAYSHARE)
-               return 1;
+       if (vma->vm_flags & VM_MAYSHARE) {
+               /*
+                * We know VM_NORESERVE is not set.  Therefore, there SHOULD
+                * be a region map for all pages.  The only situation where
+                * there is no region map is if a hole was punched via
+                * fallocate.  In this case, there really are no reserves to
+                * use.  This situation is indicated if chg != 0.
+                */
+               if (chg)
+                       return false;
+               else
+                       return true;
+       }
  
         /*
          * Only the process that called mmap() has reserves for
          * private mappings.
          */
         if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
-               return 1;
+               return true;
  
-       return 0;
+       return false;
  }
  
  static void enqueue_huge_page(struct hstate *h, struct page *page)
@@ -755,23 +1004,22 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
  
  #if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
  static void destroy_compound_gigantic_page(struct page *page,
-                                       unsigned long order)
+                                       unsigned int order)
  {
         int i;
         int nr_pages = 1 << order;
         struct page *p = page + 1;
  
         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
-               __ClearPageTail(p);
+               clear_compound_head(p);
                 set_page_refcounted(p);
-               p->first_page = NULL;
         }
  
         set_compound_order(page, 0);
         __ClearPageHead(page);
  }
  
-static void free_gigantic_page(struct page *page, unsigned order)
+static void free_gigantic_page(struct page *page, unsigned int order)
  {
         free_contig_range(page_to_pfn(page), 1 << order);
  }
@@ -815,7 +1063,7 @@ static bool zone_spans_last_pfn(const struct zone *zone,
         return zone_spans_pfn(zone, last_pfn);
  }
  
-static struct page *alloc_gigantic_page(int nid, unsigned order)
+static struct page *alloc_gigantic_page(int nid, unsigned int order)
  {
         unsigned long nr_pages = 1 << order;
         unsigned long ret, pfn, flags;
@@ -851,7 +1099,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned order)
  }
  
  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned long order);
+static void prep_compound_gigantic_page(struct page *page, unsigned int order);
  
  static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid)
  {
@@ -884,9 +1132,9 @@ static int alloc_fresh_gigantic_page(struct hstate *h,
  static inline bool gigantic_page_supported(void) { return true; }
  #else
  static inline bool gigantic_page_supported(void) { return false; }
-static inline void free_gigantic_page(struct page *page, unsigned order) { }
+static inline void free_gigantic_page(struct page *page, unsigned int order) { }
  static inline void destroy_compound_gigantic_page(struct page *page,
-                                               unsigned long order) { }
+                                               unsigned int order) { }
  static inline int alloc_fresh_gigantic_page(struct hstate *h,
                                         nodemask_t *nodes_allowed) { return 0; }
  #endif
@@ -907,13 +1155,12 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                                 1 << PG_writeback);
         }
         VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
-       set_compound_page_dtor(page, NULL);
+       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
         set_page_refcounted(page);
         if (hstate_is_gigantic(h)) {
                 destroy_compound_gigantic_page(page, huge_page_order(h));
                 free_gigantic_page(page, huge_page_order(h));
         } else {
-               arch_release_hugepage(page);
                 __free_pages(page, huge_page_order(h));
         }
  }
@@ -1004,7 +1251,7 @@ void free_huge_page(struct page *page)
  static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
  {
         INIT_LIST_HEAD(&page->lru);
-       set_compound_page_dtor(page, free_huge_page);
+       set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
         spin_lock(&hugetlb_lock);
         set_hugetlb_cgroup(page, NULL);
         h->nr_huge_pages++;
@@ -1013,7 +1260,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
         put_page(page); /* free it into the hugepage allocator */
  }
  
-static void prep_compound_gigantic_page(struct page *page, unsigned long order)
+static void prep_compound_gigantic_page(struct page *page, unsigned int order)
  {
         int i;
         int nr_pages = 1 << order;
@@ -1038,10 +1285,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
                  */
                 __ClearPageReserved(p);
                 set_page_count(p, 0);
-               p->first_page = page;
-               /* Make sure p->first_page is always valid for PageTail() */
-               smp_wmb();
-               __SetPageTail(p);
+               set_compound_head(p, page);
         }
  }
  
@@ -1056,7 +1300,7 @@ int PageHuge(struct page *page)
                 return 0;
  
         page = compound_head(page);
-       return get_compound_page_dtor(page) == free_huge_page;
+       return page[1].compound_dtor == HUGETLB_PAGE_DTOR;
  }
  EXPORT_SYMBOL_GPL(PageHuge);
  
@@ -1093,15 +1337,11 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
  {
         struct page *page;
  
-       page = alloc_pages_exact_node(nid,
+       page = __alloc_pages_node(nid,
                 htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
                                                 __GFP_REPEAT|__GFP_NOWARN,
                 huge_page_order(h));
         if (page) {
-               if (arch_prepare_hugepage(page)) {
-                       __free_pages(page, huge_page_order(h));
-                       return NULL;
-               }
                 prep_new_huge_page(h, page, nid);
         }
  
@@ -1203,7 +1443,82 @@ void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
                 dissolve_free_huge_page(pfn_to_page(pfn));
  }
  
-static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
+/*
+ * There are 3 ways this can get called:
+ * 1. With vma+addr: we use the VMA's memory policy
+ * 2. With !vma, but nid=NUMA_NO_NODE:  We try to allocate a huge
+ *    page from any node, and let the buddy allocator itself figure
+ *    it out.
+ * 3. With !vma, but nid!=NUMA_NO_NODE.  We allocate a huge page
+ *    strictly from 'nid'
+ */
+static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h,
+               struct vm_area_struct *vma, unsigned long addr, int nid)
+{
+       int order = huge_page_order(h);
+       gfp_t gfp = htlb_alloc_mask(h)|__GFP_COMP|__GFP_REPEAT|__GFP_NOWARN;
+       unsigned int cpuset_mems_cookie;
+
+       /*
+        * We need a VMA to get a memory policy.  If we do not
+        * have one, we use the 'nid' argument.
+        *
+        * The mempolicy stuff below has some non-inlined bits
+        * and calls ->vm_ops.  That makes it hard to optimize at
+        * compile-time, even when NUMA is off and it does
+        * nothing.  This helps the compiler optimize it out.
+        */
+       if (!IS_ENABLED(CONFIG_NUMA) || !vma) {
+               /*
+                * If a specific node is requested, make sure to
+                * get memory from there, but only when a node
+                * is explicitly specified.
+                */
+               if (nid != NUMA_NO_NODE)
+                       gfp |= __GFP_THISNODE;
+               /*
+                * Make sure to call something that can handle
+                * nid=NUMA_NO_NODE
+                */
+               return alloc_pages_node(nid, gfp, order);
+       }
+
+       /*
+        * OK, so we have a VMA.  Fetch the mempolicy and try to
+        * allocate a huge page with it.  We will only reach this
+        * when CONFIG_NUMA=y.
+        */
+       do {
+               struct page *page;
+               struct mempolicy *mpol;
+               struct zonelist *zl;
+               nodemask_t *nodemask;
+
+               cpuset_mems_cookie = read_mems_allowed_begin();
+               zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
+               mpol_cond_put(mpol);
+               page = __alloc_pages_nodemask(gfp, order, zl, nodemask);
+               if (page)
+                       return page;
+       } while (read_mems_allowed_retry(cpuset_mems_cookie));
+
+       return NULL;
+}
+
+/*
+ * There are two ways to allocate a huge page:
+ * 1. When you have a VMA and an address (like a fault)
+ * 2. When you have no VMA (like when setting /proc/.../nr_hugepages)
+ *
+ * 'vma' and 'addr' are only for (1).  'nid' is always NUMA_NO_NODE in
+ * this case which signifies that the allocation should be done with
+ * respect for the VMA's memory policy.
+ *
+ * For (2), we ignore 'vma' and 'addr' and use 'nid' exclusively. This
+ * implies that memory policies will not be taken into account.
+ */
+static struct page *__alloc_buddy_huge_page(struct hstate *h,
+               struct vm_area_struct *vma, unsigned long addr, int nid)
  {
         struct page *page;
         unsigned int r_nid;
@@ -1211,6 +1526,15 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
         if (hstate_is_gigantic(h))
                 return NULL;
  
+       /*
+        * Make sure that anyone specifying 'nid' is not also specifying a VMA.
+        * This makes sure the caller is picking _one_ of the modes with which
+        * we can call this function, not both.
+        */
+       if (vma || (addr != -1)) {
+               VM_WARN_ON_ONCE(addr == -1);
+               VM_WARN_ON_ONCE(nid != NUMA_NO_NODE);
+       }
         /*
          * Assume we will successfully allocate the surplus page to
          * prevent racing processes from causing the surplus to exceed
@@ -1244,25 +1568,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
         }
         spin_unlock(&hugetlb_lock);
  
-       if (nid == NUMA_NO_NODE)
-               page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP|
-                                  __GFP_REPEAT|__GFP_NOWARN,
-                                  huge_page_order(h));
-       else
-               page = alloc_pages_exact_node(nid,
-                       htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
-                       __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
-
-       if (page && arch_prepare_hugepage(page)) {
-               __free_pages(page, huge_page_order(h));
-               page = NULL;
-       }
+       page = __hugetlb_alloc_buddy_huge_page(h, vma, addr, nid);
  
         spin_lock(&hugetlb_lock);
         if (page) {
                 INIT_LIST_HEAD(&page->lru);
                 r_nid = page_to_nid(page);
-               set_compound_page_dtor(page, free_huge_page);
+               set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
                 set_hugetlb_cgroup(page, NULL);
                 /*
                  * We incremented the global counters already
@@ -1280,6 +1592,29 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
         return page;
  }
  
+/*
+ * Allocate a huge page from 'nid'.  Note, 'nid' may be
+ * NUMA_NO_NODE, which means that it may be allocated
+ * anywhere.
+ */
+static
+struct page *__alloc_buddy_huge_page_no_mpol(struct hstate *h, int nid)
+{
+       unsigned long addr = -1;
+
+       return __alloc_buddy_huge_page(h, NULL, addr, nid);
+}
+
+/*
+ * Use the VMA's mpolicy to allocate a huge page from the buddy.
+ */
+static
+struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
+               struct vm_area_struct *vma, unsigned long addr)
+{
+       return __alloc_buddy_huge_page(h, vma, addr, NUMA_NO_NODE);
+}
+
  /*
   * This allocation function is useful in the context where vma is irrelevant.
   * E.g. soft-offlining uses this function because it only cares physical
@@ -1295,7 +1630,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
         spin_unlock(&hugetlb_lock);
  
         if (!page)
-               page = alloc_buddy_huge_page(h, nid);
+               page = __alloc_buddy_huge_page_no_mpol(h, nid);
  
         return page;
  }
@@ -1325,7 +1660,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
  retry:
         spin_unlock(&hugetlb_lock);
         for (i = 0; i < needed; i++) {
-               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+               page = __alloc_buddy_huge_page_no_mpol(h, NUMA_NO_NODE);
                 if (!page) {
                         alloc_ok = false;
                         break;
@@ -1421,87 +1756,150 @@ static void return_unused_surplus_pages(struct hstate *h,
         }
  }
  
+
  /*
- * Determine if the huge page at addr within the vma has an associated
- * reservation.  Where it does not we will need to logically increase
- * reservation and actually increase subpool usage before an allocation
- * can occur.  Where any new reservation would be required the
- * reservation change is prepared, but not committed.  Once the page
- * has been allocated from the subpool and instantiated the change should
- * be committed via vma_commit_reservation.  No action is required on
- * failure.
+ * vma_needs_reservation, vma_commit_reservation and vma_end_reservation
+ * are used by the huge page allocation routines to manage reservations.
+ *
+ * vma_needs_reservation is called to determine if the huge page at addr
+ * within the vma has an associated reservation.  If a reservation is
+ * needed, the value 1 is returned.  The caller is then responsible for
+ * managing the global reservation and subpool usage counts.  After
+ * the huge page has been allocated, vma_commit_reservation is called
+ * to add the page to the reservation map.  If the page allocation fails,
+ * the reservation must be ended instead of committed.  vma_end_reservation
+ * is called in such cases.
+ *
+ * In the normal case, vma_commit_reservation returns the same value
+ * as the preceding vma_needs_reservation call.  The only time this
+ * is not the case is if a reserve map was changed between calls.  It
+ * is the responsibility of the caller to notice the difference and
+ * take appropriate action.
   */
-static long vma_needs_reservation(struct hstate *h,
-                       struct vm_area_struct *vma, unsigned long addr)
+enum vma_resv_mode {
+       VMA_NEEDS_RESV,
+       VMA_COMMIT_RESV,
+       VMA_END_RESV,
+};
+static long __vma_reservation_common(struct hstate *h,
+                               struct vm_area_struct *vma, unsigned long addr,
+                               enum vma_resv_mode mode)
  {
         struct resv_map *resv;
         pgoff_t idx;
-       long chg;
+       long ret;
  
         resv = vma_resv_map(vma);
         if (!resv)
                 return 1;
  
         idx = vma_hugecache_offset(h, vma, addr);
-       chg = region_chg(resv, idx, idx + 1);
+       switch (mode) {
+       case VMA_NEEDS_RESV:
+               ret = region_chg(resv, idx, idx + 1);
+               break;
+       case VMA_COMMIT_RESV:
+               ret = region_add(resv, idx, idx + 1);
+               break;
+       case VMA_END_RESV:
+               region_abort(resv, idx, idx + 1);
+               ret = 0;
+               break;
+       default:
+               BUG();
+       }
  
         if (vma->vm_flags & VM_MAYSHARE)
-               return chg;
+               return ret;
         else
-               return chg < 0 ? chg : 0;
+               return ret < 0 ? ret : 0;
  }
-static void vma_commit_reservation(struct hstate *h,
+
+static long vma_needs_reservation(struct hstate *h,
                         struct vm_area_struct *vma, unsigned long addr)
  {
-       struct resv_map *resv;
-       pgoff_t idx;
+       return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
+}
  
-       resv = vma_resv_map(vma);
-       if (!resv)
-               return;
+static long vma_commit_reservation(struct hstate *h,
+                       struct vm_area_struct *vma, unsigned long addr)
+{
+       return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
+}
  
-       idx = vma_hugecache_offset(h, vma, addr);
-       region_add(resv, idx, idx + 1);
+static void vma_end_reservation(struct hstate *h,
+                       struct vm_area_struct *vma, unsigned long addr)
+{
+       (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
  }
  
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
+struct page *alloc_huge_page(struct vm_area_struct *vma,
                                     unsigned long addr, int avoid_reserve)
  {
         struct hugepage_subpool *spool = subpool_vma(vma);
         struct hstate *h = hstate_vma(vma);
         struct page *page;
-       long chg;
+       long map_chg, map_commit;
+       long gbl_chg;
         int ret, idx;
         struct hugetlb_cgroup *h_cg;
  
         idx = hstate_index(h);
         /*
-        * Processes that did not create the mapping will have no
-        * reserves and will not have accounted against subpool
-        * limit. Check that the subpool limit can be made before
-        * satisfying the allocation MAP_NORESERVE mappings may also
-        * need pages and subpool limit allocated allocated if no reserve
-        * mapping overlaps.
+        * Examine the region/reserve map to determine if the process
+        * has a reservation for the page to be allocated.  A return
+        * code of zero indicates a reservation exists (no change).
          */
-       chg = vma_needs_reservation(h, vma, addr);
-       if (chg < 0)
+       map_chg = gbl_chg = vma_needs_reservation(h, vma, addr);
+       if (map_chg < 0)
                 return ERR_PTR(-ENOMEM);
-       if (chg || avoid_reserve)
-               if (hugepage_subpool_get_pages(spool, 1) < 0)
+
+       /*
+        * Processes that did not create the mapping will have no
+        * reserves as indicated by the region/reserve map. Check
+        * that the allocation will not exceed the subpool limit.
+        * Allocations for MAP_NORESERVE mappings also need to be
+        * checked against any subpool limit.
+        */
+       if (map_chg || avoid_reserve) {
+               gbl_chg = hugepage_subpool_get_pages(spool, 1);
+               if (gbl_chg < 0) {
+                       vma_end_reservation(h, vma, addr);
                         return ERR_PTR(-ENOSPC);
+               }
+
+               /*
+                * Even though there was no reservation in the region/reserve
+                * map, there could be reservations associated with the
+                * subpool that can be used.  This would be indicated if the
+                * return value of hugepage_subpool_get_pages() is zero.
+                * However, if avoid_reserve is specified we still avoid even
+                * the subpool reservations.
+                */
+               if (avoid_reserve)
+                       gbl_chg = 1;
+       }
  
         ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
         if (ret)
                 goto out_subpool_put;
  
         spin_lock(&hugetlb_lock);
-       page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg);
+       /*
+        * gbl_chg is passed to indicate whether or not a page must be taken
+        * from the global free pool (global change).  gbl_chg == 0 indicates
+        * a reservation exists for the allocation.
+        */
+       page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
         if (!page) {
                 spin_unlock(&hugetlb_lock);
-               page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
+               page = __alloc_buddy_huge_page_with_mpol(h, vma, addr);
                 if (!page)
                         goto out_uncharge_cgroup;
-
+               if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
+                       SetPagePrivate(page);
+                       h->resv_huge_pages--;
+               }
                 spin_lock(&hugetlb_lock);
                 list_move(&page->lru, &h->hugepage_activelist);
                 /* Fall through */
@@ -1511,14 +1909,30 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
  
         set_page_private(page, (unsigned long)spool);
  
-       vma_commit_reservation(h, vma, addr);
+       map_commit = vma_commit_reservation(h, vma, addr);
+       if (unlikely(map_chg > map_commit)) {
+               /*
+                * The page was added to the reservation map between
+                * vma_needs_reservation and vma_commit_reservation.
+                * This indicates a race with hugetlb_reserve_pages.
+                * Adjust for the subpool count incremented above AND
+                * in hugetlb_reserve_pages for the same page.  Also,
+                * the reservation count added in hugetlb_reserve_pages
+                * no longer applies.
+                */
+               long rsv_adjust;
+
+               rsv_adjust = hugepage_subpool_put_pages(spool, 1);
+               hugetlb_acct_memory(h, -rsv_adjust);
+       }
         return page;
  
  out_uncharge_cgroup:
         hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
  out_subpool_put:
-       if (chg || avoid_reserve)
+       if (map_chg || avoid_reserve)
                 hugepage_subpool_put_pages(spool, 1);
+       vma_end_reservation(h, vma, addr);
         return ERR_PTR(-ENOSPC);
  }
  
@@ -1567,7 +1981,8 @@ found:
         return 1;
  }
  
-static void __init prep_compound_huge_page(struct page *page, int order)
+static void __init prep_compound_huge_page(struct page *page,
+               unsigned int order)
  {
         if (unlikely(order > (MAX_ORDER - 1)))
                 prep_compound_gigantic_page(page, order);
@@ -1736,7 +2151,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
          * First take pages out of surplus state.  Then make up the
          * remaining difference by allocating fresh huge pages.
          *
-        * We might race with alloc_buddy_huge_page() here and be unable
+        * We might race with __alloc_buddy_huge_page() here and be unable
          * to convert a surplus huge page to a normal huge page. That is
          * not critical, though, it just means the overall size of the
          * pool might be one hugepage larger than it needs to be, but
@@ -1778,7 +2193,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
          * By placing pages into the surplus state independent of the
          * overcommit value, we are allowing the surplus pool size to
          * exceed overcommit. There are few sane options here. Since
-        * alloc_buddy_huge_page() is checking the global counter,
+        * __alloc_buddy_huge_page() is checking the global counter,
          * though, we'll note that we're not allowed to exceed surplus
          * and won't grow the pool anywhere else. Not until one of the
          * sysctls are changed, or the surplus pages go out of use.
@@ -2071,7 +2486,7 @@ struct node_hstate {
         struct kobject          *hugepages_kobj;
         struct kobject          *hstate_kobjs[HUGE_MAX_HSTATE];
  };
-struct node_hstate node_hstates[MAX_NUMNODES];
+static struct node_hstate node_hstates[MAX_NUMNODES];
  
  /*
   * A subset of global hstate attributes for node devices
@@ -2234,7 +2649,7 @@ static void __exit hugetlb_exit(void)
         }
  
         kobject_put(hugepages_kobj);
-       kfree(htlb_fault_mutex_table);
+       kfree(hugetlb_fault_mutex_table);
  }
  module_exit(hugetlb_exit);
  
@@ -2267,18 +2682,18 @@ static int __init hugetlb_init(void)
  #else
         num_fault_mutexes = 1;
  #endif
-       htlb_fault_mutex_table =
+       hugetlb_fault_mutex_table =
                 kmalloc(sizeof(struct mutex) * num_fault_mutexes, GFP_KERNEL);
-       BUG_ON(!htlb_fault_mutex_table);
+       BUG_ON(!hugetlb_fault_mutex_table);
  
         for (i = 0; i < num_fault_mutexes; i++)
-               mutex_init(&htlb_fault_mutex_table[i]);
+               mutex_init(&hugetlb_fault_mutex_table[i]);
         return 0;
  }
  module_init(hugetlb_init);
  
  /* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_add_hstate(unsigned order)
+void __init hugetlb_add_hstate(unsigned int order)
  {
         struct hstate *h;
         unsigned long i;
@@ -2485,6 +2900,12 @@ void hugetlb_show_meminfo(void)
                                 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
  }
  
+void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
+{
+       seq_printf(m, "HugetlbPages:\t%8lu kB\n",
+                  atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10));
+}
+
  /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
  unsigned long hugetlb_total_pages(void)
  {
@@ -2720,6 +3141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                         get_page(ptepage);
                         page_dup_rmap(ptepage);
                         set_huge_pte_at(dst, addr, dst_pte, entry);
+                       hugetlb_count_add(pages_per_huge_page(h), dst);
                 }
                 spin_unlock(src_ptl);
                 spin_unlock(dst_ptl);
@@ -2800,6 +3222,7 @@ again:
                 if (huge_pte_dirty(pte))
                         set_page_dirty(page);
  
+               hugetlb_count_sub(pages_per_huge_page(h), mm);
                 page_remove_rmap(page);
                 force_flush = !__tlb_remove_page(tlb, page);
                 if (force_flush) {
@@ -2896,6 +3319,14 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                 if (iter_vma == vma)
                         continue;
  
+               /*
+                * Shared VMAs have their own reserves and do not affect
+                * MAP_PRIVATE accounting but it is possible that a shared
+                * VMA is using the same page so check and skip such VMAs.
+                */
+               if (iter_vma->vm_flags & VM_MAYSHARE)
+                       continue;
+
                 /*
                  * Unmap the page from other VMAs without their own reserves.
                  * They get marked to be SIGKILLed if they fault in these
@@ -3070,6 +3501,23 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
         return page != NULL;
  }
  
+int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+                          pgoff_t idx)
+{
+       struct inode *inode = mapping->host;
+       struct hstate *h = hstate_inode(inode);
+       int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+
+       if (err)
+               return err;
+       ClearPagePrivate(page);
+
+       spin_lock(&inode->i_lock);
+       inode->i_blocks += blocks_per_huge_page(h);
+       spin_unlock(&inode->i_lock);
+       return 0;
+}
+
  static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                            struct address_space *mapping, pgoff_t idx,
                            unsigned long address, pte_t *ptep, unsigned int flags)
@@ -3117,21 +3565,13 @@ retry:
                 set_page_huge_active(page);
  
                 if (vma->vm_flags & VM_MAYSHARE) {
-                       int err;
-                       struct inode *inode = mapping->host;
-
-                       err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+                       int err = huge_add_to_page_cache(page, mapping, idx);
                         if (err) {
                                 put_page(page);
                                 if (err == -EEXIST)
                                         goto retry;
                                 goto out;
                         }
-                       ClearPagePrivate(page);
-
-                       spin_lock(&inode->i_lock);
-                       inode->i_blocks += blocks_per_huge_page(h);
-                       spin_unlock(&inode->i_lock);
                 } else {
                         lock_page(page);
                         if (unlikely(anon_vma_prepare(vma))) {
@@ -3159,11 +3599,14 @@ retry:
          * any allocations necessary to record that reservation occur outside
          * the spinlock.
          */
-       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+       if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                 if (vma_needs_reservation(h, vma, address) < 0) {
                         ret = VM_FAULT_OOM;
                         goto backout_unlocked;
                 }
+               /* Just decrements count, does not deallocate */
+               vma_end_reservation(h, vma, address);
+       }
  
         ptl = huge_pte_lockptr(h, mm, ptep);
         spin_lock(ptl);
@@ -3184,6 +3627,7 @@ retry:
                                 && (vma->vm_flags & VM_SHARED)));
         set_huge_pte_at(mm, address, ptep, new_pte);
  
+       hugetlb_count_add(pages_per_huge_page(h), mm);
         if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                 /* Optimization, do the COW without a second fault */
                 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
@@ -3203,7 +3647,7 @@ backout_unlocked:
  }
  
  #ifdef CONFIG_SMP
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
                             struct vm_area_struct *vma,
                             struct address_space *mapping,
                             pgoff_t idx, unsigned long address)
@@ -3228,7 +3672,7 @@ static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
   * For uniprocesor systems we always use a single mutex, so just
   * return 0 and avoid the hashing overhead.
   */
-static u32 fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
                             struct vm_area_struct *vma,
                             struct address_space *mapping,
                             pgoff_t idx, unsigned long address)
@@ -3262,12 +3706,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
                         return VM_FAULT_HWPOISON_LARGE |
                                 VM_FAULT_SET_HINDEX(hstate_index(h));
+       } else {
+               ptep = huge_pte_alloc(mm, address, huge_page_size(h));
+               if (!ptep)
+                       return VM_FAULT_OOM;
         }
  
-       ptep = huge_pte_alloc(mm, address, huge_page_size(h));
-       if (!ptep)
-               return VM_FAULT_OOM;
-
         mapping = vma->vm_file->f_mapping;
         idx = vma_hugecache_offset(h, vma, address);
  
@@ -3276,8 +3720,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
          * get spurious allocation failures if two CPUs race to instantiate
          * the same page in the page cache.
          */
-       hash = fault_mutex_hash(h, mm, vma, mapping, idx, address);
-       mutex_lock(&htlb_fault_mutex_table[hash]);
+       hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address);
+       mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
         entry = huge_ptep_get(ptep);
         if (huge_pte_none(entry)) {
@@ -3310,6 +3754,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         ret = VM_FAULT_OOM;
                         goto out_mutex;
                 }
+               /* Just decrements count, does not deallocate */
+               vma_end_reservation(h, vma, address);
  
                 if (!(vma->vm_flags & VM_MAYSHARE))
                         pagecache_page = hugetlbfs_pagecache_page(h,
@@ -3360,7 +3806,7 @@ out_ptl:
                 put_page(pagecache_page);
         }
  out_mutex:
-       mutex_unlock(&htlb_fault_mutex_table[hash]);
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
         /*
          * Generally it's safe to hold refcount during waiting page lock. But
          * here we just wait to defer the next page fault to avoid busy loop and
@@ -3629,16 +4075,35 @@ int hugetlb_reserve_pages(struct inode *inode,
          * consumed reservations are stored in the map. Hence, nothing
          * else has to be done for private mappings here
          */
-       if (!vma || vma->vm_flags & VM_MAYSHARE)
-               region_add(resv_map, from, to);
+       if (!vma || vma->vm_flags & VM_MAYSHARE) {
+               long add = region_add(resv_map, from, to);
+
+               if (unlikely(chg > add)) {
+                       /*
+                        * Pages in this range were added to the reserve
+                        * map between region_chg and region_add.  This
+                        * indicates a race with alloc_huge_page.  Adjust
+                        * the subpool and reserve counts modified above
+                        * based on the difference.
+                        */
+                       long rsv_adjust;
+
+                       rsv_adjust = hugepage_subpool_put_pages(spool,
+                                                               chg - add);
+                       hugetlb_acct_memory(h, -rsv_adjust);
+               }
+       }
         return 0;
  out_err:
+       if (!vma || vma->vm_flags & VM_MAYSHARE)
+               region_abort(resv_map, from, to);
         if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                 kref_put(&resv_map->refs, resv_map_release);
         return ret;
  }
  
-void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
+                                                               long freed)
  {
         struct hstate *h = hstate_inode(inode);
         struct resv_map *resv_map = inode_resv_map(inode);
@@ -3646,8 +4111,17 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
         struct hugepage_subpool *spool = subpool_inode(inode);
         long gbl_reserve;
  
-       if (resv_map)
-               chg = region_truncate(resv_map, offset);
+       if (resv_map) {
+               chg = region_del(resv_map, start, end);
+               /*
+                * region_del() can fail in the rare case where a region
+                * must be split and another region descriptor can not be
+                * allocated.  If end == LONG_MAX, it will not fail.
+                */
+               if (chg < 0)
+                       return chg;
+       }
+
         spin_lock(&inode->i_lock);
         inode->i_blocks -= (blocks_per_huge_page(h) * freed);
         spin_unlock(&inode->i_lock);
@@ -3658,6 +4132,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
          */
         gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
         hugetlb_acct_memory(h, -gbl_reserve);
+
+       return 0;
  }
  
  #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3671,8 +4147,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
         unsigned long s_end = sbase + PUD_SIZE;
  
         /* Allow segments to share if only one is marked locked */
-       unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
-       unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+       unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+       unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
  
         /*
          * match the virtual addresses, permission and the alignment of the
@@ -3686,7 +4162,7 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
         return saddr;
  }
  
-static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
  {
         unsigned long base = addr & PUD_MASK;
         unsigned long end = base + PUD_SIZE;
@@ -3696,8 +4172,8 @@ static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
          */
         if (vma->vm_flags & VM_MAYSHARE &&
             vma->vm_start <= base && end <= vma->vm_end)
-               return 1;
-       return 0;
+               return true;
+       return false;
  }
  
  /*
@@ -3792,6 +4268,11 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
  {
         return NULL;
  }
+
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
  #define want_pmd_share()       (0)
  #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */