[fuel-plugin] Rename node_reboot task

[kvmfornfv.git] / kernel / mm / memory.c
diff --git a/kernel/mm/memory.c b/kernel/mm/memory.c

index 17734c3..76dcee3 100644 (file)
--- a/kernel/mm/memory.c
+++ b/kernel/mm/memory.c
@@ -61,6 +61,7 @@
  #include <linux/string.h>
  #include <linux/dma-debug.h>
  #include <linux/debugfs.h>
+#include <linux/userfaultfd_k.h>
  
  #include <asm/io.h>
  #include <asm/pgalloc.h>
@@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task)
  
  #ifdef HAVE_GENERIC_MMU_GATHER
  
-static int tlb_next_batch(struct mmu_gather *tlb)
+static bool tlb_next_batch(struct mmu_gather *tlb)
  {
         struct mmu_gather_batch *batch;
  
         batch = tlb->active;
         if (batch->next) {
                 tlb->active = batch->next;
-               return 1;
+               return true;
         }
  
         if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
-               return 0;
+               return false;
  
         batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
         if (!batch)
-               return 0;
+               return false;
  
         tlb->batch_count++;
         batch->next = NULL;
@@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb)
         tlb->active->next = batch;
         tlb->active = batch;
  
-       return 1;
+       return true;
  }
  
  /* tlb_gather_mmu
@@ -796,6 +797,46 @@ out:
         return pfn_to_page(pfn);
  }
  
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+                               pmd_t pmd)
+{      /* Returns the struct page for the pfn held in @pmd, or NULL when one of the checks below rejects it. */
+       unsigned long pfn = pmd_pfn(pmd);
+
+       /*
+        * There is no pmd_special() but there may be special pmds, e.g.
+        * in a direct-access (dax) mapping, so let's just replicate the
+        * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+        */
+       if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+               if (vma->vm_flags & VM_MIXEDMAP) {
+                       if (!pfn_valid(pfn))
+                               return NULL;
+                       goto out;       /* valid pfn: bypasses the zero-pfn and highest_memmap_pfn checks */
+               } else {
+                       unsigned long off;
+                       off = (addr - vma->vm_start) >> PAGE_SHIFT;     /* page index of @addr within the vma */
+                       if (pfn == vma->vm_pgoff + off) /* pfn sits at its linear position from vm_pgoff */
+                               return NULL;
+                       if (!is_cow_mapping(vma->vm_flags))
+                               return NULL;
+               }
+       }
+
+       if (is_zero_pfn(pfn))
+               return NULL;
+       if (unlikely(pfn > highest_memmap_pfn))
+               return NULL;
+
+       /*
+        * NOTE! We still have PageReserved() pages in the page tables.
+        * eg. VDSO mappings can cause them to exist.
+        */
+out:
+       return pfn_to_page(pfn);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
  /*
   * copy one vm_area from one task to the other. Assumes the page tables
   * already present in the new task to be cleared in the whole range
@@ -2081,11 +2122,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                         goto oom;
                 cow_user_page(new_page, old_page, address, vma);
         }
-       __SetPageUptodate(new_page);
  
         if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
                 goto oom_free_new;
  
+       __SetPageUptodate(new_page);
+
         mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
  
         /*
@@ -2669,6 +2711,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
  
         pte_unmap(page_table);
  
+       /* File mapping without ->vm_ops ? */
+       if (vma->vm_flags & VM_SHARED)
+               return VM_FAULT_SIGBUS;
+
         /* Check if we need to add a guard page to the stack */
         if (check_stack_guard_page(vma, address) < 0)
                 return VM_FAULT_SIGSEGV;
@@ -2680,6 +2726,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
                 if (!pte_none(*page_table))
                         goto unlock;
+               /* Deliver the page fault to userland, check inside PT lock */
+               if (userfaultfd_missing(vma)) {
+                       pte_unmap_unlock(page_table, ptl);
+                       return handle_userfault(vma, address, flags,
+                                               VM_UFFD_MISSING);
+               }
                 goto setpte;
         }
  
@@ -2689,6 +2741,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         page = alloc_zeroed_user_highpage_movable(vma, address);
         if (!page)
                 goto oom;
+
+       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+               goto oom_free_page;
+
         /*
          * The memory barrier inside __SetPageUptodate makes sure that
          * preceeding stores to the page contents become visible before
@@ -2696,9 +2752,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
          */
         __SetPageUptodate(page);
  
-       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
-               goto oom_free_page;
-
         entry = mk_pte(page, vma->vm_page_prot);
         if (vma->vm_flags & VM_WRITE)
                 entry = pte_mkwrite(pte_mkdirty(entry));
@@ -2707,6 +2760,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         if (!pte_none(*page_table))
                 goto release;
  
+       /* Deliver the page fault to userland, check inside PT lock */
+       if (userfaultfd_missing(vma)) {
+               pte_unmap_unlock(page_table, ptl);
+               mem_cgroup_cancel_charge(page, memcg);
+               page_cache_release(page);
+               return handle_userfault(vma, address, flags,
+                                       VM_UFFD_MISSING);
+       }
+
         inc_mm_counter_fast(mm, MM_ANONPAGES);
         page_add_new_anon_rmap(page, vma, address);
         mem_cgroup_commit_charge(page, memcg, false);
@@ -3097,6 +3159,9 @@ static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
  
         pte_unmap(page_table);
+       /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
+       if (!vma->vm_ops->fault)
+               return VM_FAULT_SIGBUS;
         if (!(flags & FAULT_FLAG_WRITE))
                 return do_read_fault(mm, vma, address, pmd, pgoff, flags,
                                 orig_pte);
@@ -3207,6 +3272,27 @@ out:
         return 0;
  }
  
+static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pmd_t *pmd, unsigned int flags)
+{      /* Dispatches a huge-pmd fault to the handler matching the VMA type; returns that handler's result. */
+       if (vma_is_anonymous(vma))
+               return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
+       if (vma->vm_ops->pmd_fault)     /* NOTE(review): vm_ops is dereferenced unchecked; presumably non-NULL for non-anonymous VMAs - confirm */
+               return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+       return VM_FAULT_FALLBACK;       /* no ->pmd_fault hook: report fallback to the caller */
+}
+
+static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
+                       unsigned int flags)
+{      /* Dispatches a huge-pmd write-protect fault to the handler matching the VMA type; returns that handler's result. */
+       if (vma_is_anonymous(vma))
+               return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);    /* @flags is not passed on this path */
+       if (vma->vm_ops->pmd_fault)     /* NOTE(review): vm_ops is dereferenced unchecked; presumably non-NULL for non-anonymous VMAs - confirm */
+               return vma->vm_ops->pmd_fault(vma, address, pmd, flags);        /* @orig_pmd is not passed on this path */
+       return VM_FAULT_FALLBACK;       /* no ->pmd_fault hook: report fallback to the caller */
+}
+
  /*
   * These routines also need to handle stuff like marking pages dirty
   * and/or accessed for architectures that don't do it in hardware (most
@@ -3242,13 +3328,12 @@ static int handle_pte_fault(struct mm_struct *mm,
         barrier();
         if (!pte_present(entry)) {
                 if (pte_none(entry)) {
-                       if (vma->vm_ops) {
-                               if (likely(vma->vm_ops->fault))
-                                       return do_fault(mm, vma, address, pte,
-                                                       pmd, flags, entry);
-                       }
-                       return do_anonymous_page(mm, vma, address,
-                                                pte, pmd, flags);
+                       if (vma_is_anonymous(vma))
+                               return do_anonymous_page(mm, vma, address,
+                                                        pte, pmd, flags);
+                       else
+                               return do_fault(mm, vma, address, pte, pmd,
+                                               flags, entry);
                 }
                 return do_swap_page(mm, vma, address,
                                         pte, pmd, flags, entry);
@@ -3310,10 +3395,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         if (!pmd)
                 return VM_FAULT_OOM;
         if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
-               int ret = VM_FAULT_FALLBACK;
-               if (!vma->vm_ops)
-                       ret = do_huge_pmd_anonymous_page(mm, vma, address,
-                                       pmd, flags);
+               int ret = create_huge_pmd(mm, vma, address, pmd, flags);
                 if (!(ret & VM_FAULT_FALLBACK))
                         return ret;
         } else {
@@ -3337,8 +3419,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                                              orig_pmd, pmd);
  
                         if (dirty && !pmd_write(orig_pmd)) {
-                               ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
-                                                         orig_pmd);
+                               ret = wp_huge_pmd(mm, vma, address, pmd,
+                                                       orig_pmd, flags);
                                 if (!(ret & VM_FAULT_FALLBACK))
                                         return ret;
                         } else {
@@ -3357,8 +3439,18 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         if (unlikely(pmd_none(*pmd)) &&
             unlikely(__pte_alloc(mm, vma, pmd, address)))
                 return VM_FAULT_OOM;
-       /* if an huge pmd materialized from under us just retry later */
-       if (unlikely(pmd_trans_huge(*pmd)))
+       /*
+        * If a huge pmd materialized under us just retry later.  Use
+        * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+        * didn't become pmd_trans_huge under us and then back to pmd_none, as
+        * a result of MADV_DONTNEED running immediately after a huge pmd fault
+        * in a different thread of this mm, in turn leading to a misleading
+        * pmd_trans_huge() retval.  All we have to ensure is that it is a
+        * regular pmd that we can walk with pte_offset_map() and we can do that
+        * through an atomic read in C, which is what pmd_trans_unstable()
+        * provides.
+        */
+       if (unlikely(pmd_trans_unstable(pmd)))
                 return 0;
         /*
          * A regular pmd is established and it can't morph into a huge pmd
@@ -3724,7 +3816,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
                 if (buf) {
                         char *p;
  
-                       p = d_path(&f->f_path, buf, PAGE_SIZE);
+                       p = file_path(f, buf, PAGE_SIZE);
                         if (IS_ERR(p))
                                 p = "?";
                         printk("%s%s[%lx+%lx]", prefix, kbasename(p),