These changes are the raw update to linux-4.4.6-rt14. Kernel sources [...]
[kvmfornfv.git] / kernel / mm / memory.c
index 3fc6efd..b80bf47 100644 (file)
@@ -61,6 +61,7 @@
 #include <linux/string.h>
 #include <linux/dma-debug.h>
 #include <linux/debugfs.h>
+#include <linux/userfaultfd_k.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -180,22 +181,22 @@ static void check_sync_rss_stat(struct task_struct *task)
 
 #ifdef HAVE_GENERIC_MMU_GATHER
 
-static int tlb_next_batch(struct mmu_gather *tlb)
+static bool tlb_next_batch(struct mmu_gather *tlb)
 {
        struct mmu_gather_batch *batch;
 
        batch = tlb->active;
        if (batch->next) {
                tlb->active = batch->next;
-               return 1;
+               return true;
        }
 
        if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
-               return 0;
+               return false;
 
        batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
        if (!batch)
-               return 0;
+               return false;
 
        tlb->batch_count++;
        batch->next = NULL;
@@ -205,7 +206,7 @@ static int tlb_next_batch(struct mmu_gather *tlb)
        tlb->active->next = batch;
        tlb->active = batch;
 
-       return 1;
+       return true;
 }
 
 /* tlb_gather_mmu
@@ -2081,11 +2082,12 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                        goto oom;
                cow_user_page(new_page, old_page, address, vma);
        }
-       __SetPageUptodate(new_page);
 
        if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
                goto oom_free_new;
 
+       __SetPageUptodate(new_page);
+
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 
        /*
@@ -2684,6 +2686,12 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
                if (!pte_none(*page_table))
                        goto unlock;
+               /* Deliver the page fault to userland, check inside PT lock */
+               if (userfaultfd_missing(vma)) {
+                       pte_unmap_unlock(page_table, ptl);
+                       return handle_userfault(vma, address, flags,
+                                               VM_UFFD_MISSING);
+               }
                goto setpte;
        }
 
@@ -2693,6 +2701,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        page = alloc_zeroed_user_highpage_movable(vma, address);
        if (!page)
                goto oom;
+
+       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+               goto oom_free_page;
+
        /*
         * The memory barrier inside __SetPageUptodate makes sure that
         * preceeding stores to the page contents become visible before
@@ -2700,9 +2712,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        __SetPageUptodate(page);
 
-       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
-               goto oom_free_page;
-
        entry = mk_pte(page, vma->vm_page_prot);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));
@@ -2711,6 +2720,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (!pte_none(*page_table))
                goto release;
 
+       /* Deliver the page fault to userland, check inside PT lock */
+       if (userfaultfd_missing(vma)) {
+               pte_unmap_unlock(page_table, ptl);
+               mem_cgroup_cancel_charge(page, memcg);
+               page_cache_release(page);
+               return handle_userfault(vma, address, flags,
+                                       VM_UFFD_MISSING);
+       }
+
        inc_mm_counter_fast(mm, MM_ANONPAGES);
        page_add_new_anon_rmap(page, vma, address);
        mem_cgroup_commit_charge(page, memcg, false);
@@ -3214,6 +3232,27 @@ out:
        return 0;
 }
 
+static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pmd_t *pmd, unsigned int flags)
+{      /* Pick the huge-PMD fault handler for this VMA; __handle_mm_fault calls this when pmd_none(*pmd) and THP is enabled. */
+       if (vma_is_anonymous(vma))      /* anonymous VMA: hand off to the anonymous huge-page fault path */
+               return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
+       if (vma->vm_ops->pmd_fault)     /* NOTE(review): vm_ops is dereferenced unchecked; presumably vma_is_anonymous() already covers !vm_ops - confirm */
+               return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+       return VM_FAULT_FALLBACK;       /* no huge handler available; the caller does not return early on this value */
+}
+
+static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+                       unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
+                       unsigned int flags)
+{      /* Pick the huge-PMD write-protect handler; __handle_mm_fault calls this when dirty && !pmd_write(orig_pmd). */
+       if (vma_is_anonymous(vma))      /* anonymous VMA: orig_pmd is forwarded, flags is not used on this path */
+               return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
+       if (vma->vm_ops->pmd_fault)     /* NOTE(review): vm_ops is dereferenced unchecked; presumably vma_is_anonymous() already covers !vm_ops - confirm */
+               return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+       return VM_FAULT_FALLBACK;       /* no huge handler available; the caller does not return early on this value */
+}
+
 /*
  * These routines also need to handle stuff like marking pages dirty
  * and/or accessed for architectures that don't do it in hardware (most
@@ -3249,12 +3288,12 @@ static int handle_pte_fault(struct mm_struct *mm,
        barrier();
        if (!pte_present(entry)) {
                if (pte_none(entry)) {
-                       if (vma->vm_ops)
+                       if (vma_is_anonymous(vma))
+                               return do_anonymous_page(mm, vma, address,
+                                                        pte, pmd, flags);
+                       else
                                return do_fault(mm, vma, address, pte, pmd,
                                                flags, entry);
-
-                       return do_anonymous_page(mm, vma, address, pte, pmd,
-                                       flags);
                }
                return do_swap_page(mm, vma, address,
                                        pte, pmd, flags, entry);
@@ -3316,10 +3355,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (!pmd)
                return VM_FAULT_OOM;
        if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
-               int ret = VM_FAULT_FALLBACK;
-               if (!vma->vm_ops)
-                       ret = do_huge_pmd_anonymous_page(mm, vma, address,
-                                       pmd, flags);
+               int ret = create_huge_pmd(mm, vma, address, pmd, flags);
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
@@ -3343,8 +3379,8 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                                             orig_pmd, pmd);
 
                        if (dirty && !pmd_write(orig_pmd)) {
-                               ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
-                                                         orig_pmd);
+                               ret = wp_huge_pmd(mm, vma, address, pmd,
+                                                       orig_pmd, flags);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
@@ -3363,8 +3399,18 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (unlikely(pmd_none(*pmd)) &&
            unlikely(__pte_alloc(mm, vma, pmd, address)))
                return VM_FAULT_OOM;
-       /* if an huge pmd materialized from under us just retry later */
-       if (unlikely(pmd_trans_huge(*pmd)))
+       /*
+        * If a huge pmd materialized under us just retry later.  Use
+        * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+        * didn't become pmd_trans_huge under us and then back to pmd_none, as
+        * a result of MADV_DONTNEED running immediately after a huge pmd fault
+        * in a different thread of this mm, in turn leading to a misleading
+        * pmd_trans_huge() retval.  All we have to ensure is that it is a
+        * regular pmd that we can walk with pte_offset_map() and we can do that
+        * through an atomic read in C, which is what pmd_trans_unstable()
+        * provides.
+        */
+       if (unlikely(pmd_trans_unstable(pmd)))
                return 0;
        /*
         * A regular pmd is established and it can't morph into a huge pmd
@@ -3730,7 +3776,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
                if (buf) {
                        char *p;
 
-                       p = d_path(&f->f_path, buf, PAGE_SIZE);
+                       p = file_path(f, buf, PAGE_SIZE);
                        if (IS_ERR(p))
                                p = "?";
                        printk("%s%s[%lx+%lx]", prefix, kbasename(p),