#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
+#include <linux/userfaultfd_k.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
#ifdef HAVE_GENERIC_MMU_GATHER
-static int tlb_next_batch(struct mmu_gather *tlb)
+static bool tlb_next_batch(struct mmu_gather *tlb) /* make tlb->active point at the next batch, allocating one if needed; false if none is available */
{
struct mmu_gather_batch *batch;
batch = tlb->active;
if (batch->next) {
tlb->active = batch->next;
- return 1;
+ return true; /* an already-linked next batch exists: reuse it */
}
if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
- return 0;
+ return false; /* batch_count reached MAX_GATHER_BATCH_COUNT: refuse to grow */
batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
if (!batch)
- return 0;
+ return false; /* order-0 GFP_NOWAIT page allocation failed */
tlb->batch_count++;
batch->next = NULL;
tlb->active->next = batch;
tlb->active = batch;
- return 1;
+ return true; /* new batch appended to the list and made active */
}
/* tlb_gather_mmu
return pfn_to_page(pfn);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * vm_normal_page_pmd - pmd-level counterpart of vm_normal_page().
+ *
+ * Returns the struct page behind @pmd when it maps a "normal" page, or
+ * NULL when the pfn must be treated as special.
+ *
+ * There is no pmd_special(), yet special pmds can exist (e.g. in a
+ * direct-access (dax) mapping), so the !HAVE_PTE_SPECIAL logic of
+ * vm_normal_page() is replicated here.
+ */
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd)
+{
+	unsigned long pfn = pmd_pfn(pmd);
+
+	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		unsigned long pgidx;
+
+		/* Mixed map: a valid pfn is taken as a normal page as-is. */
+		if (vma->vm_flags & VM_MIXEDMAP)
+			return pfn_valid(pfn) ? pfn_to_page(pfn) : NULL;
+
+		/*
+		 * Pure pfn map: a pfn at its linear position within the
+		 * vma is special, and so is any pfn when the mapping is
+		 * not a COW mapping.
+		 */
+		pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
+		if (pfn == vma->vm_pgoff + pgidx ||
+		    !is_cow_mapping(vma->vm_flags))
+			return NULL;
+	}
+
+	if (is_zero_pfn(pfn) || unlikely(pfn > highest_memmap_pfn))
+		return NULL;
+
+	/*
+	 * NOTE: PageReserved() pages can still show up in the page tables,
+	 * e.g. because of VDSO mappings.
+	 */
+	return pfn_to_page(pfn);
+}
+#endif
+
/*
* copy one vm_area from one task to the other. Assumes the page tables
* already present in the new task to be cleared in the whole range
goto oom;
cow_user_page(new_page, old_page, address, vma);
}
- __SetPageUptodate(new_page);
if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
goto oom_free_new;
+ __SetPageUptodate(new_page);
+
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
pte_unmap(page_table);
+ /* File mapping without ->vm_ops ? */
+ if (vma->vm_flags & VM_SHARED)
+ return VM_FAULT_SIGBUS;
+
/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
return VM_FAULT_SIGSEGV;
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(page_table, ptl);
+ return handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ }
goto setpte;
}
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
+
+ if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ goto oom_free_page;
+
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceeding stores to the page contents become visible before
*/
__SetPageUptodate(page);
- if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
- goto oom_free_page;
-
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
if (!pte_none(*page_table))
goto release;
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(page_table, ptl);
+ mem_cgroup_cancel_charge(page, memcg);
+ page_cache_release(page);
+ return handle_userfault(vma, address, flags,
+ VM_UFFD_MISSING);
+ }
+
inc_mm_counter_fast(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
mem_cgroup_commit_charge(page, memcg, false);
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
+ /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
+ if (!vma->vm_ops->fault)
+ return VM_FAULT_SIGBUS;
if (!(flags & FAULT_FLAG_WRITE))
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
return 0;
}
+/*
+ * Handle a fault on an empty pmd by trying to install a huge pmd.
+ *
+ * Anonymous vmas go through do_huge_pmd_anonymous_page(); any other vma
+ * is handed to its ->pmd_fault handler when it has one.  Without such a
+ * handler VM_FAULT_FALLBACK is returned.
+ */
+static int create_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pmd_t *pmd, unsigned int flags)
+{
+	if (!vma_is_anonymous(vma)) {
+		if (!vma->vm_ops->pmd_fault)
+			return VM_FAULT_FALLBACK;
+		return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+	}
+
+	return do_huge_pmd_anonymous_page(mm, vma, address, pmd, flags);
+}
+
+/*
+ * Dispatch a huge-pmd fault to the matching handler.
+ *
+ * Anonymous vmas are handled by do_huge_pmd_wp_page(), which also gets
+ * @orig_pmd; any other vma is handed to its ->pmd_fault handler when it
+ * has one.  Without such a handler VM_FAULT_FALLBACK is returned.
+ */
+static int wp_huge_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+			unsigned long address, pmd_t *pmd, pmd_t orig_pmd,
+			unsigned int flags)
+{
+	if (!vma_is_anonymous(vma)) {
+		if (!vma->vm_ops->pmd_fault)
+			return VM_FAULT_FALLBACK;
+		return vma->vm_ops->pmd_fault(vma, address, pmd, flags);
+	}
+
+	return do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd);
+}
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
barrier();
if (!pte_present(entry)) {
if (pte_none(entry)) {
- if (vma->vm_ops) {
- if (likely(vma->vm_ops->fault))
- return do_fault(mm, vma, address, pte,
- pmd, flags, entry);
- }
- return do_anonymous_page(mm, vma, address,
- pte, pmd, flags);
+ if (vma_is_anonymous(vma))
+ return do_anonymous_page(mm, vma, address,
+ pte, pmd, flags);
+ else
+ return do_fault(mm, vma, address, pte, pmd,
+ flags, entry);
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
if (!pmd)
return VM_FAULT_OOM;
if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
- int ret = VM_FAULT_FALLBACK;
- if (!vma->vm_ops)
- ret = do_huge_pmd_anonymous_page(mm, vma, address,
- pmd, flags);
+ int ret = create_huge_pmd(mm, vma, address, pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
orig_pmd, pmd);
if (dirty && !pmd_write(orig_pmd)) {
- ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
- orig_pmd);
+ ret = wp_huge_pmd(mm, vma, address, pmd,
+ orig_pmd, flags);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
if (unlikely(pmd_none(*pmd)) &&
unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
- /* if an huge pmd materialized from under us just retry later */
- if (unlikely(pmd_trans_huge(*pmd)))
+ /*
+ * If a huge pmd materialized under us just retry later. Use
+ * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+ * didn't become pmd_trans_huge under us and then back to pmd_none, as
+ * a result of MADV_DONTNEED running immediately after a huge pmd fault
+ * in a different thread of this mm, in turn leading to a misleading
+ * pmd_trans_huge() retval. All we have to ensure is that it is a
+ * regular pmd that we can walk with pte_offset_map() and we can do that
+ * through an atomic read in C, which is what pmd_trans_unstable()
+ * provides.
+ */
+ if (unlikely(pmd_trans_unstable(pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
if (buf) {
char *p;
- p = d_path(&f->f_path, buf, PAGE_SIZE);
+ p = file_path(f, buf, PAGE_SIZE);
if (IS_ERR(p))
p = "?";
printk("%s%s[%lx+%lx]", prefix, kbasename(p),