These changes are the raw update to the linux-4.4.6-rt14 kernel sources.

diff --git a/kernel/mm/rmap.c b/kernel/mm/rmap.c
index 24dd3f9..b577fbb 100644
--- a/kernel/mm/rmap.c
+++ b/kernel/mm/rmap.c
@@ -30,6 +30,8 @@
  *             swap_lock (in swap_duplicate, swap_info_get)
  *               mmlist_lock (in mmput, drain_mmlist and others)
  *               mapping->private_lock (in __set_page_dirty_buffers)
+ *                 mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ *                   mapping->tree_lock (widely used)
  *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
  *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  *                 sb_lock (within inode_lock in fs/fs-writeback.c)
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
 #include <linux/backing-dev.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
+#include <trace/events/tlb.h>
+
 #include "internal.h"
 
 static struct kmem_cache *anon_vma_cachep;
@@ -581,6 +586,107 @@ vma_address(struct page *page, struct vm_area_struct *vma)
        return address;
 }
 
+#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+static void percpu_flush_tlb_batch_pages(void *data)
+{
+       /*
+        * All TLB entries are flushed on the assumption that it is
+        * cheaper to flush all TLBs and let them be refilled than
+        * flushing individual PFNs. Note that we do not track mm's
+        * to flush as that might simply be multiple full TLB flushes
+        * for no gain.
+        */
+       count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+       flush_tlb_local();
+}
+
+/*
+ * Flush TLB entries for recently unmapped pages from remote CPUs. If a PTE
+ * was dirty when it was unmapped, it is important that it is flushed before
+ * any IO is initiated on the page, to prevent lost writes. Similarly, it
+ * must be flushed before the page is freed, to prevent data leakage.
+ */
+void try_to_unmap_flush(void)
+{
+       struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+       int cpu;
+
+       if (!tlb_ubc->flush_required)
+               return;
+
+       cpu = get_cpu();
+
+       trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL);
+
+       if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask))
+               percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask);
+
+       if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) {
+               smp_call_function_many(&tlb_ubc->cpumask,
+                       percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true);
+       }
+       cpumask_clear(&tlb_ubc->cpumask);
+       tlb_ubc->flush_required = false;
+       tlb_ubc->writable = false;
+       put_cpu();
+}
+
+/* Flush iff there are potentially writable TLB entries that can race with IO */
+void try_to_unmap_flush_dirty(void)
+{
+       struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+       if (tlb_ubc->writable)
+               try_to_unmap_flush();
+}
+
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+               struct page *page, bool writable)
+{
+       struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
+
+       cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
+       tlb_ubc->flush_required = true;
+
+       /*
+        * If the PTE was dirty then it's best to assume it's writable. The
+        * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
+        * before the page is queued for IO.
+        */
+       if (writable)
+               tlb_ubc->writable = true;
+}
+
+/*
+ * Returns true if the TLB flush should be deferred to the end of a batch of
+ * unmap operations to reduce IPIs.
+ */
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+       bool should_defer = false;
+
+       if (!(flags & TTU_BATCH_FLUSH))
+               return false;
+
+       /* If remote CPUs need to be flushed then defer and batch the flush */
+       if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids)
+               should_defer = true;
+       put_cpu();
+
+       return should_defer;
+}
+#else
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
+               struct page *page, bool writable)
+{
+}
+
+static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
+{
+       return false;
+}
+#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
+
 /*
  * At what user virtual address is page expected in vma?
  * Caller should check the page is actually part of the vma.
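
For reference, the per-task batch state manipulated above via current->tlb_ubc is defined in include/linux/sched.h (v4.4, under the same CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH guard) roughly as:

struct tlbflush_unmap_batch {
	/* CPUs that may still cache a TLB entry for an unmapped PFN. */
	struct cpumask cpumask;

	/* True if any bit in cpumask is set, i.e. a flush is pending. */
	bool flush_required;

	/*
	 * True if a cleared PTE was dirty; the flush must then happen
	 * before IO starts on the page (see try_to_unmap_flush_dirty()).
	 */
	bool writable;
};
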
@@ -625,7 +731,7 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
 
        pmd = pmd_offset(pud, address);
        /*
-        * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
+        * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
         * without holding anon_vma lock for write.  So when looking for a
         * genuine pmde (in which to find pte), test present and !THP together.
         */
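
For context, the check this comment refers to follows immediately in v4.4's mm_find_pmd(), roughly:

	pmde = *pmd;
	barrier();	/* read *pmd once so both tests see the same value */
	if (!pmd_present(pmde) || pmd_trans_huge(pmde))
		pmd = NULL;
out:
	return pmd;
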
@@ -781,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,
                pte_unmap_unlock(pte, ptl);
        }
 
+       if (referenced)
+               clear_page_idle(page);
+       if (test_and_clear_page_young(page))
+               referenced++;
+
        if (referenced) {
                pra->referenced++;
                pra->vm_flags |= vma->vm_flags;
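
The helpers used above come from the new <linux/page_idle.h> include; on 64BIT with CONFIG_IDLE_PAGE_TRACKING they are roughly:

static inline void clear_page_idle(struct page *page)
{
	ClearPageIdle(page);
}

static inline bool test_and_clear_page_young(struct page *page)
{
	return TestClearPageYoung(page);
}

A page_ext-based variant covers configurations without spare page flags.
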
@@ -950,7 +1061,12 @@ void page_move_anon_rmap(struct page *page,
        VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
 
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
-       page->mapping = (struct address_space *) anon_vma;
+       /*
+        * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
+        * simultaneously, so a concurrent reader (e.g. page_referenced()'s
+        * PageAnon()) will not see one without the other.
+        */
+       WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
 }
 
 /**
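
To illustrate the pairing this comment relies on, here is a sketch of the reader side (hypothetical code, modeled on a PageAnon()-style check, not the literal page_referenced() source):

	/* Load page->mapping once; flag and pointer come from one snapshot. */
	struct address_space *mapping = READ_ONCE(page->mapping);

	if ((unsigned long)mapping & PAGE_MAPPING_ANON) {
		struct anon_vma *anon_vma = (struct anon_vma *)
			((unsigned long)mapping - PAGE_MAPPING_ANON);
		/* anon_vma and PAGE_MAPPING_ANON were observed together. */
	}
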
@@ -1188,6 +1304,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        int ret = SWAP_AGAIN;
        enum ttu_flags flags = (enum ttu_flags)arg;
 
+       /* munlock has nothing to gain from examining unlocked vmas */
+       if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
+               goto out;
+
        pte = page_check_address(page, mm, address, &ptl, 0);
        if (!pte)
                goto out;
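
For context, the only path that passes TTU_MUNLOCK is try_to_munlock(), which in v4.4 reads roughly:

int try_to_munlock(struct page *page)
{
	struct rmap_walk_control rwc = {
		.rmap_one = try_to_unmap_one,
		.arg = (void *)TTU_MUNLOCK,
		.done = page_not_mapped,
		.anon_lock = page_lock_anon_vma_read,
	};

	VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page);

	return rmap_walk(page, &rwc);
}
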
@@ -1198,9 +1318,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
         * skipped over this mm) then we should reactivate it.
         */
        if (!(flags & TTU_IGNORE_MLOCK)) {
-               if (vma->vm_flags & VM_LOCKED)
-                       goto out_mlock;
-
+               if (vma->vm_flags & VM_LOCKED) {
+                       /* Holding pte lock, we do *not* need mmap_sem here */
+                       mlock_vma_page(page);
+                       ret = SWAP_MLOCK;
+                       goto out_unmap;
+               }
                if (flags & TTU_MUNLOCK)
                        goto out_unmap;
        }
@@ -1213,7 +1336,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
        /* Nuke the page table entry. */
        flush_cache_page(vma, address, page_to_pfn(page));
-       pteval = ptep_clear_flush(vma, address, pte);
+       if (should_defer_flush(mm, flags)) {
+               /*
+                * We clear the PTE but do not flush so potentially a remote
+                * CPU could still be writing to the page. If the entry was
+                * previously clean then the architecture must guarantee that
+                * a clear->dirty transition on a cached TLB entry is written
+                * through and traps if the PTE is unmapped.
+                */
+               pteval = ptep_get_and_clear(mm, address, pte);
+
+               set_tlb_ubc_flush_pending(mm, page, pte_dirty(pteval));
+       } else {
+               pteval = ptep_clear_flush(vma, address, pte);
+       }
 
        /* Move the dirty bit to the physical page now the pte is gone. */
        if (pte_dirty(pteval))
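
The ordering rule described above is enforced by the caller; condensed from v4.4's shrink_page_list() in mm/vmscan.c (not part of this diff):

	/* PTEs are cleared with the flush deferred ... */
	try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH);
	/* ... any stale writable TLB entry is flushed before writeback ... */
	try_to_unmap_flush_dirty();
	pageout(page, mapping, sc);
	/* ... and everything is flushed before the freed pages can be reused. */
	try_to_unmap_flush();
	free_hot_cold_page_list(&free_pages, true);
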
@@ -1223,7 +1359,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        update_hiwater_rss(mm);
 
        if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
-               if (!PageHuge(page)) {
+               if (PageHuge(page)) {
+                       hugetlb_count_sub(1 << compound_order(page), mm);
+               } else {
                        if (PageAnon(page))
                                dec_mm_counter(mm, MM_ANONPAGES);
                        else
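
hugetlb_count_sub() is a small helper from include/linux/hugetlb.h (v4.4), roughly:

static inline void hugetlb_count_sub(long l, struct mm_struct *mm)
{
	atomic_long_sub(l, &mm->hugetlb_usage);
}

Subtracting 1 << compound_order(page) accounts for every base page covered by the poisoned huge page, which the old code skipped entirely.
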
@@ -1241,47 +1379,44 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                        dec_mm_counter(mm, MM_ANONPAGES);
                else
                        dec_mm_counter(mm, MM_FILEPAGES);
+       } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION)) {
+               swp_entry_t entry;
+               pte_t swp_pte;
+               /*
+                * Store the pfn of the page in a special migration
+                * pte. do_swap_page() will wait until the migration
+                * pte is removed and then restart fault handling.
+                */
+               entry = make_migration_entry(page, pte_write(pteval));
+               swp_pte = swp_entry_to_pte(entry);
+               if (pte_soft_dirty(pteval))
+                       swp_pte = pte_swp_mksoft_dirty(swp_pte);
+               set_pte_at(mm, address, pte, swp_pte);
        } else if (PageAnon(page)) {
                swp_entry_t entry = { .val = page_private(page) };
                pte_t swp_pte;
-
-               if (PageSwapCache(page)) {
-                       /*
-                        * Store the swap location in the pte.
-                        * See handle_pte_fault() ...
-                        */
-                       if (swap_duplicate(entry) < 0) {
-                               set_pte_at(mm, address, pte, pteval);
-                               ret = SWAP_FAIL;
-                               goto out_unmap;
-                       }
-                       if (list_empty(&mm->mmlist)) {
-                               spin_lock(&mmlist_lock);
-                               if (list_empty(&mm->mmlist))
-                                       list_add(&mm->mmlist, &init_mm.mmlist);
-                               spin_unlock(&mmlist_lock);
-                       }
-                       dec_mm_counter(mm, MM_ANONPAGES);
-                       inc_mm_counter(mm, MM_SWAPENTS);
-               } else if (IS_ENABLED(CONFIG_MIGRATION)) {
-                       /*
-                        * Store the pfn of the page in a special migration
-                        * pte. do_swap_page() will wait until the migration
-                        * pte is removed and then restart fault handling.
-                        */
-                       BUG_ON(!(flags & TTU_MIGRATION));
-                       entry = make_migration_entry(page, pte_write(pteval));
+               /*
+                * Store the swap location in the pte.
+                * See handle_pte_fault() ...
+                */
+               VM_BUG_ON_PAGE(!PageSwapCache(page), page);
+               if (swap_duplicate(entry) < 0) {
+                       set_pte_at(mm, address, pte, pteval);
+                       ret = SWAP_FAIL;
+                       goto out_unmap;
+               }
+               if (list_empty(&mm->mmlist)) {
+                       spin_lock(&mmlist_lock);
+                       if (list_empty(&mm->mmlist))
+                               list_add(&mm->mmlist, &init_mm.mmlist);
+                       spin_unlock(&mmlist_lock);
                }
+               dec_mm_counter(mm, MM_ANONPAGES);
+               inc_mm_counter(mm, MM_SWAPENTS);
                swp_pte = swp_entry_to_pte(entry);
                if (pte_soft_dirty(pteval))
                        swp_pte = pte_swp_mksoft_dirty(swp_pte);
                set_pte_at(mm, address, pte, swp_pte);
-       } else if (IS_ENABLED(CONFIG_MIGRATION) &&
-                  (flags & TTU_MIGRATION)) {
-               /* Establish migration entry for a file page */
-               swp_entry_t entry;
-               entry = make_migration_entry(page, pte_write(pteval));
-               set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
        } else
                dec_mm_counter(mm, MM_FILEPAGES);
 
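
The migration entry written above encodes the page's PFN plus write permission; from include/linux/swapops.h (v4.4, CONFIG_MIGRATION), roughly:

static inline swp_entry_t make_migration_entry(struct page *page, int write)
{
	BUG_ON(!PageLocked(page));
	return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ,
			page_to_pfn(page));
}

do_swap_page() recognizes such entries and waits in migration_entry_wait() until migration completes before restarting the fault.
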
@@ -1290,31 +1425,10 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 out_unmap:
        pte_unmap_unlock(pte, ptl);
-       if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK))
+       if (ret != SWAP_FAIL && ret != SWAP_MLOCK && !(flags & TTU_MUNLOCK))
                mmu_notifier_invalidate_page(mm, address);
 out:
        return ret;
-
-out_mlock:
-       pte_unmap_unlock(pte, ptl);
-
-
-       /*
-        * We need mmap_sem locking, Otherwise VM_LOCKED check makes
-        * unstable result and race. Plus, We can't wait here because
-        * we now hold anon_vma->rwsem or mapping->i_mmap_rwsem.
-        * if trylock failed, the page remain in evictable lru and later
-        * vmscan could retry to move the page to unevictable lru if the
-        * page is actually mlocked.
-        */
-       if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
-               if (vma->vm_flags & VM_LOCKED) {
-                       mlock_vma_page(page);
-                       ret = SWAP_MLOCK;
-               }
-               up_read(&vma->vm_mm->mmap_sem);
-       }
-       return ret;
 }
 
 bool is_vma_temporary_stack(struct vm_area_struct *vma)
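
With out_mlock gone, SWAP_MLOCK is returned while holding only the pte lock; the caller in v4.4's shrink_page_list() dispatches on the result roughly like this:

	switch (try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
	case SWAP_FAIL:
		goto activate_locked;
	case SWAP_AGAIN:
		goto keep_locked;
	case SWAP_MLOCK:
		goto cull_mlocked;	/* move the page to the unevictable LRU */
	case SWAP_SUCCESS:
		; /* try to free the page below */
	}
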
@@ -1478,6 +1592,8 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
                struct vm_area_struct *vma = avc->vma;
                unsigned long address = vma_address(page, vma);
 
+               cond_resched();
+
                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;
 
@@ -1527,6 +1643,8 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
        vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
                unsigned long address = vma_address(page, vma);
 
+               cond_resched();
+
                if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
                        continue;