These changes are the raw update of the kernel sources to linux-4.4.6-rt14.
kvmfornfv.git / kernel/mm/migrate.c
index f53838f..6d17e0a 100644
@@ -1,5 +1,5 @@
 /*
- * Memory Migration functionality - linux/mm/migration.c
+ * Memory Migration functionality - linux/mm/migrate.c
  *
  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
  *
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
-#include <linux/memcontrol.h>
+#include <linux/backing-dev.h>
 #include <linux/syscalls.h>
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
 #include <linux/balloon_compaction.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -170,6 +171,9 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
        else
                page_add_file_rmap(new);
 
+       if (vma->vm_flags & VM_LOCKED)
+               mlock_vma_page(new);
+
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, addr, ptep);
 unlock:
@@ -310,6 +314,8 @@ int migrate_page_move_mapping(struct address_space *mapping,
                struct buffer_head *head, enum migrate_mode mode,
                int extra_count)
 {
+       struct zone *oldzone, *newzone;
+       int dirty;
        int expected_count = 1 + extra_count;
        void **pslot;
 
@@ -317,9 +323,20 @@ int migrate_page_move_mapping(struct address_space *mapping,
                /* Anonymous page without mapping */
                if (page_count(page) != expected_count)
                        return -EAGAIN;
+
+               /* No turning back from here */
+               set_page_memcg(newpage, page_memcg(page));
+               newpage->index = page->index;
+               newpage->mapping = page->mapping;
+               if (PageSwapBacked(page))
+                       SetPageSwapBacked(newpage);
+
                return MIGRATEPAGE_SUCCESS;
        }
 
+       oldzone = page_zone(page);
+       newzone = page_zone(newpage);
+
        spin_lock_irq(&mapping->tree_lock);
 
        pslot = radix_tree_lookup_slot(&mapping->page_tree,
@@ -352,14 +369,28 @@ int migrate_page_move_mapping(struct address_space *mapping,
        }
 
        /*
-        * Now we know that no one else is looking at the page.
+        * Now we know that no one else is looking at the page:
+        * no turning back from here.
         */
+       set_page_memcg(newpage, page_memcg(page));
+       newpage->index = page->index;
+       newpage->mapping = page->mapping;
+       if (PageSwapBacked(page))
+               SetPageSwapBacked(newpage);
+
        get_page(newpage);      /* add cache reference */
        if (PageSwapCache(page)) {
                SetPageSwapCache(newpage);
                set_page_private(newpage, page_private(page));
        }
 
+       /* Move dirty while page refs frozen and newpage not yet exposed */
+       dirty = PageDirty(page);
+       if (dirty) {
+               ClearPageDirty(page);
+               SetPageDirty(newpage);
+       }
+
        radix_tree_replace_slot(pslot, newpage);
 
        /*
@@ -369,6 +400,9 @@ int migrate_page_move_mapping(struct address_space *mapping,
         */
        page_unfreeze_refs(page, expected_count - 1);
 
+       spin_unlock(&mapping->tree_lock);
+       /* Leave irq disabled to prevent preemption while updating stats */
+
        /*
         * If moved to a different zone then also account
         * the page for that zone. Other VM counters will be
@@ -379,13 +413,19 @@ int migrate_page_move_mapping(struct address_space *mapping,
         * via NR_FILE_PAGES and NR_ANON_PAGES if they
         * are mapped to swap space.
         */
-       __dec_zone_page_state(page, NR_FILE_PAGES);
-       __inc_zone_page_state(newpage, NR_FILE_PAGES);
-       if (!PageSwapCache(page) && PageSwapBacked(page)) {
-               __dec_zone_page_state(page, NR_SHMEM);
-               __inc_zone_page_state(newpage, NR_SHMEM);
+       if (newzone != oldzone) {
+               __dec_zone_state(oldzone, NR_FILE_PAGES);
+               __inc_zone_state(newzone, NR_FILE_PAGES);
+               if (PageSwapBacked(page) && !PageSwapCache(page)) {
+                       __dec_zone_state(oldzone, NR_SHMEM);
+                       __inc_zone_state(newzone, NR_SHMEM);
+               }
+               if (dirty && mapping_cap_account_dirty(mapping)) {
+                       __dec_zone_state(oldzone, NR_FILE_DIRTY);
+                       __inc_zone_state(newzone, NR_FILE_DIRTY);
+               }
        }
-       spin_unlock_irq(&mapping->tree_lock);
+       local_irq_enable();
 
        return MIGRATEPAGE_SUCCESS;
 }
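
The accounting rewrite above moves the per-zone counters only when the page actually changes zones, and it carries NR_FILE_DIRTY across based on the dirty bit captured while the page's references were frozen. A minimal standalone sketch of that decision follows; the types and counter names are invented stand-ins, not the kernel's zone statistics interfaces.

/* Simplified model of the zone-stat transfer in migrate_page_move_mapping().
 * struct zone_model and the *_M counters are illustrative only. */
#include <stdbool.h>
#include <stdio.h>

enum { NR_FILE_PAGES_M, NR_SHMEM_M, NR_FILE_DIRTY_M, NR_STATS_M };

struct zone_model { long stat[NR_STATS_M]; };

static void transfer_stats(struct zone_model *oldzone, struct zone_model *newzone,
                           bool swapbacked, bool swapcache,
                           bool dirty, bool cap_account_dirty)
{
        if (newzone == oldzone)
                return;                         /* same zone: nothing to account */

        oldzone->stat[NR_FILE_PAGES_M]--;
        newzone->stat[NR_FILE_PAGES_M]++;
        if (swapbacked && !swapcache) {         /* shmem, but not swapcache */
                oldzone->stat[NR_SHMEM_M]--;
                newzone->stat[NR_SHMEM_M]++;
        }
        if (dirty && cap_account_dirty) {       /* dirty bit captured while frozen */
                oldzone->stat[NR_FILE_DIRTY_M]--;
                newzone->stat[NR_FILE_DIRTY_M]++;
        }
}

int main(void)
{
        struct zone_model a = { { 10, 4, 2 } }, b = { { 0 } };

        transfer_stats(&a, &b, true, false, true, true);
        printf("old file pages %ld, new file pages %ld\n",
               a.stat[NR_FILE_PAGES_M], b.stat[NR_FILE_PAGES_M]);
        return 0;
}
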
@@ -400,12 +440,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
        int expected_count;
        void **pslot;
 
-       if (!mapping) {
-               if (page_count(page) != 1)
-                       return -EAGAIN;
-               return MIGRATEPAGE_SUCCESS;
-       }
-
        spin_lock_irq(&mapping->tree_lock);
 
        pslot = radix_tree_lookup_slot(&mapping->page_tree,
@@ -423,6 +457,9 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
                return -EAGAIN;
        }
 
+       set_page_memcg(newpage, page_memcg(page));
+       newpage->index = page->index;
+       newpage->mapping = page->mapping;
        get_page(newpage);
 
        radix_tree_replace_slot(pslot, newpage);
@@ -509,20 +546,14 @@ void migrate_page_copy(struct page *newpage, struct page *page)
        if (PageMappedToDisk(page))
                SetPageMappedToDisk(newpage);
 
-       if (PageDirty(page)) {
-               clear_page_dirty_for_io(page);
-               /*
-                * Want to mark the page and the radix tree as dirty, and
-                * redo the accounting that clear_page_dirty_for_io undid,
-                * but we can't use set_page_dirty because that function
-                * is actually a signal that all of the page has become dirty.
-                * Whereas only part of our page may be dirty.
-                */
-               if (PageSwapBacked(page))
-                       SetPageDirty(newpage);
-               else
-                       __set_page_dirty_nobuffers(newpage);
-       }
+       /* Move dirty on pages not done by migrate_page_move_mapping() */
+       if (PageDirty(page))
+               SetPageDirty(newpage);
+
+       if (page_is_young(page))
+               set_page_young(newpage);
+       if (page_is_idle(page))
+               set_page_idle(newpage);
 
        /*
         * Copy NUMA information to the new page, to prevent over-eager
@@ -531,7 +562,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
        cpupid = page_cpupid_xchg_last(page, -1);
        page_cpupid_xchg_last(newpage, cpupid);
 
-       mlock_migrate_page(newpage, page);
        ksm_migrate_page(newpage, page);
        /*
         * Please do not reorder this without considering how mm/ksm.c's
@@ -715,24 +745,13 @@ static int fallback_migrate_page(struct address_space *mapping,
  *  MIGRATEPAGE_SUCCESS - success
  */
 static int move_to_new_page(struct page *newpage, struct page *page,
-                               int page_was_mapped, enum migrate_mode mode)
+                               enum migrate_mode mode)
 {
        struct address_space *mapping;
        int rc;
 
-       /*
-        * Block others from accessing the page when we get around to
-        * establishing additional references. We are the only one
-        * holding a reference to the new page at this point.
-        */
-       if (!trylock_page(newpage))
-               BUG();
-
-       /* Prepare mapping for the new page.*/
-       newpage->index = page->index;
-       newpage->mapping = page->mapping;
-       if (PageSwapBacked(page))
-               SetPageSwapBacked(newpage);
+       VM_BUG_ON_PAGE(!PageLocked(page), page);
+       VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
 
        mapping = page_mapping(page);
        if (!mapping)
@@ -744,22 +763,19 @@ static int move_to_new_page(struct page *newpage, struct page *page,
                 * space which also has its own migratepage callback. This
                 * is the most common path for page migration.
                 */
-               rc = mapping->a_ops->migratepage(mapping,
-                                               newpage, page, mode);
+               rc = mapping->a_ops->migratepage(mapping, newpage, page, mode);
        else
                rc = fallback_migrate_page(mapping, newpage, page, mode);
 
-       if (rc != MIGRATEPAGE_SUCCESS) {
-               newpage->mapping = NULL;
-       } else {
-               mem_cgroup_migrate(page, newpage, false);
-               if (page_was_mapped)
-                       remove_migration_ptes(page, newpage);
-               page->mapping = NULL;
+       /*
+        * When successful, old pagecache page->mapping must be cleared before
+        * page is freed; but stats require that PageAnon be left as PageAnon.
+        */
+       if (rc == MIGRATEPAGE_SUCCESS) {
+               set_page_memcg(page, NULL);
+               if (!PageAnon(page))
+                       page->mapping = NULL;
        }
-
-       unlock_page(newpage);
-
        return rc;
 }
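
The reworked move_to_new_page() keeps the usual dispatch: call the address space's migratepage callback when the filesystem registers one, otherwise take a generic route (anonymous pages, which have no mapping, end up on a generic path as well). A compact sketch of that dispatch is below, using hypothetical stand-in types rather than the kernel's address_space structures.

/* Hypothetical model of the a_ops->migratepage dispatch; all types invented. */
#include <stddef.h>

struct page_model { void *data; };

struct aops_model {
        int (*migratepage)(struct page_model *newpage, struct page_model *oldpage);
};

struct mapping_model { const struct aops_model *a_ops; };

static int generic_migrate(struct page_model *newpage, struct page_model *oldpage)
{
        newpage->data = oldpage->data;          /* copy-based generic path */
        return 0;
}

static int model_move_to_new_page(struct mapping_model *mapping,
                                  struct page_model *newpage,
                                  struct page_model *oldpage)
{
        /* No mapping (anonymous) or no callback: generic path. The kernel
         * distinguishes migrate_page() from fallback_migrate_page() here;
         * this model collapses them into one function. */
        if (mapping && mapping->a_ops && mapping->a_ops->migratepage)
                return mapping->a_ops->migratepage(newpage, oldpage);
        return generic_migrate(newpage, oldpage);
}

int main(void)
{
        struct page_model oldp = { "payload" }, newp = { NULL };

        return model_move_to_new_page(NULL, &newp, &oldp);
}
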
 
@@ -808,6 +824,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                        goto out_unlock;
                wait_on_page_writeback(page);
        }
+
        /*
         * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
         * we cannot notice that anon_vma is freed while we migrates a page.
@@ -815,34 +832,26 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
         * of migration. File cache pages are no problem because of page_lock()
         * File Caches may use write_page() or lock_page() in migration, then,
         * just care Anon page here.
+        *
+        * Only page_get_anon_vma() understands the subtleties of
+        * getting a hold on an anon_vma from outside one of its mms.
+        * But if we cannot get anon_vma, then we won't need it anyway,
+        * because that implies that the anon page is no longer mapped
+        * (and cannot be remapped so long as we hold the page lock).
         */
-       if (PageAnon(page) && !PageKsm(page)) {
-               /*
-                * Only page_lock_anon_vma_read() understands the subtleties of
-                * getting a hold on an anon_vma from outside one of its mms.
-                */
+       if (PageAnon(page) && !PageKsm(page))
                anon_vma = page_get_anon_vma(page);
-               if (anon_vma) {
-                       /*
-                        * Anon page
-                        */
-               } else if (PageSwapCache(page)) {
-                       /*
-                        * We cannot be sure that the anon_vma of an unmapped
-                        * swapcache page is safe to use because we don't
-                        * know in advance if the VMA that this page belonged
-                        * to still exists. If the VMA and others sharing the
-                        * data have been freed, then the anon_vma could
-                        * already be invalid.
-                        *
-                        * To avoid this possibility, swapcache pages get
-                        * migrated but are not remapped when migration
-                        * completes
-                        */
-               } else {
-                       goto out_unlock;
-               }
-       }
+
+       /*
+        * Block others from accessing the new page when we get around to
+        * establishing additional references. We are usually the only one
+        * holding a reference to newpage at this point. We used to have a BUG
+        * here if trylock_page(newpage) fails, but would like to allow for
+        * cases where there might be a race with the previous use of newpage.
+        * This is much like races on refcount of oldpage: just don't BUG().
+        */
+       if (unlikely(!trylock_page(newpage)))
+               goto out_unlock;
 
        if (unlikely(isolated_balloon_page(page))) {
                /*
@@ -853,7 +862,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                 * the page migration right away (proteced by page lock).
                 */
                rc = balloon_page_migrate(newpage, page, mode);
-               goto out_unlock;
+               goto out_unlock_both;
        }
 
        /*
@@ -872,30 +881,30 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                VM_BUG_ON_PAGE(PageAnon(page), page);
                if (page_has_private(page)) {
                        try_to_free_buffers(page);
-                       goto out_unlock;
+                       goto out_unlock_both;
                }
-               goto skip_unmap;
-       }
-
-       /* Establish migration ptes or remove ptes */
-       if (page_mapped(page)) {
+       } else if (page_mapped(page)) {
+               /* Establish migration ptes */
+               VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
+                               page);
                try_to_unmap(page,
                        TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
                page_was_mapped = 1;
        }
 
-skip_unmap:
        if (!page_mapped(page))
-               rc = move_to_new_page(newpage, page, page_was_mapped, mode);
+               rc = move_to_new_page(newpage, page, mode);
 
-       if (rc && page_was_mapped)
-               remove_migration_ptes(page, page);
+       if (page_was_mapped)
+               remove_migration_ptes(page,
+                       rc == MIGRATEPAGE_SUCCESS ? newpage : page);
 
+out_unlock_both:
+       unlock_page(newpage);
+out_unlock:
        /* Drop an anon_vma reference if we took one */
        if (anon_vma)
                put_anon_vma(anon_vma);
-
-out_unlock:
        unlock_page(page);
 out:
        return rc;
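
With this flow the migration PTEs are removed whenever the page had been unmapped, and they are rewritten to point at newpage only when the move succeeded, otherwise back at the original page. A tiny stand-in sketch of that choice (illustrative types, not kernel code):

/* Which page should the restored PTEs reference after migration? */
#include <stdio.h>

#define MIGRATEPAGE_SUCCESS_M 0                 /* stand-in for the kernel constant */

struct page_stub { const char *name; };

static struct page_stub *remap_target(int rc, struct page_stub *page,
                                      struct page_stub *newpage)
{
        return rc == MIGRATEPAGE_SUCCESS_M ? newpage : page;
}

int main(void)
{
        struct page_stub oldp = { "old page" }, newp = { "new page" };

        printf("success -> %s, failure -> %s\n",
               remap_target(MIGRATEPAGE_SUCCESS_M, &oldp, &newp)->name,
               remap_target(-11, &oldp, &newp)->name);
        return 0;
}
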
@@ -918,12 +927,14 @@ out:
 static ICE_noinline int unmap_and_move(new_page_t get_new_page,
                                   free_page_t put_new_page,
                                   unsigned long private, struct page *page,
-                                  int force, enum migrate_mode mode)
+                                  int force, enum migrate_mode mode,
+                                  enum migrate_reason reason)
 {
-       int rc = 0;
+       int rc = MIGRATEPAGE_SUCCESS;
        int *result = NULL;
-       struct page *newpage = get_new_page(page, private, &result);
+       struct page *newpage;
 
+       newpage = get_new_page(page, private, &result);
        if (!newpage)
                return -ENOMEM;
 
@@ -937,6 +948,8 @@ static ICE_noinline int unmap_and_move(new_page_t get_new_page,
                        goto out;
 
        rc = __unmap_and_move(page, newpage, force, mode);
+       if (rc == MIGRATEPAGE_SUCCESS)
+               put_new_page = NULL;
 
 out:
        if (rc != -EAGAIN) {
@@ -949,7 +962,13 @@ out:
                list_del(&page->lru);
                dec_zone_page_state(page, NR_ISOLATED_ANON +
                                page_is_file_cache(page));
-               putback_lru_page(page);
+               /* Soft-offlined page shouldn't go through lru cache list */
+               if (reason == MR_MEMORY_FAILURE) {
+                       put_page(page);
+                       if (!test_set_page_hwpoison(page))
+                               num_poisoned_pages_inc();
+               } else
+                       putback_lru_page(page);
        }
 
        /*
@@ -957,10 +976,9 @@ out:
         * it.  Otherwise, putback_lru_page() will drop the reference grabbed
         * during isolation.
         */
-       if (rc != MIGRATEPAGE_SUCCESS && put_new_page) {
-               ClearPageSwapBacked(newpage);
+       if (put_new_page)
                put_new_page(newpage, private);
-       } else if (unlikely(__is_movable_balloon_page(newpage))) {
+       else if (unlikely(__is_movable_balloon_page(newpage))) {
                /* drop our reference, page already in the balloon */
                put_page(newpage);
        } else
@@ -998,7 +1016,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
                                struct page *hpage, int force,
                                enum migrate_mode mode)
 {
-       int rc = 0;
+       int rc = -EAGAIN;
        int *result = NULL;
        int page_was_mapped = 0;
        struct page *new_hpage;
@@ -1020,8 +1038,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
        if (!new_hpage)
                return -ENOMEM;
 
-       rc = -EAGAIN;
-
        if (!trylock_page(hpage)) {
                if (!force || mode != MIGRATE_SYNC)
                        goto out;
@@ -1031,6 +1047,9 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
        if (PageAnon(hpage))
                anon_vma = page_get_anon_vma(hpage);
 
+       if (unlikely(!trylock_page(new_hpage)))
+               goto put_anon;
+
        if (page_mapped(hpage)) {
                try_to_unmap(hpage,
                        TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
@@ -1038,16 +1057,22 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
        }
 
        if (!page_mapped(hpage))
-               rc = move_to_new_page(new_hpage, hpage, page_was_mapped, mode);
+               rc = move_to_new_page(new_hpage, hpage, mode);
 
-       if (rc != MIGRATEPAGE_SUCCESS && page_was_mapped)
-               remove_migration_ptes(hpage, hpage);
+       if (page_was_mapped)
+               remove_migration_ptes(hpage,
+                       rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage);
 
+       unlock_page(new_hpage);
+
+put_anon:
        if (anon_vma)
                put_anon_vma(anon_vma);
 
-       if (rc == MIGRATEPAGE_SUCCESS)
+       if (rc == MIGRATEPAGE_SUCCESS) {
                hugetlb_cgroup_migrate(hpage, new_hpage);
+               put_new_page = NULL;
+       }
 
        unlock_page(hpage);
 out:
@@ -1059,10 +1084,10 @@ out:
         * it.  Otherwise, put_page() will drop the reference grabbed during
         * isolation.
         */
-       if (rc != MIGRATEPAGE_SUCCESS && put_new_page)
+       if (put_new_page)
                put_new_page(new_hpage, private);
        else
-               put_page(new_hpage);
+               putback_active_hugepage(new_hpage);
 
        if (result) {
                if (rc)
@@ -1089,7 +1114,7 @@ out:
  *
  * The function returns after 10 attempts or if no pages are movable any more
  * because the list has become empty or no retryable pages exist any more.
- * The caller should call putback_lru_pages() to return pages to the LRU
+ * The caller should call putback_movable_pages() to return pages to the LRU
  * or free list only if ret != 0.
  *
  * Returns the number of pages that were not migrated, or an error code.
@@ -1122,7 +1147,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
                                                pass > 2, mode);
                        else
                                rc = unmap_and_move(get_new_page, put_new_page,
-                                               private, page, pass > 2, mode);
+                                               private, page, pass > 2, mode,
+                                               reason);
 
                        switch(rc) {
                        case -ENOMEM:
@@ -1145,7 +1171,8 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
                        }
                }
        }
-       rc = nr_failed + retry;
+       nr_failed += retry;
+       rc = nr_failed;
 out:
        if (nr_succeeded)
                count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
@@ -1187,7 +1214,7 @@ static struct page *new_page_node(struct page *p, unsigned long private,
                return alloc_huge_page_node(page_hstate(compound_head(p)),
                                        pm->node);
        else
-               return alloc_pages_exact_node(pm->node,
+               return __alloc_pages_node(pm->node,
                                GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
 }
 
@@ -1219,7 +1246,9 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
                        goto set_status;
 
-               page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
+               /* FOLL_DUMP to ignore special (like zero) pages */
+               page = follow_page(vma, pp->addr,
+                               FOLL_GET | FOLL_SPLIT | FOLL_DUMP);
 
                err = PTR_ERR(page);
                if (IS_ERR(page))
@@ -1229,10 +1258,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                if (!page)
                        goto set_status;
 
-               /* Use PageReserved to check for zero page */
-               if (PageReserved(page))
-                       goto put_and_set;
-
                pp->page = page;
                err = page_to_nid(page);
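
do_move_page_to_node_array() above is the kernel half of move_pages(2) when a node array is supplied; FOLL_DUMP now makes follow_page() reject special pages such as the zero page instead of the old PageReserved test. The userspace sketch below exercises that path; the target node and the mmap'd buffer are assumptions for illustration, and it needs libnuma's <numaif.h> (link with -lnuma).

/* Ask the kernel to migrate one of our own pages to an assumed target node. */
#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long pagesize = sysconf(_SC_PAGESIZE);
        char *buf = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        void *pages[1] = { buf };
        int nodes[1] = { 0 };                   /* assumed target node */
        int status[1] = { -1 };

        if (buf == MAP_FAILED)
                return 1;
        buf[0] = 1;                             /* fault the page in so it can move */

        if (move_pages(0 /* self */, 1, pages, nodes, status, MPOL_MF_MOVE) < 0)
                perror("move_pages");
        printf("status[0] = %d (node id, or a negative errno)\n", status[0]);

        munmap(buf, pagesize);
        return 0;
}
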
 
@@ -1389,18 +1414,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
                if (!vma || addr < vma->vm_start)
                        goto set_status;
 
-               page = follow_page(vma, addr, 0);
+               /* FOLL_DUMP to ignore special (like zero) pages */
+               page = follow_page(vma, addr, FOLL_DUMP);
 
                err = PTR_ERR(page);
                if (IS_ERR(page))
                        goto set_status;
 
-               err = -ENOENT;
-               /* Use PageReserved to check for zero page */
-               if (!page || PageReserved(page))
-                       goto set_status;
-
-               err = page_to_nid(page);
+               err = page ? page_to_nid(page) : -ENOENT;
 set_status:
                *status = err;
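
do_pages_stat_array() serves the query form of move_pages(2): with nodes == NULL the kernel only reports, per page, the node it currently sits on or a negative errno. A hedged userspace sketch of that query follows; whether the read-only second page reports a node or an error depends on whether it is still backed by a special page such as the shared zero page.

/* Query page locations without moving anything (nodes == NULL). */
#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long pagesize = sysconf(_SC_PAGESIZE);
        char *buf = mmap(NULL, 2 * pagesize, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        void *pages[2];
        int status[2] = { 0, 0 };

        if (buf == MAP_FAILED)
                return 1;
        pages[0] = buf;
        pages[1] = buf + pagesize;

        buf[0] = 1;                               /* written: a real page on some node */
        (void)*(volatile char *)(buf + pagesize); /* only read: may stay special */

        if (move_pages(0, 2, pages, NULL, status, 0) < 0)
                perror("move_pages");
        printf("status: %d %d\n", status[0], status[1]);

        munmap(buf, 2 * pagesize);
        return 0;
}
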
 
@@ -1553,11 +1574,11 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
        int nid = (int) data;
        struct page *newpage;
 
-       newpage = alloc_pages_exact_node(nid,
+       newpage = __alloc_pages_node(nid,
                                         (GFP_HIGHUSER_MOVABLE |
                                          __GFP_THISNODE | __GFP_NOMEMALLOC |
                                          __GFP_NORETRY | __GFP_NOWARN) &
-                                        ~GFP_IOFS, 0);
+                                        ~__GFP_RECLAIM, 0);
 
        return newpage;
 }
@@ -1731,7 +1752,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
                goto out_dropref;
 
        new_page = alloc_pages_node(node,
-               (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_WAIT,
+               (GFP_TRANSHUGE | __GFP_THISNODE) & ~__GFP_RECLAIM,
                HPAGE_PMD_ORDER);
        if (!new_page)
                goto out_fail;
@@ -1768,7 +1789,6 @@ fail_putback:
                        SetPageActive(page);
                if (TestClearPageUnevictable(new_page))
                        SetPageUnevictable(page);
-               mlock_migrate_page(page, new_page);
 
                unlock_page(new_page);
                put_page(new_page);             /* Free it */
@@ -1796,7 +1816,7 @@ fail_putback:
         */
        flush_cache_range(vma, mmun_start, mmun_end);
        page_add_anon_rmap(new_page, vma, mmun_start);
-       pmdp_clear_flush_notify(vma, mmun_start, pmd);
+       pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
        set_pmd_at(mm, mmun_start, pmd, entry);
        flush_tlb_range(vma, mmun_start, mmun_end);
        update_mmu_cache_pmd(vma, address, &entry);
@@ -1810,8 +1830,9 @@ fail_putback:
                goto fail_putback;
        }
 
-       mem_cgroup_migrate(page, new_page, false);
-
+       mlock_migrate_page(new_page, page);
+       set_page_memcg(new_page, page_memcg(page));
+       set_page_memcg(page, NULL);
        page_remove_rmap(page);
 
        spin_unlock(ptl);