Upgrade to 4.4.50-rt62

[kvmfornfv.git] / kernel / mm / vmscan.c
diff --git a/kernel/mm/vmscan.c b/kernel/mm/vmscan.c

index 5e8eadd..bfc5050 100644 (file)
--- a/kernel/mm/vmscan.c
+++ b/kernel/mm/vmscan.c
@@ -154,16 +154,47 @@ static bool global_reclaim(struct scan_control *sc)
  {
         return !sc->target_mem_cgroup;
  }
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+       struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+       if (!memcg)
+               return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+               return true;
+#endif
+       return false;
+}
  #else
  static bool global_reclaim(struct scan_control *sc)
  {
         return true;
  }
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+       return true;
+}
  #endif
  
  static unsigned long zone_reclaimable_pages(struct zone *zone)
  {
-       int nr;
+       unsigned long nr;
  
         nr = zone_page_state(zone, NR_ACTIVE_FILE) +
              zone_page_state(zone, NR_INACTIVE_FILE);
@@ -246,6 +277,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
         int nid = shrinkctl->nid;
         long batch_size = shrinker->batch ? shrinker->batch
                                           : SHRINK_BATCH;
+       long scanned = 0, next_deferred;
  
         freeable = shrinker->count_objects(shrinker, shrinkctl);
         if (freeable == 0)
@@ -267,7 +299,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
                        shrinker->scan_objects, total_scan);
                 total_scan = freeable;
-       }
+               next_deferred = nr;
+       } else
+               next_deferred = total_scan;
  
         /*
          * We need to avoid excessive windup on filesystem shrinkers
@@ -324,17 +358,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  
                 count_vm_events(SLABS_SCANNED, nr_to_scan);
                 total_scan -= nr_to_scan;
+               scanned += nr_to_scan;
  
                 cond_resched();
         }
  
+       if (next_deferred >= scanned)
+               next_deferred -= scanned;
+       else
+               next_deferred = 0;
         /*
          * move the unused scan count back into the shrinker in a
          * manner that handles concurrent updates. If we exhausted the
          * scan, there is no need to do an update.
          */
-       if (total_scan > 0)
-               new_nr = atomic_long_add_return(total_scan,
+       if (next_deferred > 0)
+               new_nr = atomic_long_add_return(next_deferred,
                                                 &shrinker->nr_deferred[nid]);
         else
                 new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
@@ -452,14 +491,13 @@ static inline int is_page_cache_freeable(struct page *page)
         return page_count(page) - page_has_private(page) == 2;
  }
  
-static int may_write_to_queue(struct backing_dev_info *bdi,
-                             struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
  {
         if (current->flags & PF_SWAPWRITE)
                 return 1;
-       if (!bdi_write_congested(bdi))
+       if (!inode_write_congested(inode))
                 return 1;
-       if (bdi == current->backing_dev_info)
+       if (inode_to_bdi(inode) == current->backing_dev_info)
                 return 1;
         return 0;
  }
@@ -538,7 +576,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
         }
         if (mapping->a_ops->writepage == NULL)
                 return PAGE_ACTIVATE;
-       if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+       if (!may_write_to_inode(mapping->host, sc))
                 return PAGE_KEEP;
  
         if (clear_page_dirty_for_io(page)) {
@@ -579,10 +617,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
  static int __remove_mapping(struct address_space *mapping, struct page *page,
                             bool reclaimed)
  {
+       unsigned long flags;
+       struct mem_cgroup *memcg;
+
         BUG_ON(!PageLocked(page));
         BUG_ON(mapping != page_mapping(page));
  
-       spin_lock_irq(&mapping->tree_lock);
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
         /*
          * The non racy check for a busy page.
          *
@@ -620,7 +662,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                 swp_entry_t swap = { .val = page_private(page) };
                 mem_cgroup_swapout(page, swap);
                 __delete_from_swap_cache(page);
-               spin_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                 swapcache_free(swap);
         } else {
                 void (*freepage)(struct page *);
@@ -640,8 +683,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                 if (reclaimed && page_is_file_cache(page) &&
                     !mapping_exiting(mapping))
                         shadow = workingset_eviction(mapping, page);
-               __delete_from_page_cache(page, shadow);
-               spin_unlock_irq(&mapping->tree_lock);
+               __delete_from_page_cache(page, shadow, memcg);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
  
                 if (freepage != NULL)
                         freepage(page);
@@ -650,7 +694,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
         return 1;
  
  cannot_free:
-       spin_unlock_irq(&mapping->tree_lock);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
         return 0;
  }
  
@@ -917,7 +962,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                  */
                 mapping = page_mapping(page);
                 if (((dirty || writeback) && mapping &&
-                    bdi_write_congested(inode_to_bdi(mapping->host))) ||
+                    inode_write_congested(mapping->host)) ||
                     (writeback && PageReclaim(page)))
                         nr_congested++;
  
@@ -935,24 +980,20 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                  *    note that the LRU is being scanned too quickly and the
                  *    caller can stall after page list has been processed.
                  *
-                * 2) Global reclaim encounters a page, memcg encounters a
-                *    page that is not marked for immediate reclaim or
-                *    the caller does not have __GFP_IO. In this case mark
-                *    the page for immediate reclaim and continue scanning.
+                * 2) Global or new memcg reclaim encounters a page that is
+                *    not marked for immediate reclaim, or the caller does not
+                *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
+                *    not to fs). In this case mark the page for immediate
+                *    reclaim and continue scanning.
                  *
-                *    __GFP_IO is checked  because a loop driver thread might
+                *    Require may_enter_fs because we would wait on fs, which
+                *    may not have submitted IO yet. And the loop driver might
                  *    enter reclaim, and deadlock if it waits on a page for
                  *    which it is needed to do the write (loop masks off
                  *    __GFP_IO|__GFP_FS for this reason); but more thought
                  *    would probably show more reasons.
                  *
-                *    Don't require __GFP_FS, since we're not going into the
-                *    FS, just waiting on its writeback completion. Worryingly,
-                *    ext4 gfs2 and xfs allocate pages with
-                *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
-                *    may_enter_fs here is liable to OOM on them.
-                *
-                * 3) memcg encounters a page that is not already marked
+                * 3) Legacy memcg encounters a page that is already marked
                  *    PageReclaim. memcg does not have any dirty pages
                  *    throttling so we could easily OOM just because too many
                  *    pages are in writeback and there is nothing else to
@@ -967,8 +1008,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                 goto keep_locked;
  
                         /* Case 2 above */
-                       } else if (global_reclaim(sc) ||
-                           !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+                       } else if (sane_reclaim(sc) ||
+                           !PageReclaim(page) || !may_enter_fs) {
                                 /*
                                  * This is slightly racy - end_page_writeback()
                                  * might have just cleared PageReclaim, then
@@ -982,12 +1023,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                  */
                                 SetPageReclaim(page);
                                 nr_writeback++;
-
                                 goto keep_locked;
  
                         /* Case 3 above */
                         } else {
+                               unlock_page(page);
                                 wait_on_page_writeback(page);
+                               /* then go back and try same page again */
+                               list_add_tail(&page->lru, page_list);
+                               continue;
                         }
                 }
  
@@ -1024,7 +1068,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                  * processes. Try to unmap it here.
                  */
                 if (page_mapped(page) && mapping) {
-                       switch (try_to_unmap(page, ttu_flags)) {
+                       switch (try_to_unmap(page,
+                                       ttu_flags|TTU_BATCH_FLUSH)) {
                         case SWAP_FAIL:
                                 goto activate_locked;
                         case SWAP_AGAIN:
@@ -1064,7 +1109,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         if (!sc->may_writepage)
                                 goto keep_locked;
  
-                       /* Page is dirty, try to write it out here */
+                       /*
+                        * Page is dirty. Flush the TLB if a writable entry
+                        * potentially exists to avoid CPU writes after IO
+                        * starts and then write it out here.
+                        */
+                       try_to_unmap_flush_dirty();
                         switch (pageout(page, mapping, sc)) {
                         case PAGE_KEEP:
                                 goto keep_locked;
@@ -1157,7 +1207,7 @@ cull_mlocked:
                 if (PageSwapCache(page))
                         try_to_free_swap(page);
                 unlock_page(page);
-               putback_lru_page(page);
+               list_add(&page->lru, &ret_pages);
                 continue;
  
  activate_locked:
@@ -1175,6 +1225,7 @@ keep:
         }
  
         mem_cgroup_uncharge_list(&free_pages);
+       try_to_unmap_flush();
         free_hot_cold_page_list(&free_pages, true);
  
         list_splice(&ret_pages, page_list);
@@ -1319,7 +1370,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
         unsigned long nr_taken = 0;
         unsigned long scan;
  
-       for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+       for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
+                                       !list_empty(src); scan++) {
                 struct page *page;
                 int nr_pages;
  
@@ -1416,7 +1468,7 @@ static int too_many_isolated(struct zone *zone, int file,
         if (current_is_kswapd())
                 return 0;
  
-       if (!global_reclaim(sc))
+       if (!sane_reclaim(sc))
                 return 0;
  
         if (file) {
@@ -1432,7 +1484,7 @@ static int too_many_isolated(struct zone *zone, int file,
          * won't get blocked by normal direct-reclaimers, forming a circular
          * deadlock.
          */
-       if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+       if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
                 inactive >>= 3;
  
         return isolated > inactive;
@@ -1608,10 +1660,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                 set_bit(ZONE_WRITEBACK, &zone->flags);
  
         /*
-        * memcg will stall in page writeback so only consider forcibly
-        * stalling for global reclaim
+        * Legacy memcg will stall in page writeback so avoid forcibly
+        * stalling here.
          */
-       if (global_reclaim(sc)) {
+       if (sane_reclaim(sc)) {
                 /*
                  * Tag a zone as congested if all the dirty pages scanned were
                  * backed by a congested BDI and wait_iff_congested will stall.
@@ -1815,17 +1867,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
  }
  
  #ifdef CONFIG_SWAP
-static int inactive_anon_is_low_global(struct zone *zone)
+static bool inactive_anon_is_low_global(struct zone *zone)
  {
         unsigned long active, inactive;
  
         active = zone_page_state(zone, NR_ACTIVE_ANON);
         inactive = zone_page_state(zone, NR_INACTIVE_ANON);
  
-       if (inactive * zone->inactive_ratio < active)
-               return 1;
-
-       return 0;
+       return inactive * zone->inactive_ratio < active;
  }
  
  /**
@@ -1835,14 +1884,14 @@ static int inactive_anon_is_low_global(struct zone *zone)
   * Returns true if the zone does not have enough inactive anon pages,
   * meaning some active anon pages need to be deactivated.
   */
-static int inactive_anon_is_low(struct lruvec *lruvec)
+static bool inactive_anon_is_low(struct lruvec *lruvec)
  {
         /*
          * If we don't have swap space, anonymous page deactivation
          * is pointless.
          */
         if (!total_swap_pages)
-               return 0;
+               return false;
  
         if (!mem_cgroup_disabled())
                 return mem_cgroup_inactive_anon_is_low(lruvec);
@@ -1850,9 +1899,9 @@ static int inactive_anon_is_low(struct lruvec *lruvec)
         return inactive_anon_is_low_global(lruvec_zone(lruvec));
  }
  #else
-static inline int inactive_anon_is_low(struct lruvec *lruvec)
+static inline bool inactive_anon_is_low(struct lruvec *lruvec)
  {
-       return 0;
+       return false;
  }
  #endif
  
@@ -1870,7 +1919,7 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
   * This uses a different ratio than the anonymous pages, because
   * the page cache uses a use-once replacement algorithm.
   */
-static int inactive_file_is_low(struct lruvec *lruvec)
+static bool inactive_file_is_low(struct lruvec *lruvec)
  {
         unsigned long inactive;
         unsigned long active;
@@ -1881,7 +1930,7 @@ static int inactive_file_is_low(struct lruvec *lruvec)
         return active > inactive;
  }
  
-static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
+static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
  {
         if (is_file_lru(lru))
                 return inactive_file_is_low(lruvec);
@@ -2417,7 +2466,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
         balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
                         zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
         watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
-       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+       watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0);
  
         /*
          * If compaction is deferred, reclaim up to a point where
@@ -2474,7 +2523,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                 sc->gfp_mask |= __GFP_HIGHMEM;
  
         for_each_zone_zonelist_nodemask(zone, z, zonelist,
-                                       requested_highidx, sc->nodemask) {
+                                       gfp_zone(sc->gfp_mask), sc->nodemask) {
                 enum zone_type classzone_idx;
  
                 if (!populated_zone(zone))
@@ -2646,7 +2695,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
  
         for (i = 0; i <= ZONE_NORMAL; i++) {
                 zone = &pgdat->node_zones[i];
-               if (!populated_zone(zone))
+               if (!populated_zone(zone) ||
+                   zone_reclaimable_pages(zone) == 0)
                         continue;
  
                 pfmemalloc_reserve += min_wmark_pages(zone);
@@ -2868,7 +2918,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                             sc.may_writepage,
                                             sc.gfp_mask);
  
+       current->flags |= PF_MEMALLOC;
         nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+       current->flags &= ~PF_MEMALLOC;
  
         trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
  
@@ -2899,7 +2951,7 @@ static bool zone_balanced(struct zone *zone, int order,
                           unsigned long balance_gap, int classzone_idx)
  {
         if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
-                                   balance_gap, classzone_idx, 0))
+                                   balance_gap, classzone_idx))
                 return false;
  
         if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
@@ -3596,7 +3648,7 @@ int zone_reclaim_mode __read_mostly;
  #define RECLAIM_OFF 0
  #define RECLAIM_ZONE (1<<0)    /* Run shrink_inactive_list on the zone */
  #define RECLAIM_WRITE (1<<1)   /* Writeout pages during reclaim */
-#define RECLAIM_SWAP (1<<2)    /* Swap pages out during reclaim */
+#define RECLAIM_UNMAP (1<<2)   /* Unmap pages during reclaim */
  
  /*
   * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@ -3632,18 +3684,18 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
  }
  
  /* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long zone_pagecache_reclaimable(struct zone *zone)
  {
-       long nr_pagecache_reclaimable;
-       long delta = 0;
+       unsigned long nr_pagecache_reclaimable;
+       unsigned long delta = 0;
  
         /*
-        * If RECLAIM_SWAP is set, then all file pages are considered
+        * If RECLAIM_UNMAP is set, then all file pages are considered
          * potentially reclaimable. Otherwise, we have to worry about
          * pages like swapcache and zone_unmapped_file_pages() provides
          * a better estimate
          */
-       if (zone_reclaim_mode & RECLAIM_SWAP)
+       if (zone_reclaim_mode & RECLAIM_UNMAP)
                 nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
         else
                 nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
@@ -3674,15 +3726,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                 .order = order,
                 .priority = ZONE_RECLAIM_PRIORITY,
                 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-               .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+               .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
                 .may_swap = 1,
         };
  
         cond_resched();
         /*
-        * We need to be able to allocate from the reserves for RECLAIM_SWAP
+        * We need to be able to allocate from the reserves for RECLAIM_UNMAP
          * and we also need to be able to write out pages for RECLAIM_WRITE
-        * and RECLAIM_SWAP.
+        * and RECLAIM_UNMAP.
          */
         p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
         lockdep_set_current_reclaim_state(gfp_mask);
@@ -3730,7 +3782,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         /*
          * Do not scan if the allocation should not be delayed.
          */
-       if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
+       if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
                 return ZONE_RECLAIM_NOSCAN;
  
         /*