X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?p=kvmfornfv.git;a=blobdiff_plain;f=kernel%2Fmm%2Fvmscan.c;h=2aec4241b42a9f0b02c8f4124698d6b595c96390;hp=1a17bd7c0ce582b1f1de89a5c0f9fd2f5ba9e611;hb=e09b41010ba33a20a87472ee821fa407a5b8da36;hpb=f93b97fd65072de626c074dbe099a1fff05ce060 diff --git a/kernel/mm/vmscan.c b/kernel/mm/vmscan.c index 1a17bd7c0..2aec4241b 100644 --- a/kernel/mm/vmscan.c +++ b/kernel/mm/vmscan.c @@ -154,16 +154,47 @@ static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; } + +/** + * sane_reclaim - is the usual dirty throttling mechanism operational? + * @sc: scan_control in question + * + * The normal page dirty throttling mechanism in balance_dirty_pages() is + * completely broken with the legacy memcg and direct stalling in + * shrink_page_list() is used for throttling instead, which lacks all the + * niceties such as fairness, adaptive pausing, bandwidth proportional + * allocation and configurability. + * + * This function tests whether the vmscan currently in progress can assume + * that the normal dirty throttling mechanism is operational. + */ +static bool sane_reclaim(struct scan_control *sc) +{ + struct mem_cgroup *memcg = sc->target_mem_cgroup; + + if (!memcg) + return true; +#ifdef CONFIG_CGROUP_WRITEBACK + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return true; +#endif + return false; +} #else static bool global_reclaim(struct scan_control *sc) { return true; } + +static bool sane_reclaim(struct scan_control *sc) +{ + return true; +} #endif static unsigned long zone_reclaimable_pages(struct zone *zone) { - int nr; + unsigned long nr; nr = zone_page_state(zone, NR_ACTIVE_FILE) + zone_page_state(zone, NR_INACTIVE_FILE); @@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi, - struct scan_control *sc) +static int may_write_to_inode(struct inode *inode, struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; - if (!bdi_write_congested(bdi)) + if (!inode_write_congested(inode)) return 1; - if (bdi == current->backing_dev_info) + if (inode_to_bdi(inode) == current->backing_dev_info) return 1; return 0; } @@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(inode_to_bdi(mapping->host), sc)) + if (!may_write_to_inode(mapping->host, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { + unsigned long flags; + struct mem_cgroup *memcg; + BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - spin_lock_irq(&mapping->tree_lock); + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. * @@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); swapcache_free(swap); } else { void (*freepage)(struct page *); @@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping)) shadow = workingset_eviction(mapping, page); - __delete_from_page_cache(page, shadow); - spin_unlock_irq(&mapping->tree_lock); + __delete_from_page_cache(page, shadow, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (freepage != NULL) freepage(page); @@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, return 1; cannot_free: - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); return 0; } @@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && - bdi_write_congested(inode_to_bdi(mapping->host))) || + inode_write_congested(mapping->host)) || (writeback && PageReclaim(page))) nr_congested++; @@ -935,11 +972,11 @@ static unsigned long shrink_page_list(struct list_head *page_list, * note that the LRU is being scanned too quickly and the * caller can stall after page list has been processed. * - * 2) Global reclaim encounters a page, memcg encounters a - * page that is not marked for immediate reclaim or - * the caller does not have __GFP_FS (or __GFP_IO if it's - * simply going to swap, not to fs). In this case mark - * the page for immediate reclaim and continue scanning. + * 2) Global or new memcg reclaim encounters a page that is + * not marked for immediate reclaim, or the caller does not + * have __GFP_FS (or __GFP_IO if it's simply going to swap, + * not to fs). In this case mark the page for immediate + * reclaim and continue scanning. * * Require may_enter_fs because we would wait on fs, which * may not have submitted IO yet. And the loop driver might @@ -948,7 +985,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * - * 3) memcg encounters a page that is not already marked + * 3) Legacy memcg encounters a page that is already marked * PageReclaim. memcg does not have any dirty pages * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to @@ -963,7 +1000,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Case 2 above */ - } else if (global_reclaim(sc) || + } else if (sane_reclaim(sc) || !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() @@ -978,12 +1015,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; - goto keep_locked; /* Case 3 above */ } else { + unlock_page(page); wait_on_page_writeback(page); + /* then go back and try same page again */ + list_add_tail(&page->lru, page_list); + continue; } } @@ -1020,7 +1060,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + switch (try_to_unmap(page, + ttu_flags|TTU_BATCH_FLUSH)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1060,7 +1101,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!sc->may_writepage) goto keep_locked; - /* Page is dirty, try to write it out here */ + /* + * Page is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after IO + * starts and then write it out here. + */ + try_to_unmap_flush_dirty(); switch (pageout(page, mapping, sc)) { case PAGE_KEEP: goto keep_locked; @@ -1171,6 +1217,7 @@ keep: } mem_cgroup_uncharge_list(&free_pages); + try_to_unmap_flush(); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); @@ -1315,7 +1362,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long scan; - for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && + !list_empty(src); scan++) { struct page *page; int nr_pages; @@ -1412,7 +1460,7 @@ static int too_many_isolated(struct zone *zone, int file, if (current_is_kswapd()) return 0; - if (!global_reclaim(sc)) + if (!sane_reclaim(sc)) return 0; if (file) { @@ -1428,7 +1476,7 @@ static int too_many_isolated(struct zone *zone, int file, * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; return isolated > inactive; @@ -1604,10 +1652,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, set_bit(ZONE_WRITEBACK, &zone->flags); /* - * memcg will stall in page writeback so only consider forcibly - * stalling for global reclaim + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling here. */ - if (global_reclaim(sc)) { + if (sane_reclaim(sc)) { /* * Tag a zone as congested if all the dirty pages scanned were * backed by a congested BDI and wait_iff_congested will stall. @@ -1811,17 +1859,14 @@ static void shrink_active_list(unsigned long nr_to_scan, } #ifdef CONFIG_SWAP -static int inactive_anon_is_low_global(struct zone *zone) +static bool inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; active = zone_page_state(zone, NR_ACTIVE_ANON); inactive = zone_page_state(zone, NR_INACTIVE_ANON); - if (inactive * zone->inactive_ratio < active) - return 1; - - return 0; + return inactive * zone->inactive_ratio < active; } /** @@ -1831,14 +1876,14 @@ static int inactive_anon_is_low_global(struct zone *zone) * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct lruvec *lruvec) +static bool inactive_anon_is_low(struct lruvec *lruvec) { /* * If we don't have swap space, anonymous page deactivation * is pointless. */ if (!total_swap_pages) - return 0; + return false; if (!mem_cgroup_disabled()) return mem_cgroup_inactive_anon_is_low(lruvec); @@ -1846,9 +1891,9 @@ static int inactive_anon_is_low(struct lruvec *lruvec) return inactive_anon_is_low_global(lruvec_zone(lruvec)); } #else -static inline int inactive_anon_is_low(struct lruvec *lruvec) +static inline bool inactive_anon_is_low(struct lruvec *lruvec) { - return 0; + return false; } #endif @@ -1866,7 +1911,7 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec) * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. */ -static int inactive_file_is_low(struct lruvec *lruvec) +static bool inactive_file_is_low(struct lruvec *lruvec) { unsigned long inactive; unsigned long active; @@ -1877,7 +1922,7 @@ static int inactive_file_is_low(struct lruvec *lruvec) return active > inactive; } -static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) +static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) { if (is_file_lru(lru)) return inactive_file_is_low(lruvec); @@ -2114,6 +2159,23 @@ out: } } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void init_tlb_ubc(void) +{ + /* + * This deliberately does not clear the cpumask as it's expensive + * and unnecessary. If there happens to be data in there then the + * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and + * then will be cleared. + */ + current->tlb_ubc.flush_required = false; +} +#else +static inline void init_tlb_ubc(void) +{ +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -2148,6 +2210,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); + init_tlb_ubc(); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2413,7 +2477,7 @@ static inline bool compaction_ready(struct zone *zone, int order) balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0); /* * If compaction is deferred, reclaim up to a point where @@ -2642,7 +2706,8 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone)) + if (!populated_zone(zone) || + zone_reclaimable_pages(zone) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2895,7 +2960,7 @@ static bool zone_balanced(struct zone *zone, int order, unsigned long balance_gap, int classzone_idx) { if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx, 0)) + balance_gap, classzone_idx)) return false; if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, @@ -3592,7 +3657,7 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_OFF 0 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* * Priority for ZONE_RECLAIM. This determines the fraction of pages @@ -3628,18 +3693,18 @@ static inline unsigned long zone_unmapped_file_pages(struct zone *zone) } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long zone_pagecache_reclaimable(struct zone *zone) { - long nr_pagecache_reclaimable; - long delta = 0; + unsigned long nr_pagecache_reclaimable; + unsigned long delta = 0; /* - * If RECLAIM_SWAP is set, then all file pages are considered + * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about * pages like swapcache and zone_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_SWAP) + if (zone_reclaim_mode & RECLAIM_UNMAP) nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); else nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); @@ -3670,15 +3735,15 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .order = order, .priority = ZONE_RECLAIM_PRIORITY, .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), .may_swap = 1, }; cond_resched(); /* - * We need to be able to allocate from the reserves for RECLAIM_SWAP + * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE - * and RECLAIM_SWAP. + * and RECLAIM_UNMAP. */ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); @@ -3726,7 +3791,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) /* * Do not scan if the allocation should not be delayed. */ - if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) + if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return ZONE_RECLAIM_NOSCAN; /*