{
return !sc->target_mem_cgroup;
}
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+ struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+ if (!memcg)
+ return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ return true;
+#endif
+ return false;
+}
#else
static bool global_reclaim(struct scan_control *sc)
{
return true;
}
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+ return true;
+}
#endif
static unsigned long zone_reclaimable_pages(struct zone *zone)
{
- int nr;
+ unsigned long nr;
nr = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_FILE);
int nid = shrinkctl->nid;
long batch_size = shrinker->batch ? shrinker->batch
: SHRINK_BATCH;
+ long scanned = 0, next_deferred;
freeable = shrinker->count_objects(shrinker, shrinkctl);
if (freeable == 0)
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
total_scan = freeable;
- }
+ next_deferred = nr;
+ } else
+ next_deferred = total_scan;
/*
* We need to avoid excessive windup on filesystem shrinkers
count_vm_events(SLABS_SCANNED, nr_to_scan);
total_scan -= nr_to_scan;
+ scanned += nr_to_scan;
cond_resched();
}
+ if (next_deferred >= scanned)
+ next_deferred -= scanned;
+ else
+ next_deferred = 0;
/*
* move the unused scan count back into the shrinker in a
* manner that handles concurrent updates. If we exhausted the
* scan, there is no need to do an update.
*/
- if (total_scan > 0)
- new_nr = atomic_long_add_return(total_scan,
+ if (next_deferred > 0)
+ new_nr = atomic_long_add_return(next_deferred,
&shrinker->nr_deferred[nid]);
else
new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
return page_count(page) - page_has_private(page) == 2;
}
-static int may_write_to_queue(struct backing_dev_info *bdi,
- struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
{
if (current->flags & PF_SWAPWRITE)
return 1;
- if (!bdi_write_congested(bdi))
+ if (!inode_write_congested(inode))
return 1;
- if (bdi == current->backing_dev_info)
+ if (inode_to_bdi(inode) == current->backing_dev_info)
return 1;
return 0;
}
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+ if (!may_write_to_inode(mapping->host, sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
static int __remove_mapping(struct address_space *mapping, struct page *page,
bool reclaimed)
{
+ unsigned long flags;
+ struct mem_cgroup *memcg;
+
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
- spin_lock_irq(&mapping->tree_lock);
+ memcg = mem_cgroup_begin_page_stat(page);
+ spin_lock_irqsave(&mapping->tree_lock, flags);
/*
* The non racy check for a busy page.
*
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
__delete_from_swap_cache(page);
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
swapcache_free(swap);
} else {
void (*freepage)(struct page *);
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping))
shadow = workingset_eviction(mapping, page);
- __delete_from_page_cache(page, shadow);
- spin_unlock_irq(&mapping->tree_lock);
+ __delete_from_page_cache(page, shadow, memcg);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
if (freepage != NULL)
freepage(page);
return 1;
cannot_free:
- spin_unlock_irq(&mapping->tree_lock);
+ spin_unlock_irqrestore(&mapping->tree_lock, flags);
+ mem_cgroup_end_page_stat(memcg);
return 0;
}
*/
mapping = page_mapping(page);
if (((dirty || writeback) && mapping &&
- bdi_write_congested(inode_to_bdi(mapping->host))) ||
+ inode_write_congested(mapping->host)) ||
(writeback && PageReclaim(page)))
nr_congested++;
* note that the LRU is being scanned too quickly and the
* caller can stall after page list has been processed.
*
- * 2) Global reclaim encounters a page, memcg encounters a
- * page that is not marked for immediate reclaim or
- * the caller does not have __GFP_FS (or __GFP_IO if it's
- * simply going to swap, not to fs). In this case mark
- * the page for immediate reclaim and continue scanning.
+ * 2) Global or new memcg reclaim encounters a page that is
+ * not marked for immediate reclaim, or the caller does not
+ * have __GFP_FS (or __GFP_IO if it's simply going to swap,
+ * not to fs). In this case mark the page for immediate
+ * reclaim and continue scanning.
*
* Require may_enter_fs because we would wait on fs, which
* may not have submitted IO yet. And the loop driver might
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * 3) memcg encounters a page that is not already marked
+ * 3) Legacy memcg encounters a page that is already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
* pages are in writeback and there is nothing else to
goto keep_locked;
/* Case 2 above */
- } else if (global_reclaim(sc) ||
+ } else if (sane_reclaim(sc) ||
!PageReclaim(page) || !may_enter_fs) {
/*
* This is slightly racy - end_page_writeback()
*/
SetPageReclaim(page);
nr_writeback++;
-
goto keep_locked;
/* Case 3 above */
} else {
+ unlock_page(page);
wait_on_page_writeback(page);
+ /* then go back and try same page again */
+ list_add_tail(&page->lru, page_list);
+ continue;
}
}
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page, ttu_flags)) {
+ switch (try_to_unmap(page,
+ ttu_flags|TTU_BATCH_FLUSH)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
if (!sc->may_writepage)
goto keep_locked;
- /* Page is dirty, try to write it out here */
+ /*
+ * Page is dirty. Flush the TLB if a writable entry
+ * potentially exists to avoid CPU writes after IO
+ * starts and then write it out here.
+ */
+ try_to_unmap_flush_dirty();
switch (pageout(page, mapping, sc)) {
case PAGE_KEEP:
goto keep_locked;
if (PageSwapCache(page))
try_to_free_swap(page);
unlock_page(page);
- putback_lru_page(page);
+ list_add(&page->lru, &ret_pages);
continue;
activate_locked:
}
mem_cgroup_uncharge_list(&free_pages);
+ try_to_unmap_flush();
free_hot_cold_page_list(&free_pages, true);
list_splice(&ret_pages, page_list);
unsigned long nr_taken = 0;
unsigned long scan;
- for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+ for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
+ !list_empty(src); scan++) {
struct page *page;
int nr_pages;
if (current_is_kswapd())
return 0;
- if (!global_reclaim(sc))
+ if (!sane_reclaim(sc))
return 0;
if (file) {
* won't get blocked by normal direct-reclaimers, forming a circular
* deadlock.
*/
- if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+ if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
inactive >>= 3;
return isolated > inactive;
set_bit(ZONE_WRITEBACK, &zone->flags);
/*
- * memcg will stall in page writeback so only consider forcibly
- * stalling for global reclaim
+ * Legacy memcg will stall in page writeback so avoid forcibly
+ * stalling here.
*/
- if (global_reclaim(sc)) {
+ if (sane_reclaim(sc)) {
/*
* Tag a zone as congested if all the dirty pages scanned were
* backed by a congested BDI and wait_iff_congested will stall.
}
#ifdef CONFIG_SWAP
-static int inactive_anon_is_low_global(struct zone *zone)
+static bool inactive_anon_is_low_global(struct zone *zone)
{
unsigned long active, inactive;
active = zone_page_state(zone, NR_ACTIVE_ANON);
inactive = zone_page_state(zone, NR_INACTIVE_ANON);
- if (inactive * zone->inactive_ratio < active)
- return 1;
-
- return 0;
+ return inactive * zone->inactive_ratio < active;
}
/**
* Returns true if the zone does not have enough inactive anon pages,
* meaning some active anon pages need to be deactivated.
*/
-static int inactive_anon_is_low(struct lruvec *lruvec)
+static bool inactive_anon_is_low(struct lruvec *lruvec)
{
/*
* If we don't have swap space, anonymous page deactivation
* is pointless.
*/
if (!total_swap_pages)
- return 0;
+ return false;
if (!mem_cgroup_disabled())
return mem_cgroup_inactive_anon_is_low(lruvec);
return inactive_anon_is_low_global(lruvec_zone(lruvec));
}
#else
-static inline int inactive_anon_is_low(struct lruvec *lruvec)
+static inline bool inactive_anon_is_low(struct lruvec *lruvec)
{
- return 0;
+ return false;
}
#endif
* This uses a different ratio than the anonymous pages, because
* the page cache uses a use-once replacement algorithm.
*/
-static int inactive_file_is_low(struct lruvec *lruvec)
+static bool inactive_file_is_low(struct lruvec *lruvec)
{
unsigned long inactive;
unsigned long active;
return active > inactive;
}
-static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
+static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
{
if (is_file_lru(lru))
return inactive_file_is_low(lruvec);
balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
- watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
+ watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0);
/*
* If compaction is deferred, reclaim up to a point where
sc->gfp_mask |= __GFP_HIGHMEM;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
- requested_highidx, sc->nodemask) {
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
enum zone_type classzone_idx;
if (!populated_zone(zone))
for (i = 0; i <= ZONE_NORMAL; i++) {
zone = &pgdat->node_zones[i];
- if (!populated_zone(zone))
+ if (!populated_zone(zone) ||
+ zone_reclaimable_pages(zone) == 0)
continue;
pfmemalloc_reserve += min_wmark_pages(zone);
sc.may_writepage,
sc.gfp_mask);
+ current->flags |= PF_MEMALLOC;
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+ current->flags &= ~PF_MEMALLOC;
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
unsigned long balance_gap, int classzone_idx)
{
if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) +
- balance_gap, classzone_idx, 0))
+ balance_gap, classzone_idx))
return false;
if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone,
#define RECLAIM_OFF 0
#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
-#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
+#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */
/*
* Priority for ZONE_RECLAIM. This determines the fraction of pages
}
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
-static long zone_pagecache_reclaimable(struct zone *zone)
+static unsigned long zone_pagecache_reclaimable(struct zone *zone)
{
- long nr_pagecache_reclaimable;
- long delta = 0;
+ unsigned long nr_pagecache_reclaimable;
+ unsigned long delta = 0;
/*
- * If RECLAIM_SWAP is set, then all file pages are considered
+ * If RECLAIM_UNMAP is set, then all file pages are considered
* potentially reclaimable. Otherwise, we have to worry about
* pages like swapcache and zone_unmapped_file_pages() provides
* a better estimate
*/
- if (zone_reclaim_mode & RECLAIM_SWAP)
+ if (zone_reclaim_mode & RECLAIM_UNMAP)
nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
else
nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
.order = order,
.priority = ZONE_RECLAIM_PRIORITY,
.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
- .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
.may_swap = 1,
};
cond_resched();
/*
- * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * We need to be able to allocate from the reserves for RECLAIM_UNMAP
* and we also need to be able to write out pages for RECLAIM_WRITE
- * and RECLAIM_SWAP.
+ * and RECLAIM_UNMAP.
*/
p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
lockdep_set_current_reclaim_state(gfp_mask);
/*
* Do not scan if the allocation should not be delayed.
*/
- if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
+ if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
return ZONE_RECLAIM_NOSCAN;
/*