These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] kernel/mm/compaction.c
index 0af17fe..ba0f146 100644
@@ -35,17 +35,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
 #endif
 
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
-#ifdef CONFIG_TRACEPOINTS
-static const char *const compaction_status_string[] = {
-       "deferred",
-       "skipped",
-       "continue",
-       "partial",
-       "complete",
-       "no_suitable_page",
-       "not_suitable_zone",
-};
-#endif
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/compaction.h>
@@ -207,6 +196,13 @@ static inline bool isolation_suitable(struct compact_control *cc,
        return !get_pageblock_skip(page);
 }
 
+static void reset_cached_positions(struct zone *zone)
+{
+       zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
+       zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
+       zone->compact_cached_free_pfn = zone_end_pfn(zone);
+}
+
 /*
  * This function is called to clear all cached information on pageblocks that
  * should be skipped for page isolation when the migrate and free page scanner
@@ -218,9 +214,6 @@ static void __reset_isolation_suitable(struct zone *zone)
        unsigned long end_pfn = zone_end_pfn(zone);
        unsigned long pfn;
 
-       zone->compact_cached_migrate_pfn[0] = start_pfn;
-       zone->compact_cached_migrate_pfn[1] = start_pfn;
-       zone->compact_cached_free_pfn = end_pfn;
        zone->compact_blockskip_flush = false;
 
        /* Walk the zone and mark every pageblock as suitable for isolation */
@@ -238,6 +231,8 @@ static void __reset_isolation_suitable(struct zone *zone)
 
                clear_pageblock_skip(page);
        }
+
+       reset_cached_positions(zone);
 }
 
 void reset_isolation_suitable(pg_data_t *pgdat)
@@ -431,6 +426,24 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
 
                if (!valid_page)
                        valid_page = page;
+
+               /*
+                * For compound pages such as THP and hugetlbfs, we can save
+                * potentially a lot of iterations if we skip them at once.
+                * The check is racy, but we can consider only valid values
+                * and the only danger is skipping too much.
+                */
+               if (PageCompound(page)) {
+                       unsigned int comp_order = compound_order(page);
+
+                       if (likely(comp_order < MAX_ORDER)) {
+                               blockpfn += (1UL << comp_order) - 1;
+                               cursor += (1UL << comp_order) - 1;
+                       }
+
+                       goto isolate_fail;
+               }
+
                if (!PageBuddy(page))
                        goto isolate_fail;
 
@@ -490,6 +503,13 @@ isolate_fail:
 
        }
 
+       /*
+        * There is a tiny chance that we have read bogus compound_order(),
+        * so be careful to not go outside of the pageblock.
+        */
+       if (unlikely(blockpfn > end_pfn))
+               blockpfn = end_pfn;
+
        trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
                                        nr_scanned, total_isolated);
 
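A minimal userspace sketch of the skip-and-clamp arithmetic the two hunks above introduce for the free scanner: a compound page is stepped over in one go using its order, and because compound_order() is read without locking and can briefly return a stale value, the scan position is clamped back to the pageblock end afterwards. skip_compound(), the MAX_ORDER value and the pfn numbers below are illustrative stand-ins, not the kernel's definitions.

    #include <stdio.h>

    #define MAX_ORDER 11    /* illustrative; the kernel value is config-dependent */

    /* Advance a scan position past a compound page of the given order. */
    static unsigned long skip_compound(unsigned long pfn, unsigned int order)
    {
            if (order < MAX_ORDER)              /* ignore a bogus racy read */
                    pfn += (1UL << order) - 1;  /* the loop's pfn++ adds the final +1 */
            return pfn;
    }

    int main(void)
    {
            unsigned long pfn = 1024, block_end = 1536;     /* one 512-page pageblock */

            pfn = skip_compound(pfn, 9) + 1;    /* jump over a 512-page THP */
            if (pfn > block_end)                /* clamp: a stale order may overshoot */
                    pfn = block_end;
            printf("resume free scan at pfn %lu\n", pfn);
            return 0;
    }
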
@@ -674,6 +694,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
        /* Time to isolate some pages for migration */
        for (; low_pfn < end_pfn; low_pfn++) {
+               bool is_lru;
+
                /*
                 * Periodically drop the lock (if held) regardless of its
                 * contention, to give chance to IRQs. Abort async compaction
@@ -717,36 +739,35 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                 * It's possible to migrate LRU pages and balloon pages
                 * Skip any other type of page
                 */
-               if (!PageLRU(page)) {
+               is_lru = PageLRU(page);
+               if (!is_lru) {
                        if (unlikely(balloon_page_movable(page))) {
                                if (balloon_page_isolate(page)) {
                                        /* Successfully isolated */
                                        goto isolate_success;
                                }
                        }
-                       continue;
                }
 
                /*
-                * PageLRU is set. lru_lock normally excludes isolation
-                * splitting and collapsing (collapsing has already happened
-                * if PageLRU is set) but the lock is not necessarily taken
-                * here and it is wasteful to take it just to check transhuge.
-                * Check TransHuge without lock and skip the whole pageblock if
-                * it's either a transhuge or hugetlbfs page, as calling
-                * compound_order() without preventing THP from splitting the
-                * page underneath us may return surprising results.
+                * Regardless of being on LRU, compound pages such as THP and
+                * hugetlbfs are not to be compacted. We can potentially save
+                * a lot of iterations if we skip them at once. The check is
+                * racy, but we can consider only valid values and the only
+                * danger is skipping too much.
                 */
-               if (PageTransHuge(page)) {
-                       if (!locked)
-                               low_pfn = ALIGN(low_pfn + 1,
-                                               pageblock_nr_pages) - 1;
-                       else
-                               low_pfn += (1 << compound_order(page)) - 1;
+               if (PageCompound(page)) {
+                       unsigned int comp_order = compound_order(page);
+
+                       if (likely(comp_order < MAX_ORDER))
+                               low_pfn += (1UL << comp_order) - 1;
 
                        continue;
                }
 
+               if (!is_lru)
+                       continue;
+
                /*
                 * Migration will fail if an anonymous page is pinned in memory,
                 * so avoid taking lru_lock and isolating it unnecessarily in an
@@ -763,11 +784,17 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                        if (!locked)
                                break;
 
-                       /* Recheck PageLRU and PageTransHuge under lock */
+                       /* Recheck PageLRU and PageCompound under lock */
                        if (!PageLRU(page))
                                continue;
-                       if (PageTransHuge(page)) {
-                               low_pfn += (1 << compound_order(page)) - 1;
+
+                       /*
+                        * Page became compound since the non-locked check,
+                        * and it's on LRU. It can only be a THP so the order
+                        * is safe to read and it's 0 for tail pages.
+                        */
+                       if (unlikely(PageCompound(page))) {
+                               low_pfn += (1UL << compound_order(page)) - 1;
                                continue;
                        }
                }
@@ -778,7 +805,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
                if (__isolate_lru_page(page, isolate_mode) != 0)
                        continue;
 
-               VM_BUG_ON_PAGE(PageTransCompound(page), page);
+               VM_BUG_ON_PAGE(PageCompound(page), page);
 
                /* Successfully isolated */
                del_page_from_lru_list(page, lruvec, page_lru(page));
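
The hunk above keeps the cheap unlocked PageLRU()/PageCompound() tests and repeats them once lru_lock is actually taken, because the page can change under the scanner between the two checks. The shape of that check/lock/recheck pattern, reduced to a userspace sketch (the pthread mutex and the page_compound flag are stand-ins for the kernel's locking, assumed for illustration only):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool page_compound;          /* stands in for PageCompound(page) */

    /* Optimistic unlocked test first, then recheck with the lock held. */
    static bool try_isolate(void)
    {
            if (page_compound)              /* cheap check; the result may be stale */
                    return false;

            pthread_mutex_lock(&lru_lock);
            if (page_compound) {            /* recheck under the lock */
                    pthread_mutex_unlock(&lru_lock);
                    return false;
            }
            /* ... isolation work done while the lock is held ... */
            pthread_mutex_unlock(&lru_lock);
            return true;
    }

    int main(void)
    {
            printf("isolated: %d\n", try_isolate());
            return 0;
    }
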
@@ -897,6 +924,16 @@ static bool suitable_migration_target(struct page *page)
        return false;
 }
 
+/*
+ * Test whether the free scanner has reached the same or lower pageblock than
+ * the migration scanner, and compaction should thus terminate.
+ */
+static inline bool compact_scanners_met(struct compact_control *cc)
+{
+       return (cc->free_pfn >> pageblock_order)
+               <= (cc->migrate_pfn >> pageblock_order);
+}
+
 /*
  * Based on information in the current compact_control, find blocks
  * suitable for isolating free pages from and then isolate them.
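
compact_scanners_met(), added in the hunk above, compares the two scanners at pageblock granularity rather than by raw pfn, so compaction also terminates once both scanners end up inside the same pageblock. A small worked example in userspace C (the PAGEBLOCK_ORDER of 9, i.e. 512-page blocks, is an assumed value for illustration):

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGEBLOCK_ORDER 9   /* assumed: 512 pages per pageblock */

    static bool scanners_met(unsigned long free_pfn, unsigned long migrate_pfn)
    {
            return (free_pfn >> PAGEBLOCK_ORDER) <= (migrate_pfn >> PAGEBLOCK_ORDER);
    }

    int main(void)
    {
            /* Different pfns, same pageblock (block 4): the scanners have met. */
            printf("%d\n", scanners_met(2304, 2100));
            /* Free scanner still a full pageblock ahead: keep compacting. */
            printf("%d\n", scanners_met(2560, 2100));
            return 0;
    }
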
@@ -933,8 +970,7 @@ static void isolate_freepages(struct compact_control *cc)
         * pages on cc->migratepages. We stop searching if the migrate
         * and free page scanners meet or enough free pages are isolated.
         */
-       for (; block_start_pfn >= low_pfn &&
-                       cc->nr_migratepages > cc->nr_freepages;
+       for (; block_start_pfn >= low_pfn;
                                block_end_pfn = block_start_pfn,
                                block_start_pfn -= pageblock_nr_pages,
                                isolate_start_pfn = block_start_pfn) {
@@ -966,6 +1002,8 @@ static void isolate_freepages(struct compact_control *cc)
                                        block_end_pfn, freelist, false);
 
                /*
+                * If we isolated enough freepages, or aborted due to async
+                * compaction being contended, terminate the loop.
                 * Remember where the free scanner should restart next time,
                 * which is where isolate_freepages_block() left off.
                 * But if it scanned the whole pageblock, isolate_start_pfn
@@ -974,27 +1012,31 @@ static void isolate_freepages(struct compact_control *cc)
                 * In that case we will however want to restart at the start
                 * of the previous pageblock.
                 */
-               cc->free_pfn = (isolate_start_pfn < block_end_pfn) ?
-                               isolate_start_pfn :
-                               block_start_pfn - pageblock_nr_pages;
-
-               /*
-                * isolate_freepages_block() might have aborted due to async
-                * compaction being contended
-                */
-               if (cc->contended)
+               if ((cc->nr_freepages >= cc->nr_migratepages)
+                                                       || cc->contended) {
+                       if (isolate_start_pfn >= block_end_pfn)
+                               isolate_start_pfn =
+                                       block_start_pfn - pageblock_nr_pages;
                        break;
+               } else {
+                       /*
+                        * isolate_freepages_block() should not terminate
+                        * prematurely unless contended, or isolated enough
+                        */
+                       VM_BUG_ON(isolate_start_pfn < block_end_pfn);
+               }
        }
 
        /* split_free_page does not map the pages */
        map_pages(freelist);
 
        /*
-        * If we crossed the migrate scanner, we want to keep it that way
-        * so that compact_finished() may detect this
+        * Record where the free scanner will restart next time. Either we
+        * broke from the loop and set isolate_start_pfn based on the last
+        * call to isolate_freepages_block(), or we met the migration scanner
+        * and the loop terminated due to isolate_start_pfn < low_pfn
         */
-       if (block_start_pfn < low_pfn)
-               cc->free_pfn = cc->migrate_pfn;
+       cc->free_pfn = isolate_start_pfn;
 }
 
 /*
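
The reworked loop above terminates as soon as enough free pages are isolated (or compaction is contended) and records where the free scanner should resume: if isolate_freepages_block() stopped inside the pageblock, resume exactly there; if the block was scanned to its end, resume at the start of the next lower pageblock. A simplified model of that bookkeeping (next_free_pfn() and the 512-page block size are illustrative, not kernel names):

    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512UL    /* assumed pageblock size for the example */

    /*
     * Where should the free scanner resume next time?  A partially scanned
     * block: resume where isolate_freepages_block() stopped.  A fully
     * scanned block: move on to the previous (lower) pageblock.
     */
    static unsigned long next_free_pfn(unsigned long isolate_start_pfn,
                                       unsigned long block_start_pfn,
                                       unsigned long block_end_pfn)
    {
            if (isolate_start_pfn >= block_end_pfn)
                    return block_start_pfn - PAGEBLOCK_NR_PAGES;
            return isolate_start_pfn;
    }

    int main(void)
    {
            /* Stopped mid-block at pfn 10300: resume right there. */
            printf("%lu\n", next_free_pfn(10300, 10240, 10752));
            /* Scanned the whole block 10240..10751: resume one block lower. */
            printf("%lu\n", next_free_pfn(10752, 10240, 10752));
            return 0;
    }
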
@@ -1062,6 +1104,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                                        struct compact_control *cc)
 {
        unsigned long low_pfn, end_pfn;
+       unsigned long isolate_start_pfn;
        struct page *page;
        const isolate_mode_t isolate_mode =
                (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1110,6 +1153,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                        continue;
 
                /* Perform the isolation */
+               isolate_start_pfn = low_pfn;
                low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
                                                                isolate_mode);
 
@@ -1118,6 +1162,15 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
                        return ISOLATE_ABORT;
                }
 
+               /*
+                * Record where we could have freed pages by migration and not
+                * yet flushed them to the buddy allocator; this is the lowest
+                * page that could have been isolated and then freed by
+                * migration.
+                */
+               if (cc->nr_migratepages && !cc->last_migrated_pfn)
+                       cc->last_migrated_pfn = isolate_start_pfn;
+
                /*
                 * Either we isolated something and proceed with migration. Or
                 * we failed and compact_zone should decide if we should
@@ -1127,16 +1180,21 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
        }
 
        acct_isolated(zone, cc);
-       /*
-        * Record where migration scanner will be restarted. If we end up in
-        * the same pageblock as the free scanner, make the scanners fully
-        * meet so that compact_finished() terminates compaction.
-        */
-       cc->migrate_pfn = (end_pfn <= cc->free_pfn) ? low_pfn : cc->free_pfn;
+       /* Record where migration scanner will be restarted. */
+       cc->migrate_pfn = low_pfn;
 
        return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
 }
 
+/*
+ * order == -1 is expected when compacting via
+ * /proc/sys/vm/compact_memory
+ */
+static inline bool is_via_compact_memory(int order)
+{
+       return order == -1;
+}
+
 static int __compact_finished(struct zone *zone, struct compact_control *cc,
                            const int migratetype)
 {
@@ -1144,14 +1202,12 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
        unsigned long watermark;
 
        if (cc->contended || fatal_signal_pending(current))
-               return COMPACT_PARTIAL;
+               return COMPACT_CONTENDED;
 
        /* Compaction run completes if the migrate and free scanner meet */
-       if (cc->free_pfn <= cc->migrate_pfn) {
+       if (compact_scanners_met(cc)) {
                /* Let the next compaction start anew. */
-               zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
-               zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
-               zone->compact_cached_free_pfn = zone_end_pfn(zone);
+               reset_cached_positions(zone);
 
                /*
                 * Mark that the PG_migrate_skip information should be cleared
@@ -1165,11 +1221,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
                return COMPACT_COMPLETE;
        }
 
-       /*
-        * order == -1 is expected when compacting via
-        * /proc/sys/vm/compact_memory
-        */
-       if (cc->order == -1)
+       if (is_via_compact_memory(cc->order))
                return COMPACT_CONTINUE;
 
        /* Compaction run is not finished if the watermark is not met */
@@ -1232,11 +1284,7 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
        int fragindex;
        unsigned long watermark;
 
-       /*
-        * order == -1 is expected when compacting via
-        * /proc/sys/vm/compact_memory
-        */
-       if (order == -1)
+       if (is_via_compact_memory(order))
                return COMPACT_CONTINUE;
 
        watermark = low_wmark_pages(zone);
@@ -1295,7 +1343,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
        unsigned long end_pfn = zone_end_pfn(zone);
        const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
        const bool sync = cc->mode != MIGRATE_ASYNC;
-       unsigned long last_migrated_pfn = 0;
 
        ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
                                                        cc->classzone_idx);
@@ -1333,6 +1380,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
                zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
        }
+       cc->last_migrated_pfn = 0;
 
        trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
                                cc->free_pfn, end_pfn, sync);
@@ -1342,11 +1390,10 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
        while ((ret = compact_finished(zone, cc, migratetype)) ==
                                                COMPACT_CONTINUE) {
                int err;
-               unsigned long isolate_start_pfn = cc->migrate_pfn;
 
                switch (isolate_migratepages(zone, cc)) {
                case ISOLATE_ABORT:
-                       ret = COMPACT_PARTIAL;
+                       ret = COMPACT_CONTENDED;
                        putback_movable_pages(&cc->migratepages);
                        cc->nr_migratepages = 0;
                        goto out;
@@ -1376,22 +1423,12 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
                         * migrate_pages() may return -ENOMEM when scanners meet
                         * and we want compact_finished() to detect it
                         */
-                       if (err == -ENOMEM && cc->free_pfn > cc->migrate_pfn) {
-                               ret = COMPACT_PARTIAL;
+                       if (err == -ENOMEM && !compact_scanners_met(cc)) {
+                               ret = COMPACT_CONTENDED;
                                goto out;
                        }
                }
 
-               /*
-                * Record where we could have freed pages by migration and not
-                * yet flushed them to buddy allocator. We use the pfn that
-                * isolate_migratepages() started from in this loop iteration
-                * - this is the lowest page that could have been isolated and
-                * then freed by migration.
-                */
-               if (!last_migrated_pfn)
-                       last_migrated_pfn = isolate_start_pfn;
-
 check_drain:
                /*
                 * Has the migration scanner moved away from the previous
@@ -1400,12 +1437,12 @@ check_drain:
                 * compact_finished() can detect immediately if allocation
                 * would succeed.
                 */
-               if (cc->order > 0 && last_migrated_pfn) {
+               if (cc->order > 0 && cc->last_migrated_pfn) {
                        int cpu;
                        unsigned long current_block_start =
                                cc->migrate_pfn & ~((1UL << cc->order) - 1);
 
-                       if (last_migrated_pfn < current_block_start) {
+                       if (cc->last_migrated_pfn < current_block_start) {
                                cpu = get_cpu_light();
                                local_lock_irq(swapvec_lock);
                                lru_add_drain_cpu(cpu);
@@ -1413,7 +1450,7 @@ check_drain:
                                drain_local_pages(zone);
                                put_cpu_light();
                                /* No more flushing until we migrate again */
-                               last_migrated_pfn = 0;
+                               cc->last_migrated_pfn = 0;
                        }
                }
 
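With last_migrated_pfn now carried in struct compact_control (set in isolate_migratepages above, consumed here), the drain still happens only once the migration scanner has completely left the order-aligned block where pages were last freed by migration, so those pages can leave the per-cpu lists and merge in the buddy allocator before compact_finished() checks for a suitable free page. A stripped-down userspace version of just the trigger condition (should_drain() and the pfn values are illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Drain once the migration scanner has left the order-aligned block
     * where pages were last freed by migration.
     */
    static bool should_drain(unsigned long migrate_pfn,
                             unsigned long last_migrated_pfn, int order)
    {
            unsigned long current_block_start =
                    migrate_pfn & ~((1UL << order) - 1);

            return last_migrated_pfn && last_migrated_pfn < current_block_start;
    }

    int main(void)
    {
            /* Still inside the same order-9 block as the last migration: wait. */
            printf("%d\n", should_drain(2200, 2100, 9));
            /* Scanner has moved past that block: time to drain. */
            printf("%d\n", should_drain(2600, 2100, 9));
            return 0;
    }
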
@@ -1442,6 +1479,9 @@ out:
        trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
                                cc->free_pfn, end_pfn, sync, ret);
 
+       if (ret == COMPACT_CONTENDED)
+               ret = COMPACT_PARTIAL;
+
        return ret;
 }
 
@@ -1613,10 +1653,11 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
                 * this makes sure we compact the whole zone regardless of
                 * cached scanner positions.
                 */
-               if (cc->order == -1)
+               if (is_via_compact_memory(cc->order))
                        __reset_isolation_suitable(zone);
 
-               if (cc->order == -1 || !compaction_deferred(zone, cc->order))
+               if (is_via_compact_memory(cc->order) ||
+                               !compaction_deferred(zone, cc->order))
                        compact_zone(zone, cc);
 
                if (cc->order > 0) {