These changes are the raw update to linux-4.4.6-rt14. Kernel sources

[kvmfornfv.git] / kernel / include / linux / mm.h
diff --git a/kernel/include/linux/mm.h b/kernel/include/linux/mm.h

index b208558..00bad77 100644 (file)
--- a/kernel/include/linux/mm.h
+++ b/kernel/include/linux/mm.h
@@ -20,6 +20,7 @@
  #include <linux/shrinker.h>
  #include <linux/resource.h>
  #include <linux/page_ext.h>
+#include <linux/err.h>
  
  struct mempolicy;
  struct anon_vma;
@@ -27,6 +28,7 @@ struct anon_vma_chain;
  struct file_ra_state;
  struct user_struct;
  struct writeback_control;
+struct bdi_writeback;
  
  #ifndef CONFIG_NEED_MULTIPLE_NODES     /* Don't use mapnrs, do it properly */
  extern unsigned long max_mapnr;
@@ -123,8 +125,10 @@ extern unsigned int kobjsize(const void *objp);
  #define VM_MAYSHARE    0x00000080
  
  #define VM_GROWSDOWN   0x00000100      /* general info on the segment */
+#define VM_UFFD_MISSING        0x00000200      /* missing pages tracking */
  #define VM_PFNMAP      0x00000400      /* Page-ranges managed without "struct page", just pure PFN */
  #define VM_DENYWRITE   0x00000800      /* ETXTBSY on write attempts.. */
+#define VM_UFFD_WP     0x00001000      /* wrprotect pages tracking */
  
  #define VM_LOCKED      0x00002000
  #define VM_IO           0x00004000     /* Memory mapped I/O or similar */
@@ -135,6 +139,7 @@ extern unsigned int kobjsize(const void *objp);
  
  #define VM_DONTCOPY    0x00020000      /* Do not copy this vma on fork */
  #define VM_DONTEXPAND  0x00040000      /* Cannot expand with mremap() */
+#define VM_LOCKONFAULT 0x00080000      /* Lock the pages covered when they are faulted in */
  #define VM_ACCOUNT     0x00100000      /* Is a VM accounted object */
  #define VM_NORESERVE   0x00200000      /* should the VM suppress accounting */
  #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
@@ -198,6 +203,9 @@ extern unsigned int kobjsize(const void *objp);
  /* This mask defines which mm->def_flags a process can inherit its parent */
  #define VM_INIT_DEF_MASK       VM_NOHUGEPAGE
  
+/* This mask is used to clear all the VMA flags used by mlock */
+#define VM_LOCKED_CLEAR_MASK   (~(VM_LOCKED | VM_LOCKONFAULT))
+
  /*
   * mapping from the currently active vm_flags protection bits (the
   * low four bits) to a page protection mask..
@@ -244,7 +252,10 @@ struct vm_fault {
  struct vm_operations_struct {
         void (*open)(struct vm_area_struct * area);
         void (*close)(struct vm_area_struct * area);
+       int (*mremap)(struct vm_area_struct * area);
         int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);
+       int (*pmd_fault)(struct vm_area_struct *, unsigned long address,
+                                               pmd_t *, unsigned int flags);
         void (*map_pages)(struct vm_area_struct *vma, struct vm_fault *vmf);
  
         /* notification that a previously read-only page is about to become
@@ -303,18 +314,6 @@ struct inode;
  #define page_private(page)             ((page)->private)
  #define set_page_private(page, v)      ((page)->private = (v))
  
-/* It's valid only if the page is free path or free_list */
-static inline void set_freepage_migratetype(struct page *page, int migratetype)
-{
-       page->index = migratetype;
-}
-
-/* It's valid only if the page is free path or free_list */
-static inline int get_freepage_migratetype(struct page *page)
-{
-       return page->index;
-}
-
  /*
   * FIXME: take this include out, include page-flags.h in
   * files which need it (119 of them)
@@ -355,20 +354,15 @@ static inline int get_page_unless_zero(struct page *page)
         return atomic_inc_not_zero(&page->_count);
  }
  
-/*
- * Try to drop a ref unless the page has a refcount of one, return false if
- * that is the case.
- * This is to make sure that the refcount won't become zero after this drop.
- * This can be called when MMU is off so it must not access
- * any of the virtual mappings.
- */
-static inline int put_page_unless_one(struct page *page)
-{
-       return atomic_add_unless(&page->_count, -1, 1);
-}
-
  extern int page_is_ram(unsigned long pfn);
-extern int region_is_ram(resource_size_t phys_addr, unsigned long size);
+
+enum {
+       REGION_INTERSECTS,
+       REGION_DISJOINT,
+       REGION_MIXED,
+};
+
+int region_intersects(resource_size_t offset, size_t size, const char *type);
  
  /* Support for virtually mapped pages */
  struct page *vmalloc_to_page(const void *addr);
@@ -436,46 +430,6 @@ static inline void compound_unlock_irqrestore(struct page *page,
  #endif
  }
  
-static inline struct page *compound_head_by_tail(struct page *tail)
-{
-       struct page *head = tail->first_page;
-
-       /*
-        * page->first_page may be a dangling pointer to an old
-        * compound page, so recheck that it is still a tail
-        * page before returning.
-        */
-       smp_rmb();
-       if (likely(PageTail(tail)))
-               return head;
-       return tail;
-}
-
-/*
- * Since either compound page could be dismantled asynchronously in THP
- * or we access asynchronously arbitrary positioned struct page, there
- * would be tail flag race. To handle this race, we should call
- * smp_rmb() before checking tail flag. compound_head_by_tail() did it.
- */
-static inline struct page *compound_head(struct page *page)
-{
-       if (unlikely(PageTail(page)))
-               return compound_head_by_tail(page);
-       return page;
-}
-
-/*
- * If we access compound page synchronously such as access to
- * allocated page, there is no need to handle tail flag race, so we can
- * check tail flag directly without any synchronization primitive.
- */
-static inline struct page *compound_head_fast(struct page *page)
-{
-       if (unlikely(PageTail(page)))
-               return page->first_page;
-       return page;
-}
-
  /*
   * The atomic page->_mapcount, starts from -1: so that transitions
   * both from it and to it can be tracked, using atomic_inc_and_test
@@ -499,7 +453,7 @@ static inline int page_count(struct page *page)
  
  static inline bool __compound_tail_refcounted(struct page *page)
  {
-       return !PageSlab(page) && !PageHeadHuge(page);
+       return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
  }
  
  /*
@@ -524,7 +478,7 @@ static inline void get_huge_page_tail(struct page *page)
         VM_BUG_ON_PAGE(!PageTail(page), page);
         VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
         VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page);
-       if (compound_tail_refcounted(page->first_page))
+       if (compound_tail_refcounted(compound_head(page)))
                 atomic_inc(&page->_mapcount);
  }
  
@@ -547,13 +501,7 @@ static inline struct page *virt_to_head_page(const void *x)
  {
         struct page *page = virt_to_page(x);
  
-       /*
-        * We don't need to worry about synchronization of tail flag
-        * when we call virt_to_head_page() since it is only called for
-        * already allocated page and this page won't be freed until
-        * this virt_to_head_page() is finished. So use _fast variant.
-        */
-       return compound_head_fast(page);
+       return compound_head(page);
  }
  
  /*
@@ -574,28 +522,42 @@ int split_free_page(struct page *page);
  /*
   * Compound pages have a destructor function.  Provide a
   * prototype for that function and accessor functions.
- * These are _only_ valid on the head of a PG_compound page.
+ * These are _only_ valid on the head of a compound page.
   */
+typedef void compound_page_dtor(struct page *);
+
+/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */
+enum compound_dtor_id {
+       NULL_COMPOUND_DTOR,
+       COMPOUND_PAGE_DTOR,
+#ifdef CONFIG_HUGETLB_PAGE
+       HUGETLB_PAGE_DTOR,
+#endif
+       NR_COMPOUND_DTORS,
+};
+extern compound_page_dtor * const compound_page_dtors[];
  
  static inline void set_compound_page_dtor(struct page *page,
-                                               compound_page_dtor *dtor)
+               enum compound_dtor_id compound_dtor)
  {
-       page[1].compound_dtor = dtor;
+       VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page);
+       page[1].compound_dtor = compound_dtor;
  }
  
  static inline compound_page_dtor *get_compound_page_dtor(struct page *page)
  {
-       return page[1].compound_dtor;
+       VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page);
+       return compound_page_dtors[page[1].compound_dtor];
  }
  
-static inline int compound_order(struct page *page)
+static inline unsigned int compound_order(struct page *page)
  {
         if (!PageHead(page))
                 return 0;
         return page[1].compound_order;
  }
  
-static inline void set_compound_order(struct page *page, unsigned long order)
+static inline void set_compound_order(struct page *page, unsigned int order)
  {
         page[1].compound_order = order;
  }
@@ -915,6 +877,27 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
  #endif
  }
  
+#ifdef CONFIG_MEMCG
+static inline struct mem_cgroup *page_memcg(struct page *page)
+{
+       return page->mem_cgroup;
+}
+
+static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg)
+{
+       page->mem_cgroup = memcg;
+}
+#else
+static inline struct mem_cgroup *page_memcg(struct page *page)
+{
+       return NULL;
+}
+
+static inline void set_page_memcg(struct page *page, struct mem_cgroup *memcg)
+{
+}
+#endif
+
  /*
   * Some inline functions in vmstat.h depend on page_zone()
   */
@@ -1225,6 +1208,49 @@ long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
                     int write, int force, struct page **pages);
  int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                         struct page **pages);
+
+/* Container for pinned pfns / pages */
+struct frame_vector {
+       unsigned int nr_allocated;      /* Number of frames we have space for */
+       unsigned int nr_frames; /* Number of frames stored in ptrs array */
+       bool got_ref;           /* Did we pin pages by getting page ref? */
+       bool is_pfns;           /* True if array holds pfns, false if pages */
+       void *ptrs[0];          /* Array of pinned pfns / pages. Use
+                                * frame_vector_pages() or frame_vector_pfns()
+                                * for access */
+};
+
+struct frame_vector *frame_vector_create(unsigned int nr_frames);
+void frame_vector_destroy(struct frame_vector *vec);
+int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
+                    bool write, bool force, struct frame_vector *vec);
+void put_vaddr_frames(struct frame_vector *vec);
+int frame_vector_to_pages(struct frame_vector *vec);
+void frame_vector_to_pfns(struct frame_vector *vec);
+
+static inline unsigned int frame_vector_count(struct frame_vector *vec)
+{
+       return vec->nr_frames;
+}
+
+static inline struct page **frame_vector_pages(struct frame_vector *vec)
+{
+       if (vec->is_pfns) {
+               int err = frame_vector_to_pages(vec);
+
+               if (err)
+                       return ERR_PTR(err);
+       }
+       return (struct page **)(vec->ptrs);
+}
+
+static inline unsigned long *frame_vector_pfns(struct frame_vector *vec)
+{
+       if (!vec->is_pfns)
+               frame_vector_to_pfns(vec);
+       return (unsigned long *)(vec->ptrs);
+}
+
  struct kvec;
  int get_kernel_pages(const struct kvec *iov, int nr_pages, int write,
                         struct page **pages);
@@ -1239,10 +1265,13 @@ int __set_page_dirty_nobuffers(struct page *page);
  int __set_page_dirty_no_writeback(struct page *page);
  int redirty_page_for_writepage(struct writeback_control *wbc,
                                 struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping);
-void account_page_cleaned(struct page *page, struct address_space *mapping);
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg);
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg, struct bdi_writeback *wb);
  int set_page_dirty(struct page *page);
  int set_page_dirty_lock(struct page *page);
+void cancel_dirty_page(struct page *page);
  int clear_page_dirty_for_io(struct page *page);
  
  int get_cmdline(struct task_struct *task, char *buffer, int buflen);
@@ -1253,6 +1282,11 @@ static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr)
         return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
  }
  
+static inline bool vma_is_anonymous(struct vm_area_struct *vma)
+{
+       return !vma->vm_ops;
+}
+
  static inline int stack_guard_page_start(struct vm_area_struct *vma,
                                              unsigned long addr)
  {
@@ -1506,8 +1540,7 @@ static inline bool ptlock_init(struct page *page)
          * with 0. Make sure nobody took it in use in between.
          *
          * It can happen if arch try to use slab for page table allocation:
-        * slab code uses page->slab_cache and page->first_page (for tail
-        * pages), which share storage with page->ptl.
+        * slab code uses page->slab_cache, which shares storage with page->ptl.
          */
         VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page);
         if (!ptlock_alloc(page))
@@ -1544,8 +1577,10 @@ static inline void pgtable_init(void)
  
  static inline bool pgtable_page_ctor(struct page *page)
  {
+       if (!ptlock_init(page))
+               return false;
         inc_zone_page_state(page, NR_PAGETABLE);
-       return ptlock_init(page);
+       return true;
  }
  
  static inline void pgtable_page_dtor(struct page *page)
@@ -1659,6 +1694,8 @@ extern void free_highmem_page(struct page *page);
  extern void adjust_managed_page_count(struct page *page, long count);
  extern void mem_init_print_info(const char *str);
  
+extern void reserve_bootmem_region(unsigned long start, unsigned long end);
+
  /* Free the reserved page into the buddy system, so it gets managed. */
  static inline void __free_reserved_page(struct page *page)
  {
@@ -1748,7 +1785,8 @@ extern void sparse_memory_present_with_active_regions(int nid);
  
  #if !defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) && \
      !defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID)
-static inline int __early_pfn_to_nid(unsigned long pfn)
+static inline int __early_pfn_to_nid(unsigned long pfn,
+                                       struct mminit_pfnnid_cache *state)
  {
         return 0;
  }
@@ -1756,7 +1794,8 @@ static inline int __early_pfn_to_nid(unsigned long pfn)
  /* please see mm/page_alloc.c */
  extern int __meminit early_pfn_to_nid(unsigned long pfn);
  /* there is a per-arch backend function. */
-extern int __meminit __early_pfn_to_nid(unsigned long pfn);
+extern int __meminit __early_pfn_to_nid(unsigned long pfn,
+                                       struct mminit_pfnnid_cache *state);
  #endif
  
  extern void set_dma_reserve(unsigned long new_dma_reserve);
@@ -1771,7 +1810,8 @@ extern void si_meminfo(struct sysinfo * val);
  extern void si_meminfo_node(struct sysinfo *val, int nid);
  
  extern __printf(3, 4)
-void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...);
+void warn_alloc_failed(gfp_t gfp_mask, unsigned int order,
+               const char *fmt, ...);
  
  extern void setup_per_cpu_pageset(void);
  
@@ -1825,7 +1865,7 @@ extern int vma_adjust(struct vm_area_struct *vma, unsigned long start,
  extern struct vm_area_struct *vma_merge(struct mm_struct *,
         struct vm_area_struct *prev, unsigned long addr, unsigned long end,
         unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-       struct mempolicy *);
+       struct mempolicy *, struct vm_userfaultfd_ctx);
  extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
  extern int split_vma(struct mm_struct *,
         struct vm_area_struct *, unsigned long addr, int new_below);
@@ -1872,11 +1912,19 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
  
  extern unsigned long mmap_region(struct file *file, unsigned long addr,
         unsigned long len, vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+extern unsigned long do_mmap(struct file *file, unsigned long addr,
         unsigned long len, unsigned long prot, unsigned long flags,
-       unsigned long pgoff, unsigned long *populate);
+       vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate);
  extern int do_munmap(struct mm_struct *, unsigned long, size_t);
  
+static inline unsigned long
+do_mmap_pgoff(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot, unsigned long flags,
+       unsigned long pgoff, unsigned long *populate)
+{
+       return do_mmap(file, addr, len, prot, flags, 0, pgoff, populate);
+}
+
  #ifdef CONFIG_MMU
  extern int __mm_populate(unsigned long addr, unsigned long len,
                          int ignore_errors);
@@ -1962,8 +2010,6 @@ void page_cache_async_readahead(struct address_space *mapping,
                                 pgoff_t offset,
                                 unsigned long size);
  
-unsigned long max_sane_readahead(unsigned long nr);
-
  /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
  extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
  
@@ -2063,6 +2109,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
  #define FOLL_NUMA      0x200   /* force NUMA hinting page fault */
  #define FOLL_MIGRATION 0x400   /* wait for page to replace migration entry */
  #define FOLL_TRIED     0x800   /* a retry, previous pass started an IO */
+#define FOLL_MLOCK     0x1000  /* lock present pages */
  
  typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
                         void *data);
@@ -2174,12 +2221,48 @@ enum mf_flags {
  extern int memory_failure(unsigned long pfn, int trapno, int flags);
  extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
  extern int unpoison_memory(unsigned long pfn);
+extern int get_hwpoison_page(struct page *page);
+extern void put_hwpoison_page(struct page *page);
  extern int sysctl_memory_failure_early_kill;
  extern int sysctl_memory_failure_recovery;
  extern void shake_page(struct page *p, int access);
  extern atomic_long_t num_poisoned_pages;
  extern int soft_offline_page(struct page *page, int flags);
  
+
+/*
+ * Error handlers for various types of pages.
+ */
+enum mf_result {
+       MF_IGNORED,     /* Error: cannot be handled */
+       MF_FAILED,      /* Error: handling failed */
+       MF_DELAYED,     /* Will be handled later */
+       MF_RECOVERED,   /* Successfully recovered */
+};
+
+enum mf_action_page_type {
+       MF_MSG_KERNEL,
+       MF_MSG_KERNEL_HIGH_ORDER,
+       MF_MSG_SLAB,
+       MF_MSG_DIFFERENT_COMPOUND,
+       MF_MSG_POISONED_HUGE,
+       MF_MSG_HUGE,
+       MF_MSG_FREE_HUGE,
+       MF_MSG_UNMAP_FAILED,
+       MF_MSG_DIRTY_SWAPCACHE,
+       MF_MSG_CLEAN_SWAPCACHE,
+       MF_MSG_DIRTY_MLOCKED_LRU,
+       MF_MSG_CLEAN_MLOCKED_LRU,
+       MF_MSG_DIRTY_UNEVICTABLE_LRU,
+       MF_MSG_CLEAN_UNEVICTABLE_LRU,
+       MF_MSG_DIRTY_LRU,
+       MF_MSG_CLEAN_LRU,
+       MF_MSG_TRUNCATED_LRU,
+       MF_MSG_BUDDY,
+       MF_MSG_BUDDY_2ND,
+       MF_MSG_UNKNOWN,
+};
+
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
  extern void clear_huge_page(struct page *page,
                             unsigned long addr,