These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
diff --git a/kernel/mm/filemap.c b/kernel/mm/filemap.c
index 01cf284..4430136 100644
--- a/kernel/mm/filemap.c
+++ b/kernel/mm/filemap.c
  *    ->tree_lock              (page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock          (page_remove_rmap->set_page_dirty)
+ *    ->memcg->move_lock       (page_remove_rmap->mem_cgroup_begin_page_stat)
  *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
  *    ->inode->i_lock          (zap_pte_range->set_page_dirty)
  *    ->private_lock           (zap_pte_range->__set_page_dirty_buffers)
@@ -176,9 +177,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock and
+ * mem_cgroup_begin_page_stat().
  */
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page, void *shadow,
+                             struct mem_cgroup *memcg)
 {
        struct address_space *mapping = page->mapping;
 
@@ -198,7 +201,9 @@ void __delete_from_page_cache(struct page *page, void *shadow)
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
 
-       __dec_zone_page_state(page, NR_FILE_PAGES);
+       /* hugetlb pages do not participate in page cache accounting. */
+       if (!PageHuge(page))
+               __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
                __dec_zone_page_state(page, NR_SHMEM);
        BUG_ON(page_mapped(page));
@@ -212,7 +217,8 @@ void __delete_from_page_cache(struct page *page, void *shadow)
         * anyway will be cleared before returning page into buddy allocator.
         */
        if (WARN_ON_ONCE(PageDirty(page)))
-               account_page_cleaned(page, mapping);
+               account_page_cleaned(page, mapping, memcg,
+                                    inode_to_wb(mapping->host));
 }
 
 /**
@@ -226,14 +232,20 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 void delete_from_page_cache(struct page *page)
 {
        struct address_space *mapping = page->mapping;
+       struct mem_cgroup *memcg;
+       unsigned long flags;
+
        void (*freepage)(struct page *);
 
        BUG_ON(!PageLocked(page));
 
        freepage = mapping->a_ops->freepage;
-       spin_lock_irq(&mapping->tree_lock);
-       __delete_from_page_cache(page, NULL);
-       spin_unlock_irq(&mapping->tree_lock);
+
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
+       __delete_from_page_cache(page, NULL, memcg);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
 
        if (freepage)
                freepage(page);
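
The pattern these hunks establish is that a caller removing a page from the page cache first pins the page's memcg with mem_cgroup_begin_page_stat(), then takes tree_lock, and hands the returned memcg to __delete_from_page_cache() so that account_page_cleaned() can attribute dirty accounting to the right cgroup. A minimal caller-side sketch of the protocol, mirroring delete_from_page_cache() above (freepage callback and error handling omitted):

        struct mem_cgroup *memcg;
        unsigned long flags;

        memcg = mem_cgroup_begin_page_stat(page);       /* pin the page's memcg */
        spin_lock_irqsave(&mapping->tree_lock, flags);
        __delete_from_page_cache(page, NULL, memcg);    /* shadow entry = NULL */
        spin_unlock_irqrestore(&mapping->tree_lock, flags);
        mem_cgroup_end_page_stat(memcg);
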
@@ -283,7 +295,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
        if (!mapping_cap_writeback_dirty(mapping))
                return 0;
 
+       wbc_attach_fdatawrite_inode(&wbc, mapping->host);
        ret = do_writepages(mapping, &wbc);
+       wbc_detach_inode(&wbc);
        return ret;
 }
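
For context, the rest of __filemap_fdatawrite_range() (not shown in this hunk) builds the writeback_control that the new attach/detach pair brackets. The field values below are recalled from the surrounding mainline function and are illustrative only; the point is that the wbc is associated with the inode before do_writepages() so writeback is charged to the right cgroup, and detached afterwards:

        struct writeback_control wbc = {
                .sync_mode   = sync_mode,
                .nr_to_write = LONG_MAX,
                .range_start = start,
                .range_end   = end,
        };

        wbc_attach_fdatawrite_inode(&wbc, mapping->host);  /* cgroup writeback attribution */
        ret = do_writepages(mapping, &wbc);
        wbc_detach_inode(&wbc);
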
 
@@ -319,23 +333,14 @@ int filemap_flush(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_flush);
 
-/**
- * filemap_fdatawait_range - wait for writeback to complete
- * @mapping:           address space structure to wait for
- * @start_byte:                offset in bytes where the range starts
- * @end_byte:          offset in bytes where the range ends (inclusive)
- *
- * Walk the list of under-writeback pages of the given address space
- * in the given range and wait for all of them.
- */
-int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
-                           loff_t end_byte)
+static int __filemap_fdatawait_range(struct address_space *mapping,
+                                    loff_t start_byte, loff_t end_byte)
 {
        pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
        pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
        struct pagevec pvec;
        int nr_pages;
-       int ret2, ret = 0;
+       int ret = 0;
 
        if (end_byte < start_byte)
                goto out;
@@ -362,6 +367,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
                cond_resched();
        }
 out:
+       return ret;
+}
+
+/**
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping:           address space structure to wait for
+ * @start_byte:                offset in bytes where the range starts
+ * @end_byte:          offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.  Check error status of
+ * the address space and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ */
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+                           loff_t end_byte)
+{
+       int ret, ret2;
+
+       ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
        ret2 = filemap_check_errors(mapping);
        if (!ret)
                ret = ret2;
@@ -370,12 +398,39 @@ out:
 }
 EXPORT_SYMBOL(filemap_fdatawait_range);
 
+/**
+ * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
+ * @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them.  Unlike filemap_fdatawait(), this function
+ * does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves.  Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ */
+void filemap_fdatawait_keep_errors(struct address_space *mapping)
+{
+       loff_t i_size = i_size_read(mapping->host);
+
+       if (i_size == 0)
+               return;
+
+       __filemap_fdatawait_range(mapping, 0, i_size - 1);
+}
+
 /**
  * filemap_fdatawait - wait for all under-writeback pages to complete
  * @mapping: address space structure to wait for
  *
  * Walk the list of under-writeback pages of the given address space
- * and wait for all of them.
+ * and wait for all of them.  Check error status of the address space
+ * and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
  */
 int filemap_fdatawait(struct address_space *mapping)
 {
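
The split above leaves two waiting interfaces with different error semantics: filemap_fdatawait_range() and filemap_fdatawait() clear and return the mapping's error status, while filemap_fdatawait_keep_errors() waits without consuming it. A hedged sketch of how the two are meant to be used (caller structure is illustrative, not from this patch):

        /* fsync-style path: the caller owns the error and must report it */
        err = filemap_fdatawait_range(mapping, start, end);
        if (err)
                return err;

        /*
         * sync(2)/fsfreeze-style flusher: wait for writeback, but leave
         * the error status in place for a later fsync() to pick up.
         */
        filemap_fdatawait_keep_errors(mapping);
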
@@ -472,6 +527,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
        if (!error) {
                struct address_space *mapping = old->mapping;
                void (*freepage)(struct page *);
+               struct mem_cgroup *memcg;
+               unsigned long flags;
 
                pgoff_t offset = old->index;
                freepage = mapping->a_ops->freepage;
@@ -480,16 +537,23 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                new->mapping = mapping;
                new->index = offset;
 
-               spin_lock_irq(&mapping->tree_lock);
-               __delete_from_page_cache(old, NULL);
+               memcg = mem_cgroup_begin_page_stat(old);
+               spin_lock_irqsave(&mapping->tree_lock, flags);
+               __delete_from_page_cache(old, NULL, memcg);
                error = radix_tree_insert(&mapping->page_tree, offset, new);
                BUG_ON(error);
                mapping->nrpages++;
-               __inc_zone_page_state(new, NR_FILE_PAGES);
+
+               /*
+                * hugetlb pages do not participate in page cache accounting.
+                */
+               if (!PageHuge(new))
+                       __inc_zone_page_state(new, NR_FILE_PAGES);
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
-               spin_unlock_irq(&mapping->tree_lock);
-               mem_cgroup_migrate(old, new, true);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
+               mem_cgroup_replace_page(old, new);
                radix_tree_preload_end();
                if (freepage)
                        freepage(old);
@@ -580,7 +644,10 @@ static int __add_to_page_cache_locked(struct page *page,
        radix_tree_preload_end();
        if (unlikely(error))
                goto err_insert;
-       __inc_zone_page_state(page, NR_FILE_PAGES);
+
+       /* hugetlb pages do not participate in page cache accounting. */
+       if (!huge)
+               __inc_zone_page_state(page, NR_FILE_PAGES);
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
                mem_cgroup_commit_charge(page, memcg, false);
@@ -653,7 +720,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
                do {
                        cpuset_mems_cookie = read_mems_allowed_begin();
                        n = cpuset_mem_spread_node();
-                       page = alloc_pages_exact_node(n, gfp, 0);
+                       page = __alloc_pages_node(n, gfp, 0);
                } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
 
                return page;
@@ -1659,8 +1726,8 @@ no_cached_page:
                        error = -ENOMEM;
                        goto out;
                }
-               error = add_to_page_cache_lru(page, mapping,
-                                               index, GFP_KERNEL);
+               error = add_to_page_cache_lru(page, mapping, index,
+                               mapping_gfp_constraint(mapping, GFP_KERNEL));
                if (error) {
                        page_cache_release(page);
                        if (error == -EEXIST) {
@@ -1761,7 +1828,8 @@ static int page_cache_read(struct file *file, pgoff_t offset)
                if (!page)
                        return -ENOMEM;
 
-               ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+               ret = add_to_page_cache_lru(page, mapping, offset,
+                               mapping_gfp_constraint(mapping, GFP_KERNEL));
                if (ret == 0)
                        ret = mapping->a_ops->readpage(file, page);
                else if (ret == -EEXIST)
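
mapping_gfp_constraint() restricts the caller's allocation flags by the mapping's own gfp mask, so these page-cache insertions respect per-mapping limits such as a filesystem that clears __GFP_FS. Its definition in this kernel series is essentially the following (recalled from include/linux/pagemap.h, shown only for illustration):

        static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
                                                   gfp_t gfp_mask)
        {
                return mapping_gfp_mask(mapping) & gfp_mask;
        }
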
@@ -1785,7 +1853,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
                                   struct file *file,
                                   pgoff_t offset)
 {
-       unsigned long ra_pages;
        struct address_space *mapping = file->f_mapping;
 
        /* If we don't want any read-ahead, don't bother */
@@ -1814,10 +1881,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
        /*
         * mmap read-around
         */
-       ra_pages = max_sane_readahead(ra->ra_pages);
-       ra->start = max_t(long, 0, offset - ra_pages / 2);
-       ra->size = ra_pages;
-       ra->async_size = ra_pages / 4;
+       ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+       ra->size = ra->ra_pages;
+       ra->async_size = ra->ra_pages / 4;
        ra_submit(ra, mapping, file);
 }
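
With max_sane_readahead() gone, the read-around window is sized directly from ra->ra_pages. An illustrative calculation (the numbers are not from the patch): for ra->ra_pages == 32 and a fault at page offset 100, the hunk above yields

        ra->start      = max_t(long, 0, 100 - 32 / 2);  /* = 84 */
        ra->size       = 32;
        ra->async_size = 32 / 4;                        /* = 8  */
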
 
@@ -2466,6 +2532,11 @@ again:
                        break;
                }
 
+               if (fatal_signal_pending(current)) {
+                       status = -EINTR;
+                       break;
+               }
+
                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                                &page, &fsdata);
                if (unlikely(status < 0))
@@ -2503,10 +2574,6 @@ again:
                written += copied;
 
                balance_dirty_pages_ratelimited(mapping);
-               if (fatal_signal_pending(current)) {
-                       status = -EINTR;
-                       break;
-               }
        } while (iov_iter_count(i));
 
        return written ? written : status;
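
The two hunks above move the fatal-signal check from the end of the copy loop to just before write_begin(), so a killed task bails out before dirtying another page rather than after finishing one. Heavily abbreviated, the resulting loop in generic_perform_write() looks roughly like this (a sketch, not the full function):

        do {
                /* ... compute pos and bytes, fault in the user pages ... */

                if (fatal_signal_pending(current)) {
                        status = -EINTR;        /* bail before starting a new page */
                        break;
                }

                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                            &page, &fsdata);
                /* ... copy from the iterator, write_end(), advance ... */

                balance_dirty_pages_ratelimited(mapping);
        } while (iov_iter_count(i));
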
@@ -2541,7 +2608,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);
-       err = file_remove_suid(file);
+       err = file_remove_privs(file);
        if (err)
                goto out;
 
@@ -2651,7 +2718,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
  * page is known to the local caching routines.
  *
  * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
+ * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
  *
  */
 int try_to_release_page(struct page *page, gfp_t gfp_mask)
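
The comment fix tracks the 4.4-era rename of __GFP_WAIT to __GFP_RECLAIM. A hypothetical caller sketch (not from this patch) showing how the gfp mask limits what try_to_release_page() is allowed to do:

        /*
         * Try to drop the page's private data (e.g. buffers) without
         * blocking and without recursing into the filesystem.
         */
        if (page_has_private(page) && !try_to_release_page(page, GFP_NOWAIT))
                return 0;       /* buffers busy; keep the page */
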