These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
diff --git a/kernel/mm/filemap.c b/kernel/mm/filemap.c
index 01cf284..4430136 100644
--- a/kernel/mm/filemap.c
+++ b/kernel/mm/filemap.c
  *    ->tree_lock              (page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock          (page_remove_rmap->set_page_dirty)
+ *    ->memcg->move_lock       (page_remove_rmap->mem_cgroup_begin_page_stat)
  *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
  *    ->inode->i_lock          (zap_pte_range->set_page_dirty)
  *    ->private_lock           (zap_pte_range->__set_page_dirty_buffers)
@@ -176,9 +177,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock and
+ * mem_cgroup_begin_page_stat().
  */
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page, void *shadow,
+                             struct mem_cgroup *memcg)
 {
        struct address_space *mapping = page->mapping;
 
@@ -198,7 +201,9 @@ void __delete_from_page_cache(struct page *page, void *shadow)
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
 
-       __dec_zone_page_state(page, NR_FILE_PAGES);
+       /* hugetlb pages do not participate in page cache accounting. */
+       if (!PageHuge(page))
+               __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
                __dec_zone_page_state(page, NR_SHMEM);
        BUG_ON(page_mapped(page));
@@ -212,7 +217,8 @@ void __delete_from_page_cache(struct page *page, void *shadow)
         * anyway will be cleared before returning page into buddy allocator.
         */
        if (WARN_ON_ONCE(PageDirty(page)))
-               account_page_cleaned(page, mapping);
+               account_page_cleaned(page, mapping, memcg,
+                                    inode_to_wb(mapping->host));
 }
 
 /**
@@ -226,14 +232,20 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 void delete_from_page_cache(struct page *page)
 {
        struct address_space *mapping = page->mapping;
+       struct mem_cgroup *memcg;
+       unsigned long flags;
+
        void (*freepage)(struct page *);
 
        BUG_ON(!PageLocked(page));
 
        freepage = mapping->a_ops->freepage;
-       spin_lock_irq(&mapping->tree_lock);
-       __delete_from_page_cache(page, NULL);
-       spin_unlock_irq(&mapping->tree_lock);
+
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
+       __delete_from_page_cache(page, NULL, memcg);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
 
        if (freepage)
                freepage(page);
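
The pattern these hunks establish is that a caller removing a page from the page cache first pins the page's memcg with mem_cgroup_begin_page_stat(), then takes tree_lock, and hands the returned memcg to __delete_from_page_cache() so that account_page_cleaned() can attribute dirty accounting to the right cgroup. A minimal caller-side sketch of the protocol, mirroring delete_from_page_cache() above (freepage callback and error handling omitted):

        struct mem_cgroup *memcg;
        unsigned long flags;

        memcg = mem_cgroup_begin_page_stat(page);       /* pin the page's memcg */
        spin_lock_irqsave(&mapping->tree_lock, flags);
        __delete_from_page_cache(page, NULL, memcg);    /* shadow entry = NULL */
        spin_unlock_irqrestore(&mapping->tree_lock, flags);
        mem_cgroup_end_page_stat(memcg);
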
@@ -283,7 +295,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
        if (!mapping_cap_writeback_dirty(mapping))
                return 0;
 
+       wbc_attach_fdatawrite_inode(&wbc, mapping->host);
        ret = do_writepages(mapping, &wbc);
+       wbc_detach_inode(&wbc);
        return ret;
 }
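
For context, the rest of __filemap_fdatawrite_range() (not shown in this hunk) builds the writeback_control that the new attach/detach pair brackets. The field values below are recalled from the surrounding mainline function and are illustrative only; the point is that the wbc is associated with the inode before do_writepages() so writeback is charged to the right cgroup, and detached afterwards:

        struct writeback_control wbc = {
                .sync_mode   = sync_mode,
                .nr_to_write = LONG_MAX,
                .range_start = start,
                .range_end   = end,
        };

        wbc_attach_fdatawrite_inode(&wbc, mapping->host);  /* cgroup writeback attribution */
        ret = do_writepages(mapping, &wbc);
        wbc_detach_inode(&wbc);
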
 
@@ -319,23 +333,14 @@ int filemap_flush(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_flush);
 
-/**
- * filemap_fdatawait_range - wait for writeback to complete
- * @mapping:           address space structure to wait for
- * @start_byte:                offset in bytes where the range starts
- * @end_byte:          offset in bytes where the range ends (inclusive)
- *
- * Walk the list of under-writeback pages of the given address space
- * in the given range and wait for all of them.
- */
-int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
-                           loff_t end_byte)
+static int __filemap_fdatawait_range(struct address_space *mapping,
+                                    loff_t start_byte, loff_t end_byte)
 {
        pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
        pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
        struct pagevec pvec;
        int nr_pages;
-       int ret2, ret = 0;
+       int ret = 0;
 
        if (end_byte < start_byte)
                goto out;
@@ -362,6 +367,29 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
                cond_resched();
        }
 out:
+       return ret;
+}
+
+/**
+ * filemap_fdatawait_range - wait for writeback to complete
+ * @mapping:           address space structure to wait for
+ * @start_byte:                offset in bytes where the range starts
+ * @end_byte:          offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * in the given range and wait for all of them.  Check error status of
+ * the address space and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
+ */
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
+                           loff_t end_byte)
+{
+       int ret, ret2;
+
+       ret = __filemap_fdatawait_range(mapping, start_byte, end_byte);
        ret2 = filemap_check_errors(mapping);
        if (!ret)
                ret = ret2;
@@ -370,12 +398,39 @@ out:
 }
 EXPORT_SYMBOL(filemap_fdatawait_range);
 
+/**
+ * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
+ * @mapping: address space structure to wait for
+ *
+ * Walk the list of under-writeback pages of the given address space
+ * and wait for all of them.  Unlike filemap_fdatawait(), this function
+ * does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves.  Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ */
+void filemap_fdatawait_keep_errors(struct address_space *mapping)
+{
+       loff_t i_size = i_size_read(mapping->host);
+
+       if (i_size == 0)
+               return;
+
+       __filemap_fdatawait_range(mapping, 0, i_size - 1);
+}
+
 /**
  * filemap_fdatawait - wait for all under-writeback pages to complete
  * @mapping: address space structure to wait for
  *
  * Walk the list of under-writeback pages of the given address space
- * and wait for all of them.
+ * and wait for all of them.  Check error status of the address space
+ * and return it.
+ *
+ * Since the error status of the address space is cleared by this function,
+ * callers are responsible for checking the return value and handling and/or
+ * reporting the error.
  */
 int filemap_fdatawait(struct address_space *mapping)
 {
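
The split above leaves two waiting interfaces with different error semantics: filemap_fdatawait_range() and filemap_fdatawait() clear and return the mapping's error status, while filemap_fdatawait_keep_errors() waits without consuming it. A hedged sketch of how the two are meant to be used (caller structure is illustrative, not from this patch):

        /* fsync-style path: the caller owns the error and must report it */
        err = filemap_fdatawait_range(mapping, start, end);
        if (err)
                return err;

        /*
         * sync(2)/fsfreeze-style flusher: wait for writeback, but leave
         * the error status in place for a later fsync() to pick up.
         */
        filemap_fdatawait_keep_errors(mapping);
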
@@ -472,6 +527,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
        if (!error) {
                struct address_space *mapping = old->mapping;
                void (*freepage)(struct page *);
+               struct mem_cgroup *memcg;
+               unsigned long flags;
 
                pgoff_t offset = old->index;
                freepage = mapping->a_ops->freepage;
@@ -480,16 +537,23 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                new->mapping = mapping;
                new->index = offset;
 
-               spin_lock_irq(&mapping->tree_lock);
-               __delete_from_page_cache(old, NULL);
+               memcg = mem_cgroup_begin_page_stat(old);
+               spin_lock_irqsave(&mapping->tree_lock, flags);
+               __delete_from_page_cache(old, NULL, memcg);
                error = radix_tree_insert(&mapping->page_tree, offset, new);
                BUG_ON(error);
                mapping->nrpages++;
-               __inc_zone_page_state(new, NR_FILE_PAGES);
+
+               /*
+                * hugetlb pages do not participate in page cache accounting.
+                */
+               if (!PageHuge(new))
+                       __inc_zone_page_state(new, NR_FILE_PAGES);
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
-               spin_unlock_irq(&mapping->tree_lock);
-               mem_cgroup_migrate(old, new, true);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
+               mem_cgroup_replace_page(old, new);
                radix_tree_preload_end();
                if (freepage)
                        freepage(old);
@@ -580,7 +644,10 @@ static int __add_to_page_cache_locked(struct page *page,
        radix_tree_preload_end();
        if (unlikely(error))
                goto err_insert;
-       __inc_zone_page_state(page, NR_FILE_PAGES);
+
+       /* hugetlb pages do not participate in page cache accounting. */
+       if (!huge)
+               __inc_zone_page_state(page, NR_FILE_PAGES);
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
                mem_cgroup_commit_charge(page, memcg, false);
@@ -653,7 +720,7 @@ struct page *__page_cache_alloc(gfp_t gfp)
                do {
                        cpuset_mems_cookie = read_mems_allowed_begin();
                        n = cpuset_mem_spread_node();
-                       page = alloc_pages_exact_node(n, gfp, 0);
+                       page = __alloc_pages_node(n, gfp, 0);
                } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
 
                return page;
@@ -1659,8 +1726,8 @@ no_cached_page:
                        error = -ENOMEM;
                        goto out;
                }
-               error = add_to_page_cache_lru(page, mapping,
-                                               index, GFP_KERNEL);
+               error = add_to_page_cache_lru(page, mapping, index,
+                               mapping_gfp_constraint(mapping, GFP_KERNEL));
                if (error) {
                        page_cache_release(page);
                        if (error == -EEXIST) {
@@ -1761,7 +1828,8 @@ static int page_cache_read(struct file *file, pgoff_t offset)
                if (!page)
                        return -ENOMEM;
 
-               ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
+               ret = add_to_page_cache_lru(page, mapping, offset,
+                               mapping_gfp_constraint(mapping, GFP_KERNEL));
                if (ret == 0)
                        ret = mapping->a_ops->readpage(file, page);
                else if (ret == -EEXIST)
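
mapping_gfp_constraint() restricts the caller's allocation flags by the mapping's own gfp mask, so these page-cache insertions respect per-mapping limits such as a filesystem that clears __GFP_FS. Its definition in this kernel series is essentially the following (recalled from include/linux/pagemap.h, shown only for illustration):

        static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
                                                   gfp_t gfp_mask)
        {
                return mapping_gfp_mask(mapping) & gfp_mask;
        }
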
@@ -1785,7 +1853,6 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
                                   struct file *file,
                                   pgoff_t offset)
 {
-       unsigned long ra_pages;
        struct address_space *mapping = file->f_mapping;
 
        /* If we don't want any read-ahead, don't bother */
@@ -1814,10 +1881,9 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
        /*
         * mmap read-around
         */
-       ra_pages = max_sane_readahead(ra->ra_pages);
-       ra->start = max_t(long, 0, offset - ra_pages / 2);
-       ra->size = ra_pages;
-       ra->async_size = ra_pages / 4;
+       ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
+       ra->size = ra->ra_pages;
+       ra->async_size = ra->ra_pages / 4;
        ra_submit(ra, mapping, file);
 }
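
With max_sane_readahead() gone, the read-around window is sized directly from ra->ra_pages. An illustrative calculation (the numbers are not from the patch): for ra->ra_pages == 32 and a fault at page offset 100, the hunk above yields

        ra->start      = max_t(long, 0, 100 - 32 / 2);  /* = 84 */
        ra->size       = 32;
        ra->async_size = 32 / 4;                        /* = 8  */
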
 
@@ -2466,6 +2532,11 @@ again:
                        break;
                }
 
+               if (fatal_signal_pending(current)) {
+                       status = -EINTR;
+                       break;
+               }
+
                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                                &page, &fsdata);
                if (unlikely(status < 0))
@@ -2503,10 +2574,6 @@ again:
                written += copied;
 
                balance_dirty_pages_ratelimited(mapping);
-               if (fatal_signal_pending(current)) {
-                       status = -EINTR;
-                       break;
-               }
        } while (iov_iter_count(i));
 
        return written ? written : status;
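
The two hunks above move the fatal-signal check from the end of the copy loop to just before write_begin(), so a killed task bails out before dirtying another page rather than after finishing one. Heavily abbreviated, the resulting loop in generic_perform_write() looks roughly like this (a sketch, not the full function):

        do {
                /* ... compute pos and bytes, fault in the user pages ... */

                if (fatal_signal_pending(current)) {
                        status = -EINTR;        /* bail before starting a new page */
                        break;
                }

                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
                                            &page, &fsdata);
                /* ... copy from the iterator, write_end(), advance ... */

                balance_dirty_pages_ratelimited(mapping);
        } while (iov_iter_count(i));
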
@@ -2541,7 +2608,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
        /* We can write back this queue in page reclaim */
        current->backing_dev_info = inode_to_bdi(inode);
-       err = file_remove_suid(file);
+       err = file_remove_privs(file);
        if (err)
                goto out;
 
@@ -2651,7 +2718,7 @@ EXPORT_SYMBOL(generic_file_write_iter);
  * page is known to the local caching routines.
  *
  * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
+ * this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
  *
  */
 int try_to_release_page(struct page *page, gfp_t gfp_mask)
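
The comment fix tracks the 4.4-era rename of __GFP_WAIT to __GFP_RECLAIM. A hypothetical caller sketch (not from this patch) showing how the gfp mask limits what try_to_release_page() is allowed to do:

        /*
         * Try to drop the page's private data (e.g. buffers) without
         * blocking and without recursing into the filesystem.
         */
        if (page_has_private(page) && !try_to_release_page(page, GFP_NOWAIT))
                return 0;       /* buffers busy; keep the page */
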