To update vsperf/packet-forwarding configuration files
[kvmfornfv.git] / kernel / fs / buffer.c
1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/export.h>
33 #include <linux/backing-dev.h>
34 #include <linux/writeback.h>
35 #include <linux/hash.h>
36 #include <linux/suspend.h>
37 #include <linux/buffer_head.h>
38 #include <linux/task_io_accounting_ops.h>
39 #include <linux/bio.h>
40 #include <linux/notifier.h>
41 #include <linux/cpu.h>
42 #include <linux/bitops.h>
43 #include <linux/mpage.h>
44 #include <linux/bit_spinlock.h>
45 #include <trace/events/block.h>
46
47 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
48 static int submit_bh_wbc(int rw, struct buffer_head *bh,
49                          unsigned long bio_flags,
50                          struct writeback_control *wbc);
51
52 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
53
54 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
55 {
56         bh->b_end_io = handler;
57         bh->b_private = private;
58 }
59 EXPORT_SYMBOL(init_buffer);
60
61 inline void touch_buffer(struct buffer_head *bh)
62 {
63         trace_block_touch_buffer(bh);
64         mark_page_accessed(bh->b_page);
65 }
66 EXPORT_SYMBOL(touch_buffer);
67
68 void __lock_buffer(struct buffer_head *bh)
69 {
70         wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
71 }
72 EXPORT_SYMBOL(__lock_buffer);
73
74 void unlock_buffer(struct buffer_head *bh)
75 {
76         clear_bit_unlock(BH_Lock, &bh->b_state);
77         smp_mb__after_atomic();
78         wake_up_bit(&bh->b_state, BH_Lock);
79 }
80 EXPORT_SYMBOL(unlock_buffer);
81
82 /*
83  * Reports, via *dirty and *writeback, whether the page has dirty or
84  * writeback buffers. If all the buffers are unlocked and clean then the
85  * PageDirty information is stale. If any of the buffers are locked, it is assumed they are locked for IO.
86  */
87 void buffer_check_dirty_writeback(struct page *page,
88                                      bool *dirty, bool *writeback)
89 {
90         struct buffer_head *head, *bh;
91         *dirty = false;
92         *writeback = false;
93
94         BUG_ON(!PageLocked(page));
95
96         if (!page_has_buffers(page))
97                 return;
98
99         if (PageWriteback(page))
100                 *writeback = true;
101
102         head = page_buffers(page);
103         bh = head;
104         do {
105                 if (buffer_locked(bh))
106                         *writeback = true;
107
108                 if (buffer_dirty(bh))
109                         *dirty = true;
110
111                 bh = bh->b_this_page;
112         } while (bh != head);
113 }
114 EXPORT_SYMBOL(buffer_check_dirty_writeback);
115
116 /*
117  * Block until a buffer comes unlocked.  This doesn't stop it
118  * from becoming locked again - you have to lock it yourself
119  * if you want to preserve its state.
120  */
121 void __wait_on_buffer(struct buffer_head * bh)
122 {
123         wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
124 }
125 EXPORT_SYMBOL(__wait_on_buffer);
126
/*
 * Detach buffer bookkeeping from @page: clear PagePrivate, zero the page's
 * private word and drop one page cache reference.
 */
static void __clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);
	page_cache_release(page);
}
134
135 static void buffer_io_error(struct buffer_head *bh, char *msg)
136 {
137         char b[BDEVNAME_SIZE];
138
139         if (!test_bit(BH_Quiet, &bh->b_state))
140                 printk_ratelimited(KERN_ERR
141                         "Buffer I/O error on dev %s, logical block %llu%s\n",
142                         bdevname(bh->b_bdev, b),
143                         (unsigned long long)bh->b_blocknr, msg);
144 }
145
146 /*
147  * End-of-IO handler helper function which does not touch the bh after
148  * unlocking it.
149  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
150  * a race there is benign: unlock_buffer() only uses the bh's address for
151  * hashing after unlocking the buffer, so it doesn't actually touch the bh
152  * itself.
153  */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}

	/* Must stay last: the bh is not touched again after this. */
	unlock_buffer(bh);
}
164
165 /*
166  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
167  * unlock the buffer. This is what ll_rw_block uses too.
168  */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	/* Record the read result on the buffer and unlock it ... */
	__end_buffer_read_notouch(bh, uptodate);

	/* ... then drop one reference on the buffer. */
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);
175
/*
 * Synchronous write completion handler: record the outcome on @bh, unlock
 * it and drop one reference.  On failure the error is logged and the
 * buffer is flagged with a write I/O error and marked not uptodate.
 */
void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		buffer_io_error(bh, ", lost sync page write");
		set_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);
189
190 /*
191  * Various filesystems appear to want __find_get_block to be non-blocking.
192  * But it's the page lock which protects the buffers.  To get around this,
193  * we get exclusion from try_to_free_buffers with the blockdev mapping's
194  * private_lock.
195  *
196  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
197  * may be quite high.  This code could TryLock the page, and if that
198  * succeeds, there is no need to take private_lock. (But if
199  * private_lock is contended then so is mapping->tree_lock).
200  */
201 static struct buffer_head *
202 __find_get_block_slow(struct block_device *bdev, sector_t block)
203 {
204         struct inode *bd_inode = bdev->bd_inode;
205         struct address_space *bd_mapping = bd_inode->i_mapping;
206         struct buffer_head *ret = NULL;
207         pgoff_t index;
208         struct buffer_head *bh;
209         struct buffer_head *head;
210         struct page *page;
211         int all_mapped = 1;
212
213         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
214         page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
215         if (!page)
216                 goto out;
217
218         spin_lock(&bd_mapping->private_lock);
219         if (!page_has_buffers(page))
220                 goto out_unlock;
221         head = page_buffers(page);
222         bh = head;
223         do {
224                 if (!buffer_mapped(bh))
225                         all_mapped = 0;
226                 else if (bh->b_blocknr == block) {
227                         ret = bh;
228                         get_bh(bh);
229                         goto out_unlock;
230                 }
231                 bh = bh->b_this_page;
232         } while (bh != head);
233
234         /* we might be here because some of the buffers on this page are
235          * not mapped.  This is due to various races between
236          * file io on the block device and getblk.  It gets dealt with
237          * elsewhere, don't buffer_error if we had some unmapped buffers
238          */
239         if (all_mapped) {
240                 char b[BDEVNAME_SIZE];
241
242                 printk("__find_get_block_slow() failed. "
243                         "block=%llu, b_blocknr=%llu\n",
244                         (unsigned long long)block,
245                         (unsigned long long)bh->b_blocknr);
246                 printk("b_state=0x%08lx, b_size=%zu\n",
247                         bh->b_state, bh->b_size);
248                 printk("device %s blocksize: %d\n", bdevname(bdev, b),
249                         1 << bd_inode->i_blkbits);
250         }
251 out_unlock:
252         spin_unlock(&bd_mapping->private_lock);
253         page_cache_release(page);
254 out:
255         return ret;
256 }
257
258 /*
259  * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
260  */
261 static void free_more_memory(void)
262 {
263         struct zone *zone;
264         int nid;
265
266         wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
267         yield();
268
269         for_each_online_node(nid) {
270                 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
271                                                 gfp_zone(GFP_NOFS), NULL,
272                                                 &zone);
273                 if (zone)
274                         try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
275                                                 GFP_NOFS, NULL);
276         }
277 }
278
279 /*
280  * I/O completion handler for block_read_full_page() - pages
281  * which come unlocked at the end of I/O.
282  */
283 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
284 {
285         unsigned long flags;
286         struct buffer_head *first;
287         struct buffer_head *tmp;
288         struct page *page;
289         int page_uptodate = 1;
290
291         BUG_ON(!buffer_async_read(bh));
292
293         page = bh->b_page;
294         if (uptodate) {
295                 set_buffer_uptodate(bh);
296         } else {
297                 clear_buffer_uptodate(bh);
298                 buffer_io_error(bh, ", async page read");
299                 SetPageError(page);
300         }
301
302         /*
303          * Be _very_ careful from here on. Bad things can happen if
304          * two buffer heads end IO at almost the same time and both
305          * decide that the page is now completely done.
306          */
307         first = page_buffers(page);
308         flags = bh_uptodate_lock_irqsave(first);
309         clear_buffer_async_read(bh);
310         unlock_buffer(bh);
311         tmp = bh;
312         do {
313                 if (!buffer_uptodate(tmp))
314                         page_uptodate = 0;
315                 if (buffer_async_read(tmp)) {
316                         BUG_ON(!buffer_locked(tmp));
317                         goto still_busy;
318                 }
319                 tmp = tmp->b_this_page;
320         } while (tmp != bh);
321         bh_uptodate_unlock_irqrestore(first, flags);
322
323         /*
324          * If none of the buffers had errors and they are all
325          * uptodate then we can set the page uptodate.
326          */
327         if (page_uptodate && !PageError(page))
328                 SetPageUptodate(page);
329         unlock_page(page);
330         return;
331
332 still_busy:
333         bh_uptodate_unlock_irqrestore(first, flags);
334 }
335
336 /*
337  * Completion handler for block_write_full_page() - pages which are unlocked
338  * during I/O, and which have PageWriteback cleared upon I/O completion.
339  */
340 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
341 {
342         unsigned long flags;
343         struct buffer_head *first;
344         struct buffer_head *tmp;
345         struct page *page;
346
347         BUG_ON(!buffer_async_write(bh));
348
349         page = bh->b_page;
350         if (uptodate) {
351                 set_buffer_uptodate(bh);
352         } else {
353                 buffer_io_error(bh, ", lost async page write");
354                 set_bit(AS_EIO, &page->mapping->flags);
355                 set_buffer_write_io_error(bh);
356                 clear_buffer_uptodate(bh);
357                 SetPageError(page);
358         }
359
360         first = page_buffers(page);
361         flags = bh_uptodate_lock_irqsave(first);
362
363         clear_buffer_async_write(bh);
364         unlock_buffer(bh);
365         tmp = bh->b_this_page;
366         while (tmp != bh) {
367                 if (buffer_async_write(tmp)) {
368                         BUG_ON(!buffer_locked(tmp));
369                         goto still_busy;
370                 }
371                 tmp = tmp->b_this_page;
372         }
373         bh_uptodate_unlock_irqrestore(first, flags);
374         end_page_writeback(page);
375         return;
376
377 still_busy:
378         bh_uptodate_unlock_irqrestore(first, flags);
379 }
380 EXPORT_SYMBOL(end_buffer_async_write);
381
382 /*
383  * If a page's buffers are under async readin (end_buffer_async_read
384  * completion) then there is a possibility that another thread of
385  * control could lock one of the buffers after it has completed
386  * but while some of the other buffers have not completed.  This
387  * locked buffer would confuse end_buffer_async_read() into not unlocking
388  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
389  * that this buffer is not under async I/O.
390  *
391  * The page comes unlocked when it has no locked buffer_async buffers
392  * left.
393  *
394  * PageLocked prevents anyone from starting new async I/O reads against
395  * any of the buffers.
396  *
397  * PageWriteback is used to prevent simultaneous writeout of the same
398  * page.
399  *
400  * PageLocked prevents anyone from starting writeback of a page which is
401  * under read I/O (PageWriteback is only ever set against a locked page).
402  */
403 static void mark_buffer_async_read(struct buffer_head *bh)
404 {
405         bh->b_end_io = end_buffer_async_read;
406         set_buffer_async_read(bh);
407 }
408
409 static void mark_buffer_async_write_endio(struct buffer_head *bh,
410                                           bh_end_io_t *handler)
411 {
412         bh->b_end_io = handler;
413         set_buffer_async_write(bh);
414 }
415
416 void mark_buffer_async_write(struct buffer_head *bh)
417 {
418         mark_buffer_async_write_endio(bh, end_buffer_async_write);
419 }
420 EXPORT_SYMBOL(mark_buffer_async_write);
421
422
423 /*
424  * fs/buffer.c contains helper functions for buffer-backed address space's
425  * fsync functions.  A common requirement for buffer-based filesystems is
426  * that certain data from the backing blockdev needs to be written out for
427  * a successful fsync().  For example, ext2 indirect blocks need to be
428  * written back and waited upon before fsync() returns.
429  *
430  * The functions mark_buffer_dirty_inode(), sync_mapping_buffers(),
431  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
432  * management of a list of dependent buffers at ->i_mapping->private_list.
433  *
434  * Locking is a little subtle: try_to_free_buffers() will remove buffers
435  * from their controlling inode's queue when they are being freed.  But
436  * try_to_free_buffers() will be operating against the *blockdev* mapping
437  * at the time, not against the S_ISREG file which depends on those buffers.
438  * So the locking for private_list is via the private_lock in the address_space
439  * which backs the buffers.  Which is different from the address_space 
440  * against which the buffers are listed.  So for a particular address_space,
441  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
442  * mapping->private_list will always be protected by the backing blockdev's
443  * ->private_lock.
444  *
445  * Which introduces a requirement: all buffers on an address_space's
446  * ->private_list must be from the same address_space: the blockdev's.
447  *
448  * address_spaces which do not place buffers at ->private_list via these
449  * utility functions are free to use private_lock and private_list for
450  * whatever they want.  The only requirement is that list_empty(private_list)
451  * be true at clear_inode() time.
452  *
453  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
454  * filesystems should do that.  invalidate_inode_buffers() should just go
455  * BUG_ON(!list_empty).
456  *
457  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
458  * take an address_space, not an inode.  And it should be called
459  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
460  * queued up.
461  *
462  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
463  * list if it is already on a list.  Because if the buffer is on a list,
464  * it *must* already be on the right one.  If not, the filesystem is being
465  * silly.  This will save a ton of locking.  But first we have to ensure
466  * that buffers are taken *off* the old inode's list when they are freed
467  * (presumably in truncate).  That requires careful auditing of all
468  * filesystems (do it inside bforget()).  It could also be done by bringing
469  * b_inode back.
470  */
471
472 /*
473  * The buffer's backing address_space's private_lock must be held
474  */
475 static void __remove_assoc_queue(struct buffer_head *bh)
476 {
477         list_del_init(&bh->b_assoc_buffers);
478         WARN_ON(!bh->b_assoc_map);
479         if (buffer_write_io_error(bh))
480                 set_bit(AS_EIO, &bh->b_assoc_map->flags);
481         bh->b_assoc_map = NULL;
482 }
483
484 int inode_has_buffers(struct inode *inode)
485 {
486         return !list_empty(&inode->i_data.private_list);
487 }
488
489 /*
490  * osync is designed to support O_SYNC io.  It waits synchronously for
491  * all already-submitted IO to complete, but does not queue any new
492  * writes to the disk.
493  *
494  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
495  * you dirty the buffers, and then use osync_inode_buffers to wait for
496  * completion.  Any other dirty buffers which are not yet queued for
497  * write will not be flushed to disk by the osync.
498  */
499 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
500 {
501         struct buffer_head *bh;
502         struct list_head *p;
503         int err = 0;
504
505         spin_lock(lock);
506 repeat:
507         list_for_each_prev(p, list) {
508                 bh = BH_ENTRY(p);
509                 if (buffer_locked(bh)) {
510                         get_bh(bh);
511                         spin_unlock(lock);
512                         wait_on_buffer(bh);
513                         if (!buffer_uptodate(bh))
514                                 err = -EIO;
515                         brelse(bh);
516                         spin_lock(lock);
517                         goto repeat;
518                 }
519         }
520         spin_unlock(lock);
521         return err;
522 }
523
524 static void do_thaw_one(struct super_block *sb, void *unused)
525 {
526         char b[BDEVNAME_SIZE];
527         while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
528                 printk(KERN_WARNING "Emergency Thaw on %s\n",
529                        bdevname(sb->s_bdev, b));
530 }
531
532 static void do_thaw_all(struct work_struct *work)
533 {
534         iterate_supers(do_thaw_one, NULL);
535         kfree(work);
536         printk(KERN_WARNING "Emergency Thaw complete\n");
537 }
538
539 /**
540  * emergency_thaw_all -- forcibly thaw every frozen filesystem
541  *
542  * Used for emergency unfreeze of all filesystems via SysRq
543  */
544 void emergency_thaw_all(void)
545 {
546         struct work_struct *work;
547
548         work = kmalloc(sizeof(*work), GFP_ATOMIC);
549         if (work) {
550                 INIT_WORK(work, do_thaw_all);
551                 schedule_work(work);
552         }
553 }
554
555 /**
556  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
557  * @mapping: the mapping which wants those buffers written
558  *
559  * Starts I/O against the buffers at mapping->private_list, and waits upon
560  * that I/O.
561  *
562  * Basically, this is a convenience function for fsync().
563  * @mapping is a file or directory which needs those buffers to be written for
564  * a successful fsync().
565  */
566 int sync_mapping_buffers(struct address_space *mapping)
567 {
568         struct address_space *buffer_mapping = mapping->private_data;
569
570         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
571                 return 0;
572
573         return fsync_buffers_list(&buffer_mapping->private_lock,
574                                         &mapping->private_list);
575 }
576 EXPORT_SYMBOL(sync_mapping_buffers);
577
578 /*
579  * Called when we've recently written block `bblock', and it is known that
580  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
581  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
582  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
583  */
584 void write_boundary_block(struct block_device *bdev,
585                         sector_t bblock, unsigned blocksize)
586 {
587         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
588         if (bh) {
589                 if (buffer_dirty(bh))
590                         ll_rw_block(WRITE, 1, &bh);
591                 put_bh(bh);
592         }
593 }
594
595 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
596 {
597         struct address_space *mapping = inode->i_mapping;
598         struct address_space *buffer_mapping = bh->b_page->mapping;
599
600         mark_buffer_dirty(bh);
601         if (!mapping->private_data) {
602                 mapping->private_data = buffer_mapping;
603         } else {
604                 BUG_ON(mapping->private_data != buffer_mapping);
605         }
606         if (!bh->b_assoc_map) {
607                 spin_lock(&buffer_mapping->private_lock);
608                 list_move_tail(&bh->b_assoc_buffers,
609                                 &mapping->private_list);
610                 bh->b_assoc_map = mapping;
611                 spin_unlock(&buffer_mapping->private_lock);
612         }
613 }
614 EXPORT_SYMBOL(mark_buffer_dirty_inode);
615
616 /*
617  * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
618  * dirty.
619  *
620  * If warn is true, then emit a warning if the page is not uptodate and has
621  * not been truncated.
622  *
623  * The caller must hold mem_cgroup_begin_page_stat() lock.
624  */
625 static void __set_page_dirty(struct page *page, struct address_space *mapping,
626                              struct mem_cgroup *memcg, int warn)
627 {
628         unsigned long flags;
629
630         spin_lock_irqsave(&mapping->tree_lock, flags);
631         if (page->mapping) {    /* Race with truncate? */
632                 WARN_ON_ONCE(warn && !PageUptodate(page));
633                 account_page_dirtied(page, mapping, memcg);
634                 radix_tree_tag_set(&mapping->page_tree,
635                                 page_index(page), PAGECACHE_TAG_DIRTY);
636         }
637         spin_unlock_irqrestore(&mapping->tree_lock, flags);
638 }
639
640 /*
641  * Add a page to the dirty page list.
642  *
643  * It is a sad fact of life that this function is called from several places
644  * deeply under spinlocking.  It may not sleep.
645  *
646  * If the page has buffers, the uptodate buffers are set dirty, to preserve
647  * dirty-state coherency between the page and the buffers.  If the page does
648  * not have buffers then when they are later attached they will all be set
649  * dirty.
650  *
651  * The buffers are dirtied before the page is dirtied.  There's a small race
652  * window in which a writepage caller may see the page cleanness but not the
653  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
654  * before the buffers, a concurrent writepage caller could clear the page dirty
655  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
656  * page on the dirty page list.
657  *
658  * We use private_lock to lock against try_to_free_buffers while using the
659  * page's buffer list.  Also use this to protect against clean buffers being
660  * added to the page after it was set dirty.
661  *
662  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
663  * address_space though.
664  */
665 int __set_page_dirty_buffers(struct page *page)
666 {
667         int newly_dirty;
668         struct mem_cgroup *memcg;
669         struct address_space *mapping = page_mapping(page);
670
671         if (unlikely(!mapping))
672                 return !TestSetPageDirty(page);
673
674         spin_lock(&mapping->private_lock);
675         if (page_has_buffers(page)) {
676                 struct buffer_head *head = page_buffers(page);
677                 struct buffer_head *bh = head;
678
679                 do {
680                         set_buffer_dirty(bh);
681                         bh = bh->b_this_page;
682                 } while (bh != head);
683         }
684         /*
685          * Use mem_group_begin_page_stat() to keep PageDirty synchronized with
686          * per-memcg dirty page counters.
687          */
688         memcg = mem_cgroup_begin_page_stat(page);
689         newly_dirty = !TestSetPageDirty(page);
690         spin_unlock(&mapping->private_lock);
691
692         if (newly_dirty)
693                 __set_page_dirty(page, mapping, memcg, 1);
694
695         mem_cgroup_end_page_stat(memcg);
696
697         if (newly_dirty)
698                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
699
700         return newly_dirty;
701 }
702 EXPORT_SYMBOL(__set_page_dirty_buffers);
703
704 /*
705  * Write out and wait upon a list of buffers.
706  *
707  * We have conflicting pressures: we want to make sure that all
708  * initially dirty buffers get waited on, but that any subsequently
709  * dirtied buffers don't.  After all, we don't want fsync to last
710  * forever if somebody is actively writing to the file.
711  *
712  * Do this in two main stages: first we copy dirty buffers to a
713  * temporary inode list, queueing the writes as we go.  Then we clean
714  * up, waiting for those writes to complete.
715  * 
716  * During this second stage, any subsequent updates to the file may end
717  * up refiling the buffer on the original inode's dirty list again, so
718  * there is a chance we will end up with a buffer queued for write but
719  * not yet completed on that list.  So, as a final cleanup we go through
720  * the osync code to catch these locked, dirty buffers without requeuing
721  * any newly dirty buffers for write.
722  */
723 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
724 {
725         struct buffer_head *bh;
726         struct list_head tmp;
727         struct address_space *mapping;
728         int err = 0, err2;
729         struct blk_plug plug;
730
731         INIT_LIST_HEAD(&tmp);
732         blk_start_plug(&plug);
733
734         spin_lock(lock);
735         while (!list_empty(list)) {
736                 bh = BH_ENTRY(list->next);
737                 mapping = bh->b_assoc_map;
738                 __remove_assoc_queue(bh);
739                 /* Avoid race with mark_buffer_dirty_inode() which does
740                  * a lockless check and we rely on seeing the dirty bit */
741                 smp_mb();
742                 if (buffer_dirty(bh) || buffer_locked(bh)) {
743                         list_add(&bh->b_assoc_buffers, &tmp);
744                         bh->b_assoc_map = mapping;
745                         if (buffer_dirty(bh)) {
746                                 get_bh(bh);
747                                 spin_unlock(lock);
748                                 /*
749                                  * Ensure any pending I/O completes so that
750                                  * write_dirty_buffer() actually writes the
751                                  * current contents - it is a noop if I/O is
752                                  * still in flight on potentially older
753                                  * contents.
754                                  */
755                                 write_dirty_buffer(bh, WRITE_SYNC);
756
757                                 /*
758                                  * Kick off IO for the previous mapping. Note
759                                  * that we will not run the very last mapping,
760                                  * wait_on_buffer() will do that for us
761                                  * through sync_buffer().
762                                  */
763                                 brelse(bh);
764                                 spin_lock(lock);
765                         }
766                 }
767         }
768
769         spin_unlock(lock);
770         blk_finish_plug(&plug);
771         spin_lock(lock);
772
773         while (!list_empty(&tmp)) {
774                 bh = BH_ENTRY(tmp.prev);
775                 get_bh(bh);
776                 mapping = bh->b_assoc_map;
777                 __remove_assoc_queue(bh);
778                 /* Avoid race with mark_buffer_dirty_inode() which does
779                  * a lockless check and we rely on seeing the dirty bit */
780                 smp_mb();
781                 if (buffer_dirty(bh)) {
782                         list_add(&bh->b_assoc_buffers,
783                                  &mapping->private_list);
784                         bh->b_assoc_map = mapping;
785                 }
786                 spin_unlock(lock);
787                 wait_on_buffer(bh);
788                 if (!buffer_uptodate(bh))
789                         err = -EIO;
790                 brelse(bh);
791                 spin_lock(lock);
792         }
793         
794         spin_unlock(lock);
795         err2 = osync_buffers_list(lock, list);
796         if (err)
797                 return err;
798         else
799                 return err2;
800 }
801
802 /*
803  * Invalidate any and all dirty buffers on a given inode.  We are
804  * probably unmounting the fs, but that doesn't mean we have already
805  * done a sync().  Just drop the buffers from the inode list.
806  *
807  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
808  * assumes that all the buffers are against the blockdev.  Not true
809  * for reiserfs.
810  */
811 void invalidate_inode_buffers(struct inode *inode)
812 {
813         if (inode_has_buffers(inode)) {
814                 struct address_space *mapping = &inode->i_data;
815                 struct list_head *list = &mapping->private_list;
816                 struct address_space *buffer_mapping = mapping->private_data;
817
818                 spin_lock(&buffer_mapping->private_lock);
819                 while (!list_empty(list))
820                         __remove_assoc_queue(BH_ENTRY(list->next));
821                 spin_unlock(&buffer_mapping->private_lock);
822         }
823 }
824 EXPORT_SYMBOL(invalidate_inode_buffers);
825
826 /*
827  * Remove any clean buffers from the inode's buffer list.  This is called
828  * when we're trying to free the inode itself.  Those buffers can pin it.
829  *
830  * Returns true if all buffers were removed.
831  */
832 int remove_inode_buffers(struct inode *inode)
833 {
834         int ret = 1;
835
836         if (inode_has_buffers(inode)) {
837                 struct address_space *mapping = &inode->i_data;
838                 struct list_head *list = &mapping->private_list;
839                 struct address_space *buffer_mapping = mapping->private_data;
840
841                 spin_lock(&buffer_mapping->private_lock);
842                 while (!list_empty(list)) {
843                         struct buffer_head *bh = BH_ENTRY(list->next);
844                         if (buffer_dirty(bh)) {
845                                 ret = 0;
846                                 break;
847                         }
848                         __remove_assoc_queue(bh);
849                 }
850                 spin_unlock(&buffer_mapping->private_lock);
851         }
852         return ret;
853 }
854
855 /*
856  * Create the appropriate buffers when given a page for data area and
857  * the size of each buffer.. Use the bh->b_this_page linked list to
858  * follow the buffers created.  Return NULL if unable to create more
859  * buffers.
860  *
861  * The retry flag is used to differentiate async IO (paging, swapping)
862  * which may not fail from ordinary buffer allocations.
863  */
864 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
865                 int retry)
866 {
867         struct buffer_head *bh, *head;
868         long offset;
869
870 try_again:
871         head = NULL;
872         offset = PAGE_SIZE;
873         while ((offset -= size) >= 0) {
874                 bh = alloc_buffer_head(GFP_NOFS);
875                 if (!bh)
876                         goto no_grow;
877
878                 bh->b_this_page = head;
879                 bh->b_blocknr = -1;
880                 head = bh;
881
882                 bh->b_size = size;
883
884                 /* Link the buffer to its page */
885                 set_bh_page(bh, page, offset);
886         }
887         return head;
888 /*
889  * In case anything failed, we just free everything we got.
890  */
891 no_grow:
892         if (head) {
893                 do {
894                         bh = head;
895                         head = head->b_this_page;
896                         free_buffer_head(bh);
897                 } while (head);
898         }
899
900         /*
901          * Return failure for non-async IO requests.  Async IO requests
902          * are not allowed to fail, so we have to wait until buffer heads
903          * become available.  But we don't want tasks sleeping with 
904          * partially complete buffers, so all were released above.
905          */
906         if (!retry)
907                 return NULL;
908
909         /* We're _really_ low on memory. Now we just
910          * wait for old buffer heads to become free due to
911          * finishing IO.  Since this is an async request and
912          * the reserve list is empty, we're sure there are 
913          * async buffer heads in use.
914          */
915         free_more_memory();
916         goto try_again;
917 }
918 EXPORT_SYMBOL_GPL(alloc_page_buffers);
919
920 static inline void
921 link_dev_buffers(struct page *page, struct buffer_head *head)
922 {
923         struct buffer_head *bh, *tail;
924
925         bh = head;
926         do {
927                 tail = bh;
928                 bh = bh->b_this_page;
929         } while (bh);
930         tail->b_this_page = head;
931         attach_page_buffers(page, head);
932 }
933
934 static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
935 {
936         sector_t retval = ~((sector_t)0);
937         loff_t sz = i_size_read(bdev->bd_inode);
938
939         if (sz) {
940                 unsigned int sizebits = blksize_bits(size);
941                 retval = (sz >> sizebits);
942         }
943         return retval;
944 }
945
946 /*
947  * Initialise the state of a blockdev page's buffers.
948  */ 
949 static sector_t
950 init_page_buffers(struct page *page, struct block_device *bdev,
951                         sector_t block, int size)
952 {
953         struct buffer_head *head = page_buffers(page);
954         struct buffer_head *bh = head;
955         int uptodate = PageUptodate(page);
956         sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
957
958         do {
959                 if (!buffer_mapped(bh)) {
960                         init_buffer(bh, NULL, NULL);
961                         bh->b_bdev = bdev;
962                         bh->b_blocknr = block;
963                         if (uptodate)
964                                 set_buffer_uptodate(bh);
965                         if (block < end_block)
966                                 set_buffer_mapped(bh);
967                 }
968                 block++;
969                 bh = bh->b_this_page;
970         } while (bh != head);
971
972         /*
973          * Caller needs to validate requested block against end of device.
974          */
975         return end_block;
976 }
977
978 /*
979  * Create the page-cache page that contains the requested block.
980  *
981  * This is used purely for blockdev mappings.
982  */
983 static int
984 grow_dev_page(struct block_device *bdev, sector_t block,
985               pgoff_t index, int size, int sizebits, gfp_t gfp)
986 {
987         struct inode *inode = bdev->bd_inode;
988         struct page *page;
989         struct buffer_head *bh;
990         sector_t end_block;
991         int ret = 0;            /* Will call free_more_memory() */
992         gfp_t gfp_mask;
993
994         gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
995
996         /*
997          * XXX: __getblk_slow() can not really deal with failure and
998          * will endlessly loop on improvised global reclaim.  Prefer
999          * looping in the allocator rather than here, at least that
1000          * code knows what it's doing.
1001          */
1002         gfp_mask |= __GFP_NOFAIL;
1003
1004         page = find_or_create_page(inode->i_mapping, index, gfp_mask);
1005         if (!page)
1006                 return ret;
1007
1008         BUG_ON(!PageLocked(page));
1009
1010         if (page_has_buffers(page)) {
1011                 bh = page_buffers(page);
1012                 if (bh->b_size == size) {
1013                         end_block = init_page_buffers(page, bdev,
1014                                                 (sector_t)index << sizebits,
1015                                                 size);
1016                         goto done;
1017                 }
1018                 if (!try_to_free_buffers(page))
1019                         goto failed;
1020         }
1021
1022         /*
1023          * Allocate some buffers for this page
1024          */
1025         bh = alloc_page_buffers(page, size, 0);
1026         if (!bh)
1027                 goto failed;
1028
1029         /*
1030          * Link the page to the buffers and initialise them.  Take the
1031          * lock to be atomic wrt __find_get_block(), which does not
1032          * run under the page lock.
1033          */
1034         spin_lock(&inode->i_mapping->private_lock);
1035         link_dev_buffers(page, bh);
1036         end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1037                         size);
1038         spin_unlock(&inode->i_mapping->private_lock);
1039 done:
1040         ret = (block < end_block) ? 1 : -ENXIO;
1041 failed:
1042         unlock_page(page);
1043         page_cache_release(page);
1044         return ret;
1045 }
1046
1047 /*
1048  * Create buffers for the specified block device block's page.  If
1049  * that page was dirty, the buffers are set dirty also.
1050  */
1051 static int
1052 grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1053 {
1054         pgoff_t index;
1055         int sizebits;
1056
1057         sizebits = -1;
1058         do {
1059                 sizebits++;
1060         } while ((size << sizebits) < PAGE_SIZE);
1061
1062         index = block >> sizebits;
1063
1064         /*
1065          * Check for a block which wants to lie outside our maximum possible
1066          * pagecache index.  (this comparison is done using sector_t types).
1067          */
1068         if (unlikely(index != block >> sizebits)) {
1069                 char b[BDEVNAME_SIZE];
1070
1071                 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1072                         "device %s\n",
1073                         __func__, (unsigned long long)block,
1074                         bdevname(bdev, b));
1075                 return -EIO;
1076         }
1077
1078         /* Create a page with the proper size buffers.. */
1079         return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1080 }
1081
1082 struct buffer_head *
1083 __getblk_slow(struct block_device *bdev, sector_t block,
1084              unsigned size, gfp_t gfp)
1085 {
1086         /* Size must be multiple of hard sectorsize */
1087         if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1088                         (size < 512 || size > PAGE_SIZE))) {
1089                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1090                                         size);
1091                 printk(KERN_ERR "logical block size: %d\n",
1092                                         bdev_logical_block_size(bdev));
1093
1094                 dump_stack();
1095                 return NULL;
1096         }
1097
1098         for (;;) {
1099                 struct buffer_head *bh;
1100                 int ret;
1101
1102                 bh = __find_get_block(bdev, block, size);
1103                 if (bh)
1104                         return bh;
1105
1106                 ret = grow_buffers(bdev, block, size, gfp);
1107                 if (ret < 0)
1108                         return NULL;
1109                 if (ret == 0)
1110                         free_more_memory();
1111         }
1112 }
1113 EXPORT_SYMBOL(__getblk_slow);
1114
1115 /*
1116  * The relationship between dirty buffers and dirty pages:
1117  *
1118  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1119  * the page is tagged dirty in its radix tree.
1120  *
1121  * At all times, the dirtiness of the buffers represents the dirtiness of
1122  * subsections of the page.  If the page has buffers, the page dirty bit is
1123  * merely a hint about the true dirty state.
1124  *
1125  * When a page is set dirty in its entirety, all its buffers are marked dirty
1126  * (if the page has buffers).
1127  *
1128  * When a buffer is marked dirty, its page is dirtied, but the page's other
1129  * buffers are not.
1130  *
1131  * Also.  When blockdev buffers are explicitly read with bread(), they
1132  * individually become uptodate.  But their backing page remains not
1133  * uptodate - even if all of its buffers are uptodate.  A subsequent
1134  * block_read_full_page() against that page will discover all the uptodate
1135  * buffers, will set the page uptodate and will perform no I/O.
1136  */
1137
1138 /**
1139  * mark_buffer_dirty - mark a buffer_head as needing writeout
1140  * @bh: the buffer_head to mark dirty
1141  *
1142  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1143  * backing page dirty, then tag the page as dirty in its address_space's radix
1144  * tree and then attach the address_space's inode to its superblock's dirty
1145  * inode list.
1146  *
1147  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1148  * mapping->tree_lock and mapping->host->i_lock.
1149  */
1150 void mark_buffer_dirty(struct buffer_head *bh)
1151 {
1152         WARN_ON_ONCE(!buffer_uptodate(bh));
1153
1154         trace_block_dirty_buffer(bh);
1155
1156         /*
1157          * Very *carefully* optimize the it-is-already-dirty case.
1158          *
1159          * Don't let the final "is it dirty" escape to before we
1160          * perhaps modified the buffer.
1161          */
1162         if (buffer_dirty(bh)) {
1163                 smp_mb();
1164                 if (buffer_dirty(bh))
1165                         return;
1166         }
1167
1168         if (!test_set_buffer_dirty(bh)) {
1169                 struct page *page = bh->b_page;
1170                 struct address_space *mapping = NULL;
1171                 struct mem_cgroup *memcg;
1172
1173                 memcg = mem_cgroup_begin_page_stat(page);
1174                 if (!TestSetPageDirty(page)) {
1175                         mapping = page_mapping(page);
1176                         if (mapping)
1177                                 __set_page_dirty(page, mapping, memcg, 0);
1178                 }
1179                 mem_cgroup_end_page_stat(memcg);
1180                 if (mapping)
1181                         __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1182         }
1183 }
1184 EXPORT_SYMBOL(mark_buffer_dirty);
1185
1186 /*
1187  * Decrement a buffer_head's reference count.  If all buffers against a page
1188  * have zero reference count, are clean and unlocked, and if the page is clean
1189  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1190  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1191  * a page but it ends up not being freed, and buffers may later be reattached).
1192  */
1193 void __brelse(struct buffer_head * buf)
1194 {
1195         if (atomic_read(&buf->b_count)) {
1196                 put_bh(buf);
1197                 return;
1198         }
1199         WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1200 }
1201 EXPORT_SYMBOL(__brelse);
1202
1203 /*
1204  * bforget() is like brelse(), except it discards any
1205  * potentially dirty data.
1206  */
1207 void __bforget(struct buffer_head *bh)
1208 {
1209         clear_buffer_dirty(bh);
1210         if (bh->b_assoc_map) {
1211                 struct address_space *buffer_mapping = bh->b_page->mapping;
1212
1213                 spin_lock(&buffer_mapping->private_lock);
1214                 list_del_init(&bh->b_assoc_buffers);
1215                 bh->b_assoc_map = NULL;
1216                 spin_unlock(&buffer_mapping->private_lock);
1217         }
1218         __brelse(bh);
1219 }
1220 EXPORT_SYMBOL(__bforget);
1221
1222 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1223 {
1224         lock_buffer(bh);
1225         if (buffer_uptodate(bh)) {
1226                 unlock_buffer(bh);
1227                 return bh;
1228         } else {
1229                 get_bh(bh);
1230                 bh->b_end_io = end_buffer_read_sync;
1231                 submit_bh(READ, bh);
1232                 wait_on_buffer(bh);
1233                 if (buffer_uptodate(bh))
1234                         return bh;
1235         }
1236         brelse(bh);
1237         return NULL;
1238 }
1239
/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

/* Number of buffer_head slots held in each CPU's LRU. */
#define BH_LRU_SIZE     16

/* One CPU's LRU: a fixed array of buffer_head pointers, unused slots NULL. */
struct bh_lru {
        struct buffer_head *bhs[BH_LRU_SIZE];
};

/* The per-cpu LRU instances, starting out with all slots NULL. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * bh_lru_lock()/bh_lru_unlock() bracket every access to the local CPU's
 * LRU.  SMP builds disable local interrupts; other builds only disable
 * preemption.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()   local_irq_disable()
#define bh_lru_unlock() local_irq_enable()
#else
#define bh_lru_lock()   preempt_disable()
#define bh_lru_unlock() preempt_enable()
#endif
1269
/*
 * Assert that interrupts are currently enabled.  Compiles to nothing when
 * the irqs_disabled macro is not defined.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1276
1277 /*
1278  * The LRU management algorithm is dopey-but-simple.  Sorry.
1279  */
1280 static void bh_lru_install(struct buffer_head *bh)
1281 {
1282         struct buffer_head *evictee = NULL;
1283
1284         check_irqs_on();
1285         bh_lru_lock();
1286         if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1287                 struct buffer_head *bhs[BH_LRU_SIZE];
1288                 int in;
1289                 int out = 0;
1290
1291                 get_bh(bh);
1292                 bhs[out++] = bh;
1293                 for (in = 0; in < BH_LRU_SIZE; in++) {
1294                         struct buffer_head *bh2 =
1295                                 __this_cpu_read(bh_lrus.bhs[in]);
1296
1297                         if (bh2 == bh) {
1298                                 __brelse(bh2);
1299                         } else {
1300                                 if (out >= BH_LRU_SIZE) {
1301                                         BUG_ON(evictee != NULL);
1302                                         evictee = bh2;
1303                                 } else {
1304                                         bhs[out++] = bh2;
1305                                 }
1306                         }
1307                 }
1308                 while (out < BH_LRU_SIZE)
1309                         bhs[out++] = NULL;
1310                 memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1311         }
1312         bh_lru_unlock();
1313
1314         if (evictee)
1315                 __brelse(evictee);
1316 }
1317
1318 /*
1319  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1320  */
1321 static struct buffer_head *
1322 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1323 {
1324         struct buffer_head *ret = NULL;
1325         unsigned int i;
1326
1327         check_irqs_on();
1328         bh_lru_lock();
1329         for (i = 0; i < BH_LRU_SIZE; i++) {
1330                 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1331
1332                 if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1333                     bh->b_size == size) {
1334                         if (i) {
1335                                 while (i) {
1336                                         __this_cpu_write(bh_lrus.bhs[i],
1337                                                 __this_cpu_read(bh_lrus.bhs[i - 1]));
1338                                         i--;
1339                                 }
1340                                 __this_cpu_write(bh_lrus.bhs[0], bh);
1341                         }
1342                         get_bh(bh);
1343                         ret = bh;
1344                         break;
1345                 }
1346         }
1347         bh_lru_unlock();
1348         return ret;
1349 }
1350
1351 /*
1352  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1353  * it in the LRU and mark it as accessed.  If it is not present then return
1354  * NULL
1355  */
1356 struct buffer_head *
1357 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1358 {
1359         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1360
1361         if (bh == NULL) {
1362                 /* __find_get_block_slow will mark the page accessed */
1363                 bh = __find_get_block_slow(bdev, block);
1364                 if (bh)
1365                         bh_lru_install(bh);
1366         } else
1367                 touch_buffer(bh);
1368
1369         return bh;
1370 }
1371 EXPORT_SYMBOL(__find_get_block);
1372
1373 /*
1374  * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1375  * which corresponds to the passed block_device, block and size. The
1376  * returned buffer has its reference count incremented.
1377  *
1378  * __getblk_gfp() will lock up the machine if grow_dev_page's
1379  * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
1380  */
1381 struct buffer_head *
1382 __getblk_gfp(struct block_device *bdev, sector_t block,
1383              unsigned size, gfp_t gfp)
1384 {
1385         struct buffer_head *bh = __find_get_block(bdev, block, size);
1386
1387         might_sleep();
1388         if (bh == NULL)
1389                 bh = __getblk_slow(bdev, block, size, gfp);
1390         return bh;
1391 }
1392 EXPORT_SYMBOL(__getblk_gfp);
1393
1394 /*
1395  * Do async read-ahead on a buffer..
1396  */
1397 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1398 {
1399         struct buffer_head *bh = __getblk(bdev, block, size);
1400         if (likely(bh)) {
1401                 ll_rw_block(READA, 1, &bh);
1402                 brelse(bh);
1403         }
1404 }
1405 EXPORT_SYMBOL(__breadahead);
1406
1407 /**
1408  *  __bread_gfp() - reads a specified block and returns the bh
1409  *  @bdev: the block_device to read from
1410  *  @block: number of block
1411  *  @size: size (in bytes) to read
1412  *  @gfp: page allocation flag
1413  *
1414  *  Reads a specified block, and returns buffer head that contains it.
1415  *  The page cache can be allocated from non-movable area
1416  *  not to prevent page migration if you set gfp to zero.
1417  *  It returns NULL if the block was unreadable.
1418  */
1419 struct buffer_head *
1420 __bread_gfp(struct block_device *bdev, sector_t block,
1421                    unsigned size, gfp_t gfp)
1422 {
1423         struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1424
1425         if (likely(bh) && !buffer_uptodate(bh))
1426                 bh = __bread_slow(bh);
1427         return bh;
1428 }
1429 EXPORT_SYMBOL(__bread_gfp);
1430
1431 /*
1432  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1433  * This doesn't race because it runs in each cpu either in irq
1434  * or with preempt disabled.
1435  */
1436 static void invalidate_bh_lru(void *arg)
1437 {
1438         struct bh_lru *b = &get_cpu_var(bh_lrus);
1439         int i;
1440
1441         for (i = 0; i < BH_LRU_SIZE; i++) {
1442                 brelse(b->bhs[i]);
1443                 b->bhs[i] = NULL;
1444         }
1445         put_cpu_var(bh_lrus);
1446 }
1447
1448 static bool has_bh_in_lru(int cpu, void *dummy)
1449 {
1450         struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1451         int i;
1452         
1453         for (i = 0; i < BH_LRU_SIZE; i++) {
1454                 if (b->bhs[i])
1455                         return 1;
1456         }
1457
1458         return 0;
1459 }
1460
1461 void invalidate_bh_lrus(void)
1462 {
1463         on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
1464 }
1465 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1466
1467 void set_bh_page(struct buffer_head *bh,
1468                 struct page *page, unsigned long offset)
1469 {
1470         bh->b_page = page;
1471         BUG_ON(offset >= PAGE_SIZE);
1472         if (PageHighMem(page))
1473                 /*
1474                  * This catches illegal uses and preserves the offset:
1475                  */
1476                 bh->b_data = (char *)(0 + offset);
1477         else
1478                 bh->b_data = page_address(page) + offset;
1479 }
1480 EXPORT_SYMBOL(set_bh_page);
1481
1482 /*
1483  * Called when truncating a buffer on a page completely.
1484  */
1485
1486 /* Bits that are cleared during an invalidate */
1487 #define BUFFER_FLAGS_DISCARD \
1488         (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1489          1 << BH_Delay | 1 << BH_Unwritten)
1490
1491 static void discard_buffer(struct buffer_head * bh)
1492 {
1493         unsigned long b_state, b_state_old;
1494
1495         lock_buffer(bh);
1496         clear_buffer_dirty(bh);
1497         bh->b_bdev = NULL;
1498         b_state = bh->b_state;
1499         for (;;) {
1500                 b_state_old = cmpxchg(&bh->b_state, b_state,
1501                                       (b_state & ~BUFFER_FLAGS_DISCARD));
1502                 if (b_state_old == b_state)
1503                         break;
1504                 b_state = b_state_old;
1505         }
1506         unlock_buffer(bh);
1507 }
1508
1509 /**
1510  * block_invalidatepage - invalidate part or all of a buffer-backed page
1511  *
1512  * @page: the page which is affected
1513  * @offset: start of the range to invalidate
1514  * @length: length of the range to invalidate
1515  *
1516  * block_invalidatepage() is called when all or part of the page has become
1517  * invalidated by a truncate operation.
1518  *
1519  * block_invalidatepage() does not have to release all buffers, but it must
1520  * ensure that no dirty buffer is left outside @offset and that no I/O
1521  * is underway against any of the blocks which are outside the truncation
1522  * point.  Because the caller is about to free (and possibly reuse) those
1523  * blocks on-disk.
1524  */
1525 void block_invalidatepage(struct page *page, unsigned int offset,
1526                           unsigned int length)
1527 {
1528         struct buffer_head *head, *bh, *next;
1529         unsigned int curr_off = 0;
1530         unsigned int stop = length + offset;
1531
1532         BUG_ON(!PageLocked(page));
1533         if (!page_has_buffers(page))
1534                 goto out;
1535
1536         /*
1537          * Check for overflow
1538          */
1539         BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
1540
1541         head = page_buffers(page);
1542         bh = head;
1543         do {
1544                 unsigned int next_off = curr_off + bh->b_size;
1545                 next = bh->b_this_page;
1546
1547                 /*
1548                  * Are we still fully in range ?
1549                  */
1550                 if (next_off > stop)
1551                         goto out;
1552
1553                 /*
1554                  * is this block fully invalidated?
1555                  */
1556                 if (offset <= curr_off)
1557                         discard_buffer(bh);
1558                 curr_off = next_off;
1559                 bh = next;
1560         } while (bh != head);
1561
1562         /*
1563          * We release buffers only if the entire page is being invalidated.
1564          * The get_block cached value has been unconditionally invalidated,
1565          * so real IO is not possible anymore.
1566          */
1567         if (offset == 0)
1568                 try_to_release_page(page, 0);
1569 out:
1570         return;
1571 }
1572 EXPORT_SYMBOL(block_invalidatepage);
1573
1574
1575 /*
1576  * We attach and possibly dirty the buffers atomically wrt
1577  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1578  * is already excluded via the page lock.
1579  */
1580 void create_empty_buffers(struct page *page,
1581                         unsigned long blocksize, unsigned long b_state)
1582 {
1583         struct buffer_head *bh, *head, *tail;
1584
1585         head = alloc_page_buffers(page, blocksize, 1);
1586         bh = head;
1587         do {
1588                 bh->b_state |= b_state;
1589                 tail = bh;
1590                 bh = bh->b_this_page;
1591         } while (bh);
1592         tail->b_this_page = head;
1593
1594         spin_lock(&page->mapping->private_lock);
1595         if (PageUptodate(page) || PageDirty(page)) {
1596                 bh = head;
1597                 do {
1598                         if (PageDirty(page))
1599                                 set_buffer_dirty(bh);
1600                         if (PageUptodate(page))
1601                                 set_buffer_uptodate(bh);
1602                         bh = bh->b_this_page;
1603                 } while (bh != head);
1604         }
1605         attach_page_buffers(page, head);
1606         spin_unlock(&page->mapping->private_lock);
1607 }
1608 EXPORT_SYMBOL(create_empty_buffers);
1609
1610 /*
1611  * We are taking a block for data and we don't want any output from any
1612  * buffer-cache aliases starting from return from that function and
1613  * until the moment when something will explicitly mark the buffer
1614  * dirty (hopefully that will not happen until we will free that block ;-)
1615  * We don't even need to mark it not-uptodate - nobody can expect
1616  * anything from a newly allocated buffer anyway. We used to used
1617  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1618  * don't want to mark the alias unmapped, for example - it would confuse
1619  * anyone who might pick it with bread() afterwards...
1620  *
1621  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1622  * be writeout I/O going on against recently-freed buffers.  We don't
1623  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1624  * only if we really need to.  That happens here.
1625  */
1626 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1627 {
1628         struct buffer_head *old_bh;
1629
1630         might_sleep();
1631
1632         old_bh = __find_get_block_slow(bdev, block);
1633         if (old_bh) {
1634                 clear_buffer_dirty(old_bh);
1635                 wait_on_buffer(old_bh);
1636                 clear_buffer_req(old_bh);
1637                 __brelse(old_bh);
1638         }
1639 }
1640 EXPORT_SYMBOL(unmap_underlying_metadata);
1641
/*
 * Return log2 of @blocksize.
 *
 * Size is a power-of-two in the range 512..PAGE_SIZE, and the case we
 * care about most is PAGE_SIZE.  So this *could* possibly be written with
 * those constraints in mind (relevant mostly if some architecture has a
 * slow bit-scan instruction).
 */
static inline int block_size_bits(unsigned int blocksize)
{
	const int bits = ilog2(blocksize);

	return bits;
}
1654
1655 static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1656 {
1657         BUG_ON(!PageLocked(page));
1658
1659         if (!page_has_buffers(page))
1660                 create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1661         return page_buffers(page);
1662 }
1663
1664 /*
1665  * NOTE! All mapped/uptodate combinations are valid:
1666  *
1667  *      Mapped  Uptodate        Meaning
1668  *
1669  *      No      No              "unknown" - must do get_block()
1670  *      No      Yes             "hole" - zero-filled
1671  *      Yes     No              "allocated" - allocated on disk, not read in
1672  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1673  *
1674  * "Dirty" is valid only with the last case (mapped+uptodate).
1675  */
1676
1677 /*
1678  * While block_write_full_page is writing back the dirty buffers under
1679  * the page lock, whoever dirtied the buffers may decide to clean them
1680  * again at any time.  We handle that by only looking at the buffer
1681  * state inside lock_buffer().
1682  *
1683  * If block_write_full_page() is called for regular writeback
1684  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1685  * locked buffer.   This only can happen if someone has written the buffer
1686  * directly, with submit_bh().  At the address_space level PageWriteback
1687  * prevents this contention from occurring.
1688  *
1689  * If block_write_full_page() is called with wbc->sync_mode ==
1690  * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1691  * causes the writes to be flagged as synchronous writes.
      *
      * Summary of what the function below does:
      *
      * @inode:     inode the page belongs to; supplies i_size and is passed to
      *             @get_block.
      * @page:      page to write.  It must be locked (create_page_buffers()
      *             asserts this) and must not already be under writeback
      *             (BUG_ON(PageWriteback())).  The page is unlocked before
      *             returning on both the normal and the error path.
      * @get_block: called with create == 1 for every dirty buffer inside i_size
      *             that is unmapped or marked delayed.
      * @wbc:       writeback control; sync_mode selects WRITE_SYNC vs WRITE and
      *             blocking lock_buffer() vs trylock_buffer().
      * @handler:   end_io handler installed on each buffer that is submitted.
      *
      * Returns 0, or the error returned by @get_block.  On error the buffers
      * that are mapped, dirty and not delayed are still submitted, and the
      * error is recorded on the page (SetPageError) and on its mapping
      * (mapping_set_error).
1692  */
1693 static int __block_write_full_page(struct inode *inode, struct page *page,
1694                         get_block_t *get_block, struct writeback_control *wbc,
1695                         bh_end_io_t *handler)
1696 {
1697         int err;
1698         sector_t block;
1699         sector_t last_block;
1700         struct buffer_head *bh, *head;
1701         unsigned int blocksize, bbits;
             /* number of buffers handed to submit_bh_wbc() */
1702         int nr_underway = 0;
1703         int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
1704
             /*
              * Fetch the page's buffer ring.  The state bits are forwarded to
              * create_empty_buffers() only when the page has no buffers yet.
              */
1705         head = create_page_buffers(page, inode,
1706                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1707
1708         /*
1709          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1710          * here, and the (potentially unmapped) buffers may become dirty at
1711          * any time.  If a buffer becomes dirty here after we've inspected it
1712          * then we just miss that fact, and the page stays dirty.
1713          *
1714          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1715          * handle that here by just cleaning them.
1716          */
1717
1718         bh = head;
1719         blocksize = bh->b_size;
1720         bbits = block_size_bits(blocksize);
1721
             /*
              * block:      file block number of the page's first buffer.
              * last_block: file block number holding the byte at i_size - 1.
              */
1722         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1723         last_block = (i_size_read(inode) - 1) >> bbits;
1724
1725         /*
1726          * Get all the dirty buffers mapped to disk addresses and
1727          * handle any aliases from the underlying blockdev's mapping.
1728          */
1729         do {
1730                 if (block > last_block) {
1731                         /*
1732                          * mapped buffers outside i_size will occur, because
1733                          * this page can be outside i_size when there is a
1734                          * truncate in progress.
1735                          */
1736                         /*
1737                          * The buffer was zeroed by block_write_full_page()
1738                          */
1739                         clear_buffer_dirty(bh);
1740                         set_buffer_uptodate(bh);
1741                 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1742                            buffer_dirty(bh)) {
1743                         WARN_ON(bh->b_size != blocksize);
1744                         err = get_block(inode, block, bh, 1);
1745                         if (err)
1746                                 goto recover;
1747                         clear_buffer_delay(bh);
1748                         if (buffer_new(bh)) {
1749                                 /* blockdev mappings never come here */
1750                                 clear_buffer_new(bh);
1751                                 unmap_underlying_metadata(bh->b_bdev,
1752                                                         bh->b_blocknr);
1753                         }
1754                 }
1755                 bh = bh->b_this_page;
1756                 block++;
1757         } while (bh != head);
1758
             /*
              * Second pass: lock each mapped buffer and flag the dirty ones
              * for async write.  bh equals head again here because the loop
              * above only ends once the ring has wrapped.  Note that
              * "continue" in a do/while jumps to the loop condition, so bh
              * still advances to the next buffer.
              */
1759         do {
1760                 if (!buffer_mapped(bh))
1761                         continue;
1762                 /*
1763                  * If it's a fully non-blocking write attempt and we cannot
1764                  * lock the buffer then redirty the page.  Note that this can
1765                  * potentially cause a busy-wait loop from writeback threads
1766                  * and kswapd activity, but those code paths have their own
1767                  * higher-level throttling.
1768                  */
1769                 if (wbc->sync_mode != WB_SYNC_NONE) {
1770                         lock_buffer(bh);
1771                 } else if (!trylock_buffer(bh)) {
1772                         redirty_page_for_writepage(wbc, page);
1773                         continue;
1774                 }
                     /*
                      * The dirty bit is tested and cleared with the buffer
                      * locked; a buffer that is no longer dirty is simply
                      * unlocked again.
                      */
1775                 if (test_clear_buffer_dirty(bh)) {
1776                         mark_buffer_async_write_endio(bh, handler);
1777                 } else {
1778                         unlock_buffer(bh);
1779                 }
1780         } while ((bh = bh->b_this_page) != head);
1781
1782         /*
1783          * The page and its buffers are protected by PageWriteback(), so we can
1784          * drop the bh refcounts early.
1785          */
1786         BUG_ON(PageWriteback(page));
1787         set_page_writeback(page);
1788
             /*
              * Third pass: submit every buffer flagged for async write.
              * b_this_page is read before the submission so that bh is not
              * dereferenced after submit_bh_wbc() has been called on it.
              */
1789         do {
1790                 struct buffer_head *next = bh->b_this_page;
1791                 if (buffer_async_write(bh)) {
1792                         submit_bh_wbc(write_op, bh, 0, wbc);
1793                         nr_underway++;
1794                 }
1795                 bh = next;
1796         } while (bh != head);
1797         unlock_page(page);
1798
1799         err = 0;
             /*
              * Also reached from the recover path below, in which case err
              * holds the get_block() error.
              */
1800 done:
1801         if (nr_underway == 0) {
1802                 /*
1803                  * The page was marked dirty, but the buffers were
1804                  * clean.  Someone wrote them back by hand with
1805                  * ll_rw_block/submit_bh.  A rare case.
1806                  */
1807                 end_page_writeback(page);
1808
1809                 /*
1810                  * The page and buffer_heads can be released at any time from
1811                  * here on.
1812                  */
1813         }
1814         return err;
1815
1816 recover:
1817         /*
1818          * ENOSPC, or some other error.  We may already have added some
1819          * blocks to the file, so we need to write these out to avoid
1820          * exposing stale data.
1821          * The page is currently locked and not marked for writeback
1822          */
             /* restart from the head: the mapping loop stopped part-way */
1823         bh = head;
1824         /* Recovery: lock and submit the mapped buffers */
1825         do {
1826                 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1827                     !buffer_delay(bh)) {
1828                         lock_buffer(bh);
1829                         mark_buffer_async_write_endio(bh, handler);
1830                 } else {
1831                         /*
1832                          * The buffer may have been set dirty during
1833                          * attachment to a dirty page.
1834                          */
1835                         clear_buffer_dirty(bh);
1836                 }
1837         } while ((bh = bh->b_this_page) != head);
             /* record the failure on both the page and its mapping */
1838         SetPageError(page);
1839         BUG_ON(PageWriteback(page));
1840         mapping_set_error(page->mapping, err);
1841         set_page_writeback(page);
1842         do {
1843                 struct buffer_head *next = bh->b_this_page;
1844                 if (buffer_async_write(bh)) {
                             /*
                              * The loop above left the dirty bit set on the
                              * buffers it flagged; clear it right before
                              * submission.
                              */
1845                         clear_buffer_dirty(bh);
1846                         submit_bh_wbc(write_op, bh, 0, wbc);
1847                         nr_underway++;
1848                 }
1849                 bh = next;
1850         } while (bh != head);
1851         unlock_page(page);
             /* share the nr_underway == 0 handling and the return of err */
1852         goto done;
1853 }
1854
1855 /*
1856  * If a page has any new buffers, zero them out here, and mark them uptodate
1857  * and dirty so they'll be written out (in order to prevent uninitialised
1858  * block data from leaking). And clear the new bit.
1859  */
1860 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1861 {
1862         unsigned int block_start, block_end;
1863         struct buffer_head *head, *bh;
1864
1865         BUG_ON(!PageLocked(page));
1866         if (!page_has_buffers(page))
1867                 return;
1868
1869         bh = head = page_buffers(page);
1870         block_start = 0;
1871         do {
1872                 block_end = block_start + bh->b_size;
1873
1874                 if (buffer_new(bh)) {
1875                         if (block_end > from && block_start < to) {
1876                                 if (!PageUptodate(page)) {
1877                                         unsigned start, size;
1878
1879                                         start = max(from, block_start);
1880                                         size = min(to, block_end) - start;
1881
1882                                         zero_user(page, start, size);
1883                                         set_buffer_uptodate(bh);
1884                                 }
1885
1886                                 clear_buffer_new(bh);
1887                                 mark_buffer_dirty(bh);
1888                         }
1889                 }
1890
1891                 block_start = block_end;
1892                 bh = bh->b_this_page;
1893         } while (bh != head);
1894 }
1895 EXPORT_SYMBOL(page_zero_new_buffers);
1896
1897 int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1898                 get_block_t *get_block)
1899 {
1900         unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1901         unsigned to = from + len;
1902         struct inode *inode = page->mapping->host;
1903         unsigned block_start, block_end;
1904         sector_t block;
1905         int err = 0;
1906         unsigned blocksize, bbits;
1907         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1908
1909         BUG_ON(!PageLocked(page));
1910         BUG_ON(from > PAGE_CACHE_SIZE);
1911         BUG_ON(to > PAGE_CACHE_SIZE);
1912         BUG_ON(from > to);
1913
1914         head = create_page_buffers(page, inode, 0);
1915         blocksize = head->b_size;
1916         bbits = block_size_bits(blocksize);
1917
1918         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1919
1920         for(bh = head, block_start = 0; bh != head || !block_start;
1921             block++, block_start=block_end, bh = bh->b_this_page) {
1922                 block_end = block_start + blocksize;
1923                 if (block_end <= from || block_start >= to) {
1924                         if (PageUptodate(page)) {
1925                                 if (!buffer_uptodate(bh))
1926                                         set_buffer_uptodate(bh);
1927                         }
1928                         continue;
1929                 }
1930                 if (buffer_new(bh))
1931                         clear_buffer_new(bh);
1932                 if (!buffer_mapped(bh)) {
1933                         WARN_ON(bh->b_size != blocksize);
1934                         err = get_block(inode, block, bh, 1);
1935                         if (err)
1936                                 break;
1937                         if (buffer_new(bh)) {
1938                                 unmap_underlying_metadata(bh->b_bdev,
1939                                                         bh->b_blocknr);
1940                                 if (PageUptodate(page)) {
1941                                         clear_buffer_new(bh);
1942                                         set_buffer_uptodate(bh);
1943                                         mark_buffer_dirty(bh);
1944                                         continue;
1945                                 }
1946                                 if (block_end > to || block_start < from)
1947                                         zero_user_segments(page,
1948                                                 to, block_end,
1949                                                 block_start, from);
1950                                 continue;
1951                         }
1952                 }
1953                 if (PageUptodate(page)) {
1954                         if (!buffer_uptodate(bh))
1955                                 set_buffer_uptodate(bh);
1956                         continue; 
1957                 }
1958                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1959                     !buffer_unwritten(bh) &&
1960                      (block_start < from || block_end > to)) {
1961                         ll_rw_block(READ, 1, &bh);
1962                         *wait_bh++=bh;
1963                 }
1964         }
1965         /*
1966          * If we issued read requests - let them complete.
1967          */
1968         while(wait_bh > wait) {
1969                 wait_on_buffer(*--wait_bh);
1970                 if (!buffer_uptodate(*wait_bh))
1971                         err = -EIO;
1972         }
1973         if (unlikely(err))
1974                 page_zero_new_buffers(page, from, to);
1975         return err;
1976 }
1977 EXPORT_SYMBOL(__block_write_begin);
1978
1979 static int __block_commit_write(struct inode *inode, struct page *page,
1980                 unsigned from, unsigned to)
1981 {
1982         unsigned block_start, block_end;
1983         int partial = 0;
1984         unsigned blocksize;
1985         struct buffer_head *bh, *head;
1986
1987         bh = head = page_buffers(page);
1988         blocksize = bh->b_size;
1989
1990         block_start = 0;
1991         do {
1992                 block_end = block_start + blocksize;
1993                 if (block_end <= from || block_start >= to) {
1994                         if (!buffer_uptodate(bh))
1995                                 partial = 1;
1996                 } else {
1997                         set_buffer_uptodate(bh);
1998                         mark_buffer_dirty(bh);
1999                 }
2000                 clear_buffer_new(bh);
2001
2002                 block_start = block_end;
2003                 bh = bh->b_this_page;
2004         } while (bh != head);
2005
2006         /*
2007          * If this is a partial write which happened to make all buffers
2008          * uptodate then we can optimize away a bogus readpage() for
2009          * the next read(). Here we 'discover' whether the page went
2010          * uptodate as a result of this (potentially partial) write.
2011          */
2012         if (!partial)
2013                 SetPageUptodate(page);
2014         return 0;
2015 }
2016
2017 /*
2018  * block_write_begin takes care of the basic task of block allocation and
2019  * bringing partial write blocks uptodate first.
2020  *
2021  * The filesystem needs to handle block truncation upon failure.
2022  */
2023 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2024                 unsigned flags, struct page **pagep, get_block_t *get_block)
2025 {
2026         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2027         struct page *page;
2028         int status;
2029
2030         page = grab_cache_page_write_begin(mapping, index, flags);
2031         if (!page)
2032                 return -ENOMEM;
2033
2034         status = __block_write_begin(page, pos, len, get_block);
2035         if (unlikely(status)) {
2036                 unlock_page(page);
2037                 page_cache_release(page);
2038                 page = NULL;
2039         }
2040
2041         *pagep = page;
2042         return status;
2043 }
2044 EXPORT_SYMBOL(block_write_begin);
2045
2046 int block_write_end(struct file *file, struct address_space *mapping,
2047                         loff_t pos, unsigned len, unsigned copied,
2048                         struct page *page, void *fsdata)
2049 {
2050         struct inode *inode = mapping->host;
2051         unsigned start;
2052
2053         start = pos & (PAGE_CACHE_SIZE - 1);
2054
2055         if (unlikely(copied < len)) {
2056                 /*
2057                  * The buffers that were written will now be uptodate, so we
2058                  * don't have to worry about a readpage reading them and
2059                  * overwriting a partial write. However if we have encountered
2060                  * a short write and only partially written into a buffer, it
2061                  * will not be marked uptodate, so a readpage might come in and
2062                  * destroy our partial write.
2063                  *
2064                  * Do the simplest thing, and just treat any short write to a
2065                  * non uptodate page as a zero-length write, and force the
2066                  * caller to redo the whole thing.
2067                  */
2068                 if (!PageUptodate(page))
2069                         copied = 0;
2070
2071                 page_zero_new_buffers(page, start+copied, start+len);
2072         }
2073         flush_dcache_page(page);
2074
2075         /* This could be a short (even 0-length) commit */
2076         __block_commit_write(inode, page, start, start+copied);
2077
2078         return copied;
2079 }
2080 EXPORT_SYMBOL(block_write_end);
2081
2082 int generic_write_end(struct file *file, struct address_space *mapping,
2083                         loff_t pos, unsigned len, unsigned copied,
2084                         struct page *page, void *fsdata)
2085 {
2086         struct inode *inode = mapping->host;
2087         loff_t old_size = inode->i_size;
2088         int i_size_changed = 0;
2089
2090         copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2091
2092         /*
2093          * No need to use i_size_read() here, the i_size
2094          * cannot change under us because we hold i_mutex.
2095          *
2096          * But it's important to update i_size while still holding page lock:
2097          * page writeout could otherwise come in and zero beyond i_size.
2098          */
2099         if (pos+copied > inode->i_size) {
2100                 i_size_write(inode, pos+copied);
2101                 i_size_changed = 1;
2102         }
2103
2104         unlock_page(page);
2105         page_cache_release(page);
2106
2107         if (old_size < pos)
2108                 pagecache_isize_extended(inode, old_size, pos);
2109         /*
2110          * Don't mark the inode dirty under page lock. First, it unnecessarily
2111          * makes the holding time of page lock longer. Second, it forces lock
2112          * ordering of page lock and transaction start for journaling
2113          * filesystems.
2114          */
2115         if (i_size_changed)
2116                 mark_inode_dirty(inode);
2117
2118         return copied;
2119 }
2120 EXPORT_SYMBOL(generic_write_end);
2121
2122 /*
2123  * block_is_partially_uptodate checks whether buffers within a page are
2124  * uptodate or not.
2125  *
2126  * Returns true if all buffers which correspond to a file portion
2127  * we want to read are uptodate.
2128  */
2129 int block_is_partially_uptodate(struct page *page, unsigned long from,
2130                                         unsigned long count)
2131 {
2132         unsigned block_start, block_end, blocksize;
2133         unsigned to;
2134         struct buffer_head *bh, *head;
2135         int ret = 1;
2136
2137         if (!page_has_buffers(page))
2138                 return 0;
2139
2140         head = page_buffers(page);
2141         blocksize = head->b_size;
2142         to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
2143         to = from + to;
2144         if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2145                 return 0;
2146
2147         bh = head;
2148         block_start = 0;
2149         do {
2150                 block_end = block_start + blocksize;
2151                 if (block_end > from && block_start < to) {
2152                         if (!buffer_uptodate(bh)) {
2153                                 ret = 0;
2154                                 break;
2155                         }
2156                         if (block_end >= to)
2157                                 break;
2158                 }
2159                 block_start = block_end;
2160                 bh = bh->b_this_page;
2161         } while (bh != head);
2162
2163         return ret;
2164 }
2165 EXPORT_SYMBOL(block_is_partially_uptodate);
2166
2167 /*
2168  * Generic "read page" function for block devices that have the normal
2169  * get_block functionality. This is most of the block device filesystems.
2170  * Reads the page asynchronously --- the unlock_buffer() and
2171  * set/clear_buffer_uptodate() functions propagate buffer state into the
2172  * page struct once IO has completed.
2173  */
2174 int block_read_full_page(struct page *page, get_block_t *get_block)
2175 {
2176         struct inode *inode = page->mapping->host;
2177         sector_t iblock, lblock;
2178         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2179         unsigned int blocksize, bbits;
2180         int nr, i;
2181         int fully_mapped = 1;
2182
2183         head = create_page_buffers(page, inode, 0);
2184         blocksize = head->b_size;
2185         bbits = block_size_bits(blocksize);
2186
2187         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2188         lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2189         bh = head;
2190         nr = 0;
2191         i = 0;
2192
2193         do {
2194                 if (buffer_uptodate(bh))
2195                         continue;
2196
2197                 if (!buffer_mapped(bh)) {
2198                         int err = 0;
2199
2200                         fully_mapped = 0;
2201                         if (iblock < lblock) {
2202                                 WARN_ON(bh->b_size != blocksize);
2203                                 err = get_block(inode, iblock, bh, 0);
2204                                 if (err)
2205                                         SetPageError(page);
2206                         }
2207                         if (!buffer_mapped(bh)) {
2208                                 zero_user(page, i * blocksize, blocksize);
2209                                 if (!err)
2210                                         set_buffer_uptodate(bh);
2211                                 continue;
2212                         }
2213                         /*
2214                          * get_block() might have updated the buffer
2215                          * synchronously
2216                          */
2217                         if (buffer_uptodate(bh))
2218                                 continue;
2219                 }
2220                 arr[nr++] = bh;
2221         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2222
2223         if (fully_mapped)
2224                 SetPageMappedToDisk(page);
2225
2226         if (!nr) {
2227                 /*
2228                  * All buffers are uptodate - we can set the page uptodate
2229                  * as well. But not if get_block() returned an error.
2230                  */
2231                 if (!PageError(page))
2232                         SetPageUptodate(page);
2233                 unlock_page(page);
2234                 return 0;
2235         }
2236
2237         /* Stage two: lock the buffers */
2238         for (i = 0; i < nr; i++) {
2239                 bh = arr[i];
2240                 lock_buffer(bh);
2241                 mark_buffer_async_read(bh);
2242         }
2243
2244         /*
2245          * Stage 3: start the IO.  Check for uptodateness
2246          * inside the buffer lock in case another process reading
2247          * the underlying blockdev brought it uptodate (the sct fix).
2248          */
2249         for (i = 0; i < nr; i++) {
2250                 bh = arr[i];
2251                 if (buffer_uptodate(bh))
2252                         end_buffer_async_read(bh, 1);
2253                 else
2254                         submit_bh(READ, bh);
2255         }
2256         return 0;
2257 }
2258 EXPORT_SYMBOL(block_read_full_page);
2259
2260 /* utility function for filesystems that need to do work on expanding
2261  * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2262  * deal with the hole.  
2263  */
2264 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2265 {
2266         struct address_space *mapping = inode->i_mapping;
2267         struct page *page;
2268         void *fsdata;
2269         int err;
2270
2271         err = inode_newsize_ok(inode, size);
2272         if (err)
2273                 goto out;
2274
2275         err = pagecache_write_begin(NULL, mapping, size, 0,
2276                                 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2277                                 &page, &fsdata);
2278         if (err)
2279                 goto out;
2280
2281         err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2282         BUG_ON(err > 0);
2283
2284 out:
2285         return err;
2286 }
2287 EXPORT_SYMBOL(generic_cont_expand_simple);
2288
2289 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2290                             loff_t pos, loff_t *bytes)
2291 {
2292         struct inode *inode = mapping->host;
2293         unsigned blocksize = 1 << inode->i_blkbits;
2294         struct page *page;
2295         void *fsdata;
2296         pgoff_t index, curidx;
2297         loff_t curpos;
2298         unsigned zerofrom, offset, len;
2299         int err = 0;
2300
2301         index = pos >> PAGE_CACHE_SHIFT;
2302         offset = pos & ~PAGE_CACHE_MASK;
2303
2304         while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2305                 zerofrom = curpos & ~PAGE_CACHE_MASK;
2306                 if (zerofrom & (blocksize-1)) {
2307                         *bytes |= (blocksize-1);
2308                         (*bytes)++;
2309                 }
2310                 len = PAGE_CACHE_SIZE - zerofrom;
2311
2312                 err = pagecache_write_begin(file, mapping, curpos, len,
2313                                                 AOP_FLAG_UNINTERRUPTIBLE,
2314                                                 &page, &fsdata);
2315                 if (err)
2316                         goto out;
2317                 zero_user(page, zerofrom, len);
2318                 err = pagecache_write_end(file, mapping, curpos, len, len,
2319                                                 page, fsdata);
2320                 if (err < 0)
2321                         goto out;
2322                 BUG_ON(err != len);
2323                 err = 0;
2324
2325                 balance_dirty_pages_ratelimited(mapping);
2326
2327                 if (unlikely(fatal_signal_pending(current))) {
2328                         err = -EINTR;
2329                         goto out;
2330                 }
2331         }
2332
2333         /* page covers the boundary, find the boundary offset */
2334         if (index == curidx) {
2335                 zerofrom = curpos & ~PAGE_CACHE_MASK;
2336                 /* if we will expand the thing last block will be filled */
2337                 if (offset <= zerofrom) {
2338                         goto out;
2339                 }
2340                 if (zerofrom & (blocksize-1)) {
2341                         *bytes |= (blocksize-1);
2342                         (*bytes)++;
2343                 }
2344                 len = offset - zerofrom;
2345
2346                 err = pagecache_write_begin(file, mapping, curpos, len,
2347                                                 AOP_FLAG_UNINTERRUPTIBLE,
2348                                                 &page, &fsdata);
2349                 if (err)
2350                         goto out;
2351                 zero_user(page, zerofrom, len);
2352                 err = pagecache_write_end(file, mapping, curpos, len, len,
2353                                                 page, fsdata);
2354                 if (err < 0)
2355                         goto out;
2356                 BUG_ON(err != len);
2357                 err = 0;
2358         }
2359 out:
2360         return err;
2361 }
2362
2363 /*
2364  * For moronic filesystems that do not allow holes in file.
2365  * We may have to extend the file.
2366  */
2367 int cont_write_begin(struct file *file, struct address_space *mapping,
2368                         loff_t pos, unsigned len, unsigned flags,
2369                         struct page **pagep, void **fsdata,
2370                         get_block_t *get_block, loff_t *bytes)
2371 {
2372         struct inode *inode = mapping->host;
2373         unsigned blocksize = 1 << inode->i_blkbits;
2374         unsigned zerofrom;
2375         int err;
2376
2377         err = cont_expand_zero(file, mapping, pos, bytes);
2378         if (err)
2379                 return err;
2380
2381         zerofrom = *bytes & ~PAGE_CACHE_MASK;
2382         if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2383                 *bytes |= (blocksize-1);
2384                 (*bytes)++;
2385         }
2386
2387         return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2388 }
2389 EXPORT_SYMBOL(cont_write_begin);
2390
2391 int block_commit_write(struct page *page, unsigned from, unsigned to)
2392 {
2393         struct inode *inode = page->mapping->host;
2394         __block_commit_write(inode,page,from,to);
2395         return 0;
2396 }
2397 EXPORT_SYMBOL(block_commit_write);
2398
2399 /*
2400  * block_page_mkwrite() is not allowed to change the file size as it gets
2401  * called from a page fault handler when a page is first dirtied. Hence we must
2402  * be careful to check for EOF conditions here. We set the page up correctly
2403  * for a written page which means we get ENOSPC checking when writing into
2404  * holes and correct delalloc and unwritten extent mapping on filesystems that
2405  * support these features.
2406  *
2407  * We are not allowed to take the i_mutex here so we have to play games to
2408  * protect against truncate races as the page could now be beyond EOF.  Because
2409  * truncate writes the inode size before removing pages, once we have the
2410  * page lock we can determine safely if the page is beyond EOF. If it is not
2411  * beyond EOF, then the page is guaranteed safe against truncation until we
2412  * unlock the page.
2413  *
2414  * Direct callers of this function should protect against filesystem freezing
2415  * using sb_start_pagefault() - sb_end_pagefault() functions.
2416  */
2417 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2418                          get_block_t get_block)
2419 {
2420         struct page *page = vmf->page;
2421         struct inode *inode = file_inode(vma->vm_file);
2422         unsigned long end;
2423         loff_t size;
2424         int ret;
2425
2426         lock_page(page);
2427         size = i_size_read(inode);
2428         if ((page->mapping != inode->i_mapping) ||
2429             (page_offset(page) > size)) {
2430                 /* We overload EFAULT to mean page got truncated */
2431                 ret = -EFAULT;
2432                 goto out_unlock;
2433         }
2434
2435         /* page is wholly or partially inside EOF */
2436         if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2437                 end = size & ~PAGE_CACHE_MASK;
2438         else
2439                 end = PAGE_CACHE_SIZE;
2440
2441         ret = __block_write_begin(page, 0, end, get_block);
2442         if (!ret)
2443                 ret = block_commit_write(page, 0, end);
2444
2445         if (unlikely(ret < 0))
2446                 goto out_unlock;
2447         set_page_dirty(page);
2448         wait_for_stable_page(page);
2449         return 0;
2450 out_unlock:
2451         unlock_page(page);
2452         return ret;
2453 }
2454 EXPORT_SYMBOL(block_page_mkwrite);
2455
/*
 * End-I/O handler for the prereads issued by nobh_write_begin().  Those
 * buffer_heads are freed immediately, while still under the page lock, so
 * the handler must not touch the bh once it has been unlocked.  All of the
 * work is delegated to __end_buffer_read_notouch().
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}
2465
2466 /*
2467  * Attach the singly-linked list of buffers created by nobh_write_begin, to
2468  * the page (converting it to circular linked list and taking care of page
2469  * dirty races).
2470  */
2471 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2472 {
2473         struct buffer_head *bh;
2474
2475         BUG_ON(!PageLocked(page));
2476
2477         spin_lock(&page->mapping->private_lock);
2478         bh = head;
2479         do {
2480                 if (PageDirty(page))
2481                         set_buffer_dirty(bh);
2482                 if (!bh->b_this_page)
2483                         bh->b_this_page = head;
2484                 bh = bh->b_this_page;
2485         } while (bh != head);
2486         attach_page_buffers(page, head);
2487         spin_unlock(&page->mapping->private_lock);
2488 }
2489
/*
 * On entry, the page is fully not uptodate.
 * On exit the page is fully uptodate in the areas outside (from,to)
 * The filesystem needs to handle block truncation upon failure.
 *
 * @mapping:   address space the page is looked up in
 * @pos:       byte position in the file where the write starts
 * @len:       number of bytes to be written
 * @flags:     passed through to grab_cache_page_write_begin()
 * @pagep:     out: set to the locked page once it has been obtained; reset
 *             to NULL on the error paths that unlock and release the page
 * @fsdata:    out: NULL-terminated list of unattached buffer_heads that
 *             nobh_write_end() releases, or NULL when none were set up here
 * @get_block: filesystem callback used to map each block of the page
 *
 * Returns 0 on success.  On failure returns -ENOMEM, -EIO, or whatever
 * non-zero value get_block() or __block_write_begin() reported.
 */
int nobh_write_begin(struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block)
{
        struct inode *inode = mapping->host;
        const unsigned blkbits = inode->i_blkbits;
        const unsigned blocksize = 1 << blkbits;
        struct buffer_head *head, *bh;
        struct page *page;
        pgoff_t index;
        unsigned from, to;
        unsigned block_in_page;
        unsigned block_start, block_end;
        sector_t block_in_file;
        int nr_reads = 0;
        int ret = 0;
        int is_mapped_to_disk = 1;

        /* Page index, and the [from, to) byte range of the write within it. */
        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;

        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
        *pagep = page;
        *fsdata = NULL;

        /* The page already carries buffers: use the buffer-backed path. */
        if (page_has_buffers(page)) {
                ret = __block_write_begin(page, pos, len, get_block);
                if (unlikely(ret))
                        goto out_release;
                return ret;
        }

        /* Already flagged mapped-to-disk: nothing to set up, *fsdata stays NULL. */
        if (PageMappedToDisk(page))
                return 0;

        /*
         * Allocate buffers so that we can keep track of state, and potentially
         * attach them to the page if an error occurs. In the common case of
         * no error, they will just be freed again without ever being attached
         * to the page (which is all OK, because we're under the page lock).
         *
         * Be careful: the buffer linked list is a NULL terminated one, rather
         * than the circular one we're used to.
         */
        head = alloc_page_buffers(page, blocksize, 0);
        if (!head) {
                ret = -ENOMEM;
                goto out_release;
        }

        /* File-relative number of the first block in this page. */
        block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);

        /*
         * We loop across all blocks in the page, whether or not they are
         * part of the affected region.  This is so we can discover if the
         * page is fully mapped-to-disk.
         */
        for (block_start = 0, block_in_page = 0, bh = head;
                  block_start < PAGE_CACHE_SIZE;
                  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
                int create;

                block_end = block_start + blocksize;
                bh->b_state = 0;
                /* Only request allocation for blocks starting before 'to'. */
                create = 1;
                if (block_start >= to)
                        create = 0;
                ret = get_block(inode, block_in_file + block_in_page,
                                        bh, create);
                if (ret)
                        goto failed;
                if (!buffer_mapped(bh))
                        is_mapped_to_disk = 0;
                if (buffer_new(bh))
                        unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
                if (PageUptodate(page)) {
                        set_buffer_uptodate(bh);
                        continue;
                }
                /* New or unmapped block: zero what the write will not cover. */
                if (buffer_new(bh) || !buffer_mapped(bh)) {
                        zero_user_segments(page, block_start, from,
                                                        to, block_end);
                        continue;
                }
                if (buffer_uptodate(bh))
                        continue;       /* reiserfs does this */
                /* Block only partly covered by the write: read it in first. */
                if (block_start < from || block_end > to) {
                        lock_buffer(bh);
                        bh->b_end_io = end_buffer_read_nobh;
                        submit_bh(READ, bh);
                        nr_reads++;
                }
        }

        if (nr_reads) {
                /*
                 * The page is locked, so these buffers are protected from
                 * any VM or truncate activity.  Hence we don't need to care
                 * for the buffer_head refcounts.
                 */
                for (bh = head; bh; bh = bh->b_this_page) {
                        wait_on_buffer(bh);
                        /* Keep waiting on the rest even after a failure. */
                        if (!buffer_uptodate(bh))
                                ret = -EIO;
                }
                if (ret)
                        goto failed;
        }

        if (is_mapped_to_disk)
                SetPageMappedToDisk(page);

        *fsdata = head; /* to be released by nobh_write_end */

        return 0;

failed:
        BUG_ON(!ret);
        /*
         * Error recovery is a bit difficult. We need to zero out blocks that
         * were newly allocated, and dirty them to ensure they get written out.
         * Buffers need to be attached to the page at this point, otherwise
         * the handling of potential IO errors during writeout would be hard
         * (could try doing synchronous writeout, but what if that fails too?)
         */
        attach_nobh_buffers(page, head);
        page_zero_new_buffers(page, from, to);

out_release:
        /* Common failure exit: drop the page lock and reference, clear *pagep. */
        unlock_page(page);
        page_cache_release(page);
        *pagep = NULL;

        return ret;
}
EXPORT_SYMBOL(nobh_write_begin);
2635
2636 int nobh_write_end(struct file *file, struct address_space *mapping,
2637                         loff_t pos, unsigned len, unsigned copied,
2638                         struct page *page, void *fsdata)
2639 {
2640         struct inode *inode = page->mapping->host;
2641         struct buffer_head *head = fsdata;
2642         struct buffer_head *bh;
2643         BUG_ON(fsdata != NULL && page_has_buffers(page));
2644
2645         if (unlikely(copied < len) && head)
2646                 attach_nobh_buffers(page, head);
2647         if (page_has_buffers(page))
2648                 return generic_write_end(file, mapping, pos, len,
2649                                         copied, page, fsdata);
2650
2651         SetPageUptodate(page);
2652         set_page_dirty(page);
2653         if (pos+copied > inode->i_size) {
2654                 i_size_write(inode, pos+copied);
2655                 mark_inode_dirty(inode);
2656         }
2657
2658         unlock_page(page);
2659         page_cache_release(page);
2660
2661         while (head) {
2662                 bh = head;
2663                 head = head->b_this_page;
2664                 free_buffer_head(bh);
2665         }
2666
2667         return copied;
2668 }
2669 EXPORT_SYMBOL(nobh_write_end);
2670
2671 /*
2672  * nobh_writepage() - based on block_full_write_page() except
2673  * that it tries to operate without attaching bufferheads to
2674  * the page.
2675  */
2676 int nobh_writepage(struct page *page, get_block_t *get_block,
2677                         struct writeback_control *wbc)
2678 {
2679         struct inode * const inode = page->mapping->host;
2680         loff_t i_size = i_size_read(inode);
2681         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2682         unsigned offset;
2683         int ret;
2684
2685         /* Is the page fully inside i_size? */
2686         if (page->index < end_index)
2687                 goto out;
2688
2689         /* Is the page fully outside i_size? (truncate in progress) */
2690         offset = i_size & (PAGE_CACHE_SIZE-1);
2691         if (page->index >= end_index+1 || !offset) {
2692                 /*
2693                  * The page may have dirty, unmapped buffers.  For example,
2694                  * they may have been added in ext3_writepage().  Make them
2695                  * freeable here, so the page does not leak.
2696                  */
2697 #if 0
2698                 /* Not really sure about this  - do we need this ? */
2699                 if (page->mapping->a_ops->invalidatepage)
2700                         page->mapping->a_ops->invalidatepage(page, offset);
2701 #endif
2702                 unlock_page(page);
2703                 return 0; /* don't care */
2704         }
2705
2706         /*
2707          * The page straddles i_size.  It must be zeroed out on each and every
2708          * writepage invocation because it may be mmapped.  "A file is mapped
2709          * in multiples of the page size.  For a file that is not a multiple of
2710          * the  page size, the remaining memory is zeroed when mapped, and
2711          * writes to that region are not written out to the file."
2712          */
2713         zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2714 out:
2715         ret = mpage_writepage(page, get_block, wbc);
2716         if (ret == -EAGAIN)
2717                 ret = __block_write_full_page(inode, page, get_block, wbc,
2718                                               end_buffer_async_write);
2719         return ret;
2720 }
2721 EXPORT_SYMBOL(nobh_writepage);
2722
2723 int nobh_truncate_page(struct address_space *mapping,
2724                         loff_t from, get_block_t *get_block)
2725 {
2726         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2727         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2728         unsigned blocksize;
2729         sector_t iblock;
2730         unsigned length, pos;
2731         struct inode *inode = mapping->host;
2732         struct page *page;
2733         struct buffer_head map_bh;
2734         int err;
2735
2736         blocksize = 1 << inode->i_blkbits;
2737         length = offset & (blocksize - 1);
2738
2739         /* Block boundary? Nothing to do */
2740         if (!length)
2741                 return 0;
2742
2743         length = blocksize - length;
2744         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2745
2746         page = grab_cache_page(mapping, index);
2747         err = -ENOMEM;
2748         if (!page)
2749                 goto out;
2750
2751         if (page_has_buffers(page)) {
2752 has_buffers:
2753                 unlock_page(page);
2754                 page_cache_release(page);
2755                 return block_truncate_page(mapping, from, get_block);
2756         }
2757
2758         /* Find the buffer that contains "offset" */
2759         pos = blocksize;
2760         while (offset >= pos) {
2761                 iblock++;
2762                 pos += blocksize;
2763         }
2764
2765         map_bh.b_size = blocksize;
2766         map_bh.b_state = 0;
2767         err = get_block(inode, iblock, &map_bh, 0);
2768         if (err)
2769                 goto unlock;
2770         /* unmapped? It's a hole - nothing to do */
2771         if (!buffer_mapped(&map_bh))
2772                 goto unlock;
2773
2774         /* Ok, it's mapped. Make sure it's up-to-date */
2775         if (!PageUptodate(page)) {
2776                 err = mapping->a_ops->readpage(NULL, page);
2777                 if (err) {
2778                         page_cache_release(page);
2779                         goto out;
2780                 }
2781                 lock_page(page);
2782                 if (!PageUptodate(page)) {
2783                         err = -EIO;
2784                         goto unlock;
2785                 }
2786                 if (page_has_buffers(page))
2787                         goto has_buffers;
2788         }
2789         zero_user(page, offset, length);
2790         set_page_dirty(page);
2791         err = 0;
2792
2793 unlock:
2794         unlock_page(page);
2795         page_cache_release(page);
2796 out:
2797         return err;
2798 }
2799 EXPORT_SYMBOL(nobh_truncate_page);
2800
2801 int block_truncate_page(struct address_space *mapping,
2802                         loff_t from, get_block_t *get_block)
2803 {
2804         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2805         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2806         unsigned blocksize;
2807         sector_t iblock;
2808         unsigned length, pos;
2809         struct inode *inode = mapping->host;
2810         struct page *page;
2811         struct buffer_head *bh;
2812         int err;
2813
2814         blocksize = 1 << inode->i_blkbits;
2815         length = offset & (blocksize - 1);
2816
2817         /* Block boundary? Nothing to do */
2818         if (!length)
2819                 return 0;
2820
2821         length = blocksize - length;
2822         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2823         
2824         page = grab_cache_page(mapping, index);
2825         err = -ENOMEM;
2826         if (!page)
2827                 goto out;
2828
2829         if (!page_has_buffers(page))
2830                 create_empty_buffers(page, blocksize, 0);
2831
2832         /* Find the buffer that contains "offset" */
2833         bh = page_buffers(page);
2834         pos = blocksize;
2835         while (offset >= pos) {
2836                 bh = bh->b_this_page;
2837                 iblock++;
2838                 pos += blocksize;
2839         }
2840
2841         err = 0;
2842         if (!buffer_mapped(bh)) {
2843                 WARN_ON(bh->b_size != blocksize);
2844                 err = get_block(inode, iblock, bh, 0);
2845                 if (err)
2846                         goto unlock;
2847                 /* unmapped? It's a hole - nothing to do */
2848                 if (!buffer_mapped(bh))
2849                         goto unlock;
2850         }
2851
2852         /* Ok, it's mapped. Make sure it's up-to-date */
2853         if (PageUptodate(page))
2854                 set_buffer_uptodate(bh);
2855
2856         if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2857                 err = -EIO;
2858                 ll_rw_block(READ, 1, &bh);
2859                 wait_on_buffer(bh);
2860                 /* Uhhuh. Read error. Complain and punt. */
2861                 if (!buffer_uptodate(bh))
2862                         goto unlock;
2863         }
2864
2865         zero_user(page, offset, length);
2866         mark_buffer_dirty(bh);
2867         err = 0;
2868
2869 unlock:
2870         unlock_page(page);
2871         page_cache_release(page);
2872 out:
2873         return err;
2874 }
2875 EXPORT_SYMBOL(block_truncate_page);
2876
2877 /*
2878  * The generic ->writepage function for buffer-backed address_spaces
2879  */
2880 int block_write_full_page(struct page *page, get_block_t *get_block,
2881                         struct writeback_control *wbc)
2882 {
2883         struct inode * const inode = page->mapping->host;
2884         loff_t i_size = i_size_read(inode);
2885         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2886         unsigned offset;
2887
2888         /* Is the page fully inside i_size? */
2889         if (page->index < end_index)
2890                 return __block_write_full_page(inode, page, get_block, wbc,
2891                                                end_buffer_async_write);
2892
2893         /* Is the page fully outside i_size? (truncate in progress) */
2894         offset = i_size & (PAGE_CACHE_SIZE-1);
2895         if (page->index >= end_index+1 || !offset) {
2896                 /*
2897                  * The page may have dirty, unmapped buffers.  For example,
2898                  * they may have been added in ext3_writepage().  Make them
2899                  * freeable here, so the page does not leak.
2900                  */
2901                 do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
2902                 unlock_page(page);
2903                 return 0; /* don't care */
2904         }
2905
2906         /*
2907          * The page straddles i_size.  It must be zeroed out on each and every
2908          * writepage invocation because it may be mmapped.  "A file is mapped
2909          * in multiples of the page size.  For a file that is not a multiple of
2910          * the  page size, the remaining memory is zeroed when mapped, and
2911          * writes to that region are not written out to the file."
2912          */
2913         zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2914         return __block_write_full_page(inode, page, get_block, wbc,
2915                                                         end_buffer_async_write);
2916 }
2917 EXPORT_SYMBOL(block_write_full_page);
2918
2919 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2920                             get_block_t *get_block)
2921 {
2922         struct buffer_head tmp;
2923         struct inode *inode = mapping->host;
2924         tmp.b_state = 0;
2925         tmp.b_blocknr = 0;
2926         tmp.b_size = 1 << inode->i_blkbits;
2927         get_block(inode, block, &tmp, 0);
2928         return tmp.b_blocknr;
2929 }
2930 EXPORT_SYMBOL(generic_block_bmap);
2931
2932 static void end_bio_bh_io_sync(struct bio *bio)
2933 {
2934         struct buffer_head *bh = bio->bi_private;
2935
2936         if (unlikely(bio_flagged(bio, BIO_QUIET)))
2937                 set_bit(BH_Quiet, &bh->b_state);
2938
2939         bh->b_end_io(bh, !bio->bi_error);
2940         bio_put(bio);
2941 }
2942
2943 /*
2944  * This allows us to do IO even on the odd last sectors
2945  * of a device, even if the block size is some multiple
2946  * of the physical sector size.
2947  *
2948  * We'll just truncate the bio to the size of the device,
2949  * and clear the end of the buffer head manually.
2950  *
2951  * Truly out-of-range accesses will turn into actual IO
2952  * errors, this only handles the "we need to be able to
2953  * do IO at the final sector" case.
2954  */
2955 void guard_bio_eod(int rw, struct bio *bio)
2956 {
2957         sector_t maxsector;
2958         struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
2959         unsigned truncated_bytes;
2960
2961         maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2962         if (!maxsector)
2963                 return;
2964
2965         /*
2966          * If the *whole* IO is past the end of the device,
2967          * let it through, and the IO layer will turn it into
2968          * an EIO.
2969          */
2970         if (unlikely(bio->bi_iter.bi_sector >= maxsector))
2971                 return;
2972
2973         maxsector -= bio->bi_iter.bi_sector;
2974         if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
2975                 return;
2976
2977         /* Uhhuh. We've got a bio that straddles the device size! */
2978         truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
2979
2980         /* Truncate the bio.. */
2981         bio->bi_iter.bi_size -= truncated_bytes;
2982         bvec->bv_len -= truncated_bytes;
2983
2984         /* ..and clear the end of the buffer for reads */
2985         if ((rw & RW_MASK) == READ) {
2986                 zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
2987                                 truncated_bytes);
2988         }
2989 }
2990
/*
 * Build a one-segment bio describing @bh and submit it.
 *
 * @rw:        request direction/flags handed to submit_bio(); REQ_META and
 *             REQ_PRIO are ORed in when the buffer has the matching state
 * @bh:        buffer to transfer; must be locked and mapped, have a
 *             b_end_io handler, and be neither delayed nor unwritten
 * @bio_flags: ORed into bio->bi_flags
 * @wbc:       optional writeback control; when non-NULL the bio is
 *             initialised from it and the IO is accounted against it
 *
 * Always returns 0.
 */
static int submit_bh_wbc(int rw, struct buffer_head *bh,
                         unsigned long bio_flags, struct writeback_control *wbc)
{
        struct bio *bio;

        BUG_ON(!buffer_locked(bh));
        BUG_ON(!buffer_mapped(bh));
        BUG_ON(!bh->b_end_io);
        BUG_ON(buffer_delay(bh));
        BUG_ON(buffer_unwritten(bh));

        /*
         * Only clear out a write error when rewriting
         */
        if (test_set_buffer_req(bh) && (rw & WRITE))
                clear_buffer_write_io_error(bh);

        /*
         * from here on down, it's all bio -- do the initial mapping,
         * submit_bio -> generic_make_request may further map this bio around
         */
        bio = bio_alloc(GFP_NOIO, 1);

        if (wbc) {
                wbc_init_bio(wbc, bio);
                wbc_account_io(wbc, bh->b_page, bh->b_size);
        }

        /* b_blocknr counts b_size-sized blocks; bi_sector counts 512-byte units. */
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_bdev = bh->b_bdev;

        bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
        /* The whole buffer must have fitted into the bio. */
        BUG_ON(bio->bi_iter.bi_size != bh->b_size);

        bio->bi_end_io = end_bio_bh_io_sync;
        bio->bi_private = bh;
        bio->bi_flags |= bio_flags;

        /* Take care of bh's that straddle the end of the device */
        guard_bio_eod(rw, bio);

        if (buffer_meta(bh))
                rw |= REQ_META;
        if (buffer_prio(bh))
                rw |= REQ_PRIO;

        submit_bio(rw, bio);
        return 0;
}
3040
/*
 * Submit @bh for IO with extra bio flags.
 *
 * Forwards to submit_bh_wbc() with @bio_flags and no writeback_control,
 * and returns its result.
 */
int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
{
        return submit_bh_wbc(rw, bh, bio_flags, NULL);
}
EXPORT_SYMBOL_GPL(_submit_bh);
3046
/*
 * Submit @bh for IO in direction @rw.
 *
 * Forwards to submit_bh_wbc() with no extra bio flags and no
 * writeback_control, and returns its result.
 */
int submit_bh(int rw, struct buffer_head *bh)
{
        return submit_bh_wbc(rw, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);
3052
3053 /**
3054  * ll_rw_block: low-level access to block devices (DEPRECATED)
3055  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
3056  * @nr: number of &struct buffer_heads in the array
3057  * @bhs: array of pointers to &struct buffer_head
3058  *
3059  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3060  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
3061  * %READA option is described in the documentation for generic_make_request()
3062  * which ll_rw_block() calls.
3063  *
3064  * This function drops any buffer that it cannot get a lock on (with the
3065  * BH_Lock state bit), any buffer that appears to be clean when doing a write
3066  * request, and any buffer that appears to be up-to-date when doing read
3067  * request.  Further it marks as clean buffers that are processed for
3068  * writing (the buffer cache won't assume that they are actually clean
3069  * until the buffer gets unlocked).
3070  *
3071  * ll_rw_block sets b_end_io to simple completion handler that marks
3072  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3073  * any waiters. 
3074  *
3075  * All of the buffers must be for the same device, and must also be a
3076  * multiple of the current approved size for the device.
3077  */
3078 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3079 {
3080         int i;
3081
3082         for (i = 0; i < nr; i++) {
3083                 struct buffer_head *bh = bhs[i];
3084
3085                 if (!trylock_buffer(bh))
3086                         continue;
3087                 if (rw == WRITE) {
3088                         if (test_clear_buffer_dirty(bh)) {
3089                                 bh->b_end_io = end_buffer_write_sync;
3090                                 get_bh(bh);
3091                                 submit_bh(WRITE, bh);
3092                                 continue;
3093                         }
3094                 } else {
3095                         if (!buffer_uptodate(bh)) {
3096                                 bh->b_end_io = end_buffer_read_sync;
3097                                 get_bh(bh);
3098                                 submit_bh(rw, bh);
3099                                 continue;
3100                         }
3101                 }
3102                 unlock_buffer(bh);
3103         }
3104 }
3105 EXPORT_SYMBOL(ll_rw_block);
3106
3107 void write_dirty_buffer(struct buffer_head *bh, int rw)
3108 {
3109         lock_buffer(bh);
3110         if (!test_clear_buffer_dirty(bh)) {
3111                 unlock_buffer(bh);
3112                 return;
3113         }
3114         bh->b_end_io = end_buffer_write_sync;
3115         get_bh(bh);
3116         submit_bh(rw, bh);
3117 }
3118 EXPORT_SYMBOL(write_dirty_buffer);
3119
3120 /*
3121  * For a data-integrity writeout, we need to wait upon any in-progress I/O
3122  * and then start new I/O and then wait upon it.  The caller must have a ref on
3123  * the buffer_head.
3124  */
3125 int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3126 {
3127         int ret = 0;
3128
3129         WARN_ON(atomic_read(&bh->b_count) < 1);
3130         lock_buffer(bh);
3131         if (test_clear_buffer_dirty(bh)) {
3132                 get_bh(bh);
3133                 bh->b_end_io = end_buffer_write_sync;
3134                 ret = submit_bh(rw, bh);
3135                 wait_on_buffer(bh);
3136                 if (!ret && !buffer_uptodate(bh))
3137                         ret = -EIO;
3138         } else {
3139                 unlock_buffer(bh);
3140         }
3141         return ret;
3142 }
3143 EXPORT_SYMBOL(__sync_dirty_buffer);
3144
/*
 * Write out @bh if it is dirty and wait for the write to finish.
 *
 * Shorthand for __sync_dirty_buffer() with WRITE_SYNC; returns its result.
 */
int sync_dirty_buffer(struct buffer_head *bh)
{
        return __sync_dirty_buffer(bh, WRITE_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);
3150
3151 /*
3152  * try_to_free_buffers() checks if all the buffers on this particular page
3153  * are unused, and releases them if so.
3154  *
3155  * Exclusion against try_to_free_buffers may be obtained by either
3156  * locking the page or by holding its mapping's private_lock.
3157  *
3158  * If the page is dirty but all the buffers are clean then we need to
3159  * be sure to mark the page clean as well.  This is because the page
3160  * may be against a block device, and a later reattachment of buffers
3161  * to a dirty page will set *all* buffers dirty.  Which would corrupt
3162  * filesystem data on the same device.
3163  *
3164  * The same applies to regular filesystem pages: if all the buffers are
3165  * clean then we set the page clean and proceed.  To do that, we require
3166  * total exclusion from __set_page_dirty_buffers().  That is obtained with
3167  * private_lock.
3168  *
3169  * try_to_free_buffers() is non-blocking.
3170  */
3171 static inline int buffer_busy(struct buffer_head *bh)
3172 {
3173         return atomic_read(&bh->b_count) |
3174                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3175 }
3176
3177 static int
3178 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3179 {
3180         struct buffer_head *head = page_buffers(page);
3181         struct buffer_head *bh;
3182
3183         bh = head;
3184         do {
3185                 if (buffer_write_io_error(bh) && page->mapping)
3186                         set_bit(AS_EIO, &page->mapping->flags);
3187                 if (buffer_busy(bh))
3188                         goto failed;
3189                 bh = bh->b_this_page;
3190         } while (bh != head);
3191
3192         do {
3193                 struct buffer_head *next = bh->b_this_page;
3194
3195                 if (bh->b_assoc_map)
3196                         __remove_assoc_queue(bh);
3197                 bh = next;
3198         } while (bh != head);
3199         *buffers_to_free = head;
3200         __clear_page_buffers(page);
3201         return 1;
3202 failed:
3203         return 0;
3204 }
3205
3206 int try_to_free_buffers(struct page *page)
3207 {
3208         struct address_space * const mapping = page->mapping;
3209         struct buffer_head *buffers_to_free = NULL;
3210         int ret = 0;
3211
3212         BUG_ON(!PageLocked(page));
3213         if (PageWriteback(page))
3214                 return 0;
3215
3216         if (mapping == NULL) {          /* can this still happen? */
3217                 ret = drop_buffers(page, &buffers_to_free);
3218                 goto out;
3219         }
3220
3221         spin_lock(&mapping->private_lock);
3222         ret = drop_buffers(page, &buffers_to_free);
3223
3224         /*
3225          * If the filesystem writes its buffers by hand (eg ext3)
3226          * then we can have clean buffers against a dirty page.  We
3227          * clean the page here; otherwise the VM will never notice
3228          * that the filesystem did any IO at all.
3229          *
3230          * Also, during truncate, discard_buffer will have marked all
3231          * the page's buffers clean.  We discover that here and clean
3232          * the page also.
3233          *
3234          * private_lock must be held over this entire operation in order
3235          * to synchronise against __set_page_dirty_buffers and prevent the
3236          * dirty bit from being lost.
3237          */
3238         if (ret)
3239                 cancel_dirty_page(page);
3240         spin_unlock(&mapping->private_lock);
3241 out:
3242         if (buffers_to_free) {
3243                 struct buffer_head *bh = buffers_to_free;
3244
3245                 do {
3246                         struct buffer_head *next = bh->b_this_page;
3247                         free_buffer_head(bh);
3248                         bh = next;
3249                 } while (bh != buffers_to_free);
3250         }
3251         return ret;
3252 }
3253 EXPORT_SYMBOL(try_to_free_buffers);
3254
3255 /*
3256  * There are no bdflush tunables left.  But distributions are
3257  * still running obsolete flush daemons, so we terminate them here.
3258  *
3259  * Use of bdflush() is deprecated and will be removed in a future kernel.
3260  * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3261  */
3262 SYSCALL_DEFINE2(bdflush, int, func, long, data)
3263 {
3264         static int msg_count;
3265
3266         if (!capable(CAP_SYS_ADMIN))
3267                 return -EPERM;
3268
3269         if (msg_count < 5) {
3270                 msg_count++;
3271                 printk(KERN_INFO
3272                         "warning: process `%s' used the obsolete bdflush"
3273                         " system call\n", current->comm);
3274                 printk(KERN_INFO "Fix your initscripts?\n");
3275         }
3276
3277         if (func == 1)
3278                 do_exit(0);
3279         return 0;
3280 }
3281
/*
 * Buffer-head allocation
 */
/* Slab cache that alloc_buffer_head()/free_buffer_head() allocate from. */
static struct kmem_cache *bh_cachep __read_mostly;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static unsigned long max_buffer_heads;

/* Recomputed by recalc_bh_state(): non-zero while the total exceeds the limit. */
int buffer_heads_over_limit;

/* Per-CPU bookkeeping for buffer_head allocation. */
struct bh_accounting {
        int nr;                 /* Number of live bh's */
        int ratelimit;          /* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3301
3302 static void recalc_bh_state(void)
3303 {
3304         int i;
3305         int tot = 0;
3306
3307         if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3308                 return;
3309         __this_cpu_write(bh_accounting.ratelimit, 0);
3310         for_each_online_cpu(i)
3311                 tot += per_cpu(bh_accounting, i).nr;
3312         buffer_heads_over_limit = (tot > max_buffer_heads);
3313 }
3314
3315 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3316 {
3317         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3318         if (ret) {
3319                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3320                 buffer_head_init_locks(ret);
3321                 preempt_disable();
3322                 __this_cpu_inc(bh_accounting.nr);
3323                 recalc_bh_state();
3324                 preempt_enable();
3325         }
3326         return ret;
3327 }
3328 EXPORT_SYMBOL(alloc_buffer_head);
3329
3330 void free_buffer_head(struct buffer_head *bh)
3331 {
3332         BUG_ON(!list_empty(&bh->b_assoc_buffers));
3333         kmem_cache_free(bh_cachep, bh);
3334         preempt_disable();
3335         __this_cpu_dec(bh_accounting.nr);
3336         recalc_bh_state();
3337         preempt_enable();
3338 }
3339 EXPORT_SYMBOL(free_buffer_head);
3340
3341 static void buffer_exit_cpu(int cpu)
3342 {
3343         int i;
3344         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3345
3346         for (i = 0; i < BH_LRU_SIZE; i++) {
3347                 brelse(b->bhs[i]);
3348                 b->bhs[i] = NULL;
3349         }
3350         this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3351         per_cpu(bh_accounting, cpu).nr = 0;
3352 }
3353
3354 static int buffer_cpu_notify(struct notifier_block *self,
3355                               unsigned long action, void *hcpu)
3356 {
3357         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3358                 buffer_exit_cpu((unsigned long)hcpu);
3359         return NOTIFY_OK;
3360 }
3361
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date, with the buffer unlocked.
 * Return false if it is not, in which case the buffer is left locked
 * for the caller.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
        /* Fast path: no need to take the lock at all. */
        if (buffer_uptodate(bh))
                return 1;

        lock_buffer(bh);

        /* Still stale with the lock held: hand it back locked. */
        if (!buffer_uptodate(bh))
                return 0;

        /* It became up-to-date while we were acquiring the lock. */
        unlock_buffer(bh);
        return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3380
3381 /**
3382  * bh_submit_read - Submit a locked buffer for reading
3383  * @bh: struct buffer_head
3384  *
3385  * Returns zero on success and -EIO on error.
3386  */
3387 int bh_submit_read(struct buffer_head *bh)
3388 {
3389         BUG_ON(!buffer_locked(bh));
3390
3391         if (buffer_uptodate(bh)) {
3392                 unlock_buffer(bh);
3393                 return 0;
3394         }
3395
3396         get_bh(bh);
3397         bh->b_end_io = end_buffer_read_sync;
3398         submit_bh(READ, bh);
3399         wait_on_buffer(bh);
3400         if (buffer_uptodate(bh))
3401                 return 0;
3402         return -EIO;
3403 }
3404 EXPORT_SYMBOL(bh_submit_read);
3405
3406 void __init buffer_init(void)
3407 {
3408         unsigned long nrpages;
3409
3410         bh_cachep = kmem_cache_create("buffer_head",
3411                         sizeof(struct buffer_head), 0,
3412                                 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3413                                 SLAB_MEM_SPREAD),
3414                                 NULL);
3415
3416         /*
3417          * Limit the bh occupancy to 10% of ZONE_NORMAL
3418          */
3419         nrpages = (nr_free_buffer_pages() * 10) / 100;
3420         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3421         hotcpu_notifier(buffer_cpu_notify, 0);
3422 }