1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
16  * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/export.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
44 #include <trace/events/block.h>
45
46 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
47
48 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
49
50 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 {
52         bh->b_end_io = handler;
53         bh->b_private = private;
54 }
55 EXPORT_SYMBOL(init_buffer);
56
57 inline void touch_buffer(struct buffer_head *bh)
58 {
59         trace_block_touch_buffer(bh);
60         mark_page_accessed(bh->b_page);
61 }
62 EXPORT_SYMBOL(touch_buffer);
63
64 void __lock_buffer(struct buffer_head *bh)
65 {
66         wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
67 }
68 EXPORT_SYMBOL(__lock_buffer);
69
70 void unlock_buffer(struct buffer_head *bh)
71 {
72         clear_bit_unlock(BH_Lock, &bh->b_state);
73         smp_mb__after_atomic();
74         wake_up_bit(&bh->b_state, BH_Lock);
75 }
76 EXPORT_SYMBOL(unlock_buffer);
77
78 /*
79  * Returns whether the page has dirty or writeback buffers. If all the buffers
80  * are unlocked and clean then the PageDirty information is stale. If
81  * any of the buffers are locked, it is assumed they are locked for IO.
82  */
83 void buffer_check_dirty_writeback(struct page *page,
84                                      bool *dirty, bool *writeback)
85 {
86         struct buffer_head *head, *bh;
87         *dirty = false;
88         *writeback = false;
89
90         BUG_ON(!PageLocked(page));
91
92         if (!page_has_buffers(page))
93                 return;
94
95         if (PageWriteback(page))
96                 *writeback = true;
97
98         head = page_buffers(page);
99         bh = head;
100         do {
101                 if (buffer_locked(bh))
102                         *writeback = true;
103
104                 if (buffer_dirty(bh))
105                         *dirty = true;
106
107                 bh = bh->b_this_page;
108         } while (bh != head);
109 }
110 EXPORT_SYMBOL(buffer_check_dirty_writeback);
111
112 /*
113  * Block until a buffer comes unlocked.  This doesn't stop it
114  * from becoming locked again - you have to lock it yourself
115  * if you want to preserve its state.
116  */
117 void __wait_on_buffer(struct buffer_head * bh)
118 {
119         wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
120 }
121 EXPORT_SYMBOL(__wait_on_buffer);
122
123 static void
124 __clear_page_buffers(struct page *page)
125 {
126         ClearPagePrivate(page);
127         set_page_private(page, 0);
128         page_cache_release(page);
129 }
130
131 static void buffer_io_error(struct buffer_head *bh, char *msg)
132 {
133         char b[BDEVNAME_SIZE];
134
135         if (!test_bit(BH_Quiet, &bh->b_state))
136                 printk_ratelimited(KERN_ERR
137                         "Buffer I/O error on dev %s, logical block %llu%s\n",
138                         bdevname(bh->b_bdev, b),
139                         (unsigned long long)bh->b_blocknr, msg);
140 }
141
142 /*
143  * End-of-IO handler helper function which does not touch the bh after
144  * unlocking it.
145  * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
146  * a race there is benign: unlock_buffer() only uses the bh's address for
147  * hashing after unlocking the buffer, so it doesn't actually touch the bh
148  * itself.
149  */
150 static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
151 {
152         if (uptodate) {
153                 set_buffer_uptodate(bh);
154         } else {
155                 /* This happens due to failed READA attempts. */
156                 clear_buffer_uptodate(bh);
157         }
158         unlock_buffer(bh);
159 }
160
161 /*
162  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
163  * unlock the buffer. This is what ll_rw_block uses too.
164  */
165 void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
166 {
167         __end_buffer_read_notouch(bh, uptodate);
168         put_bh(bh);
169 }
170 EXPORT_SYMBOL(end_buffer_read_sync);
171
172 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
173 {
174         if (uptodate) {
175                 set_buffer_uptodate(bh);
176         } else {
177                 buffer_io_error(bh, ", lost sync page write");
178                 set_buffer_write_io_error(bh);
179                 clear_buffer_uptodate(bh);
180         }
181         unlock_buffer(bh);
182         put_bh(bh);
183 }
184 EXPORT_SYMBOL(end_buffer_write_sync);
185
186 /*
187  * Various filesystems appear to want __find_get_block to be non-blocking.
188  * But it's the page lock which protects the buffers.  To get around this,
189  * we get exclusion from try_to_free_buffers with the blockdev mapping's
190  * private_lock.
191  *
192  * Hack idea: for the blockdev mapping, private_lock contention
193  * may be quite high.  This code could TryLock the page, and if that
194  * succeeds, there is no need to take private_lock. (But if
195  * private_lock is contended then so is mapping->tree_lock).
196  */
197 static struct buffer_head *
198 __find_get_block_slow(struct block_device *bdev, sector_t block)
199 {
200         struct inode *bd_inode = bdev->bd_inode;
201         struct address_space *bd_mapping = bd_inode->i_mapping;
202         struct buffer_head *ret = NULL;
203         pgoff_t index;
204         struct buffer_head *bh;
205         struct buffer_head *head;
206         struct page *page;
207         int all_mapped = 1;
208
209         index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
210         page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
211         if (!page)
212                 goto out;
213
214         spin_lock(&bd_mapping->private_lock);
215         if (!page_has_buffers(page))
216                 goto out_unlock;
217         head = page_buffers(page);
218         bh = head;
219         do {
220                 if (!buffer_mapped(bh))
221                         all_mapped = 0;
222                 else if (bh->b_blocknr == block) {
223                         ret = bh;
224                         get_bh(bh);
225                         goto out_unlock;
226                 }
227                 bh = bh->b_this_page;
228         } while (bh != head);
229
230         /* We might be here because some of the buffers on this page are
231          * not mapped.  This is due to various races between
232          * file I/O on the block device and getblk().  It gets dealt with
233          * elsewhere; don't report an error here just because some buffers are unmapped.
234          */
235         if (all_mapped) {
236                 char b[BDEVNAME_SIZE];
237
238                 printk("__find_get_block_slow() failed. "
239                         "block=%llu, b_blocknr=%llu\n",
240                         (unsigned long long)block,
241                         (unsigned long long)bh->b_blocknr);
242                 printk("b_state=0x%08lx, b_size=%zu\n",
243                         bh->b_state, bh->b_size);
244                 printk("device %s blocksize: %d\n", bdevname(bdev, b),
245                         1 << bd_inode->i_blkbits);
246         }
247 out_unlock:
248         spin_unlock(&bd_mapping->private_lock);
249         page_cache_release(page);
250 out:
251         return ret;
252 }
253
254 /*
255  * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
256  */
257 static void free_more_memory(void)
258 {
259         struct zone *zone;
260         int nid;
261
262         wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
263         yield();
264
265         for_each_online_node(nid) {
266                 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
267                                                 gfp_zone(GFP_NOFS), NULL,
268                                                 &zone);
269                 if (zone)
270                         try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
271                                                 GFP_NOFS, NULL);
272         }
273 }
274
275 /*
276  * I/O completion handler for block_read_full_page() - pages
277  * which come unlocked at the end of I/O.
278  */
279 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
280 {
281         unsigned long flags;
282         struct buffer_head *first;
283         struct buffer_head *tmp;
284         struct page *page;
285         int page_uptodate = 1;
286
287         BUG_ON(!buffer_async_read(bh));
288
289         page = bh->b_page;
290         if (uptodate) {
291                 set_buffer_uptodate(bh);
292         } else {
293                 clear_buffer_uptodate(bh);
294                 buffer_io_error(bh, ", async page read");
295                 SetPageError(page);
296         }
297
298         /*
299          * Be _very_ careful from here on. Bad things can happen if
300          * two buffer heads end IO at almost the same time and both
301          * decide that the page is now completely done.
302          */
303         first = page_buffers(page);
304         flags = bh_uptodate_lock_irqsave(first);
305         clear_buffer_async_read(bh);
306         unlock_buffer(bh);
307         tmp = bh;
308         do {
309                 if (!buffer_uptodate(tmp))
310                         page_uptodate = 0;
311                 if (buffer_async_read(tmp)) {
312                         BUG_ON(!buffer_locked(tmp));
313                         goto still_busy;
314                 }
315                 tmp = tmp->b_this_page;
316         } while (tmp != bh);
317         bh_uptodate_unlock_irqrestore(first, flags);
318
319         /*
320          * If none of the buffers had errors and they are all
321          * uptodate then we can set the page uptodate.
322          */
323         if (page_uptodate && !PageError(page))
324                 SetPageUptodate(page);
325         unlock_page(page);
326         return;
327
328 still_busy:
329         bh_uptodate_unlock_irqrestore(first, flags);
330 }
331
332 /*
333  * Completion handler for block_write_full_page() - pages which are unlocked
334  * during I/O, and which have PageWriteback cleared upon I/O completion.
335  */
336 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
337 {
338         unsigned long flags;
339         struct buffer_head *first;
340         struct buffer_head *tmp;
341         struct page *page;
342
343         BUG_ON(!buffer_async_write(bh));
344
345         page = bh->b_page;
346         if (uptodate) {
347                 set_buffer_uptodate(bh);
348         } else {
349                 buffer_io_error(bh, ", lost async page write");
350                 set_bit(AS_EIO, &page->mapping->flags);
351                 set_buffer_write_io_error(bh);
352                 clear_buffer_uptodate(bh);
353                 SetPageError(page);
354         }
355
356         first = page_buffers(page);
357         flags = bh_uptodate_lock_irqsave(first);
358
359         clear_buffer_async_write(bh);
360         unlock_buffer(bh);
361         tmp = bh->b_this_page;
362         while (tmp != bh) {
363                 if (buffer_async_write(tmp)) {
364                         BUG_ON(!buffer_locked(tmp));
365                         goto still_busy;
366                 }
367                 tmp = tmp->b_this_page;
368         }
369         bh_uptodate_unlock_irqrestore(first, flags);
370         end_page_writeback(page);
371         return;
372
373 still_busy:
374         bh_uptodate_unlock_irqrestore(first, flags);
375 }
376 EXPORT_SYMBOL(end_buffer_async_write);
377
378 /*
379  * If a page's buffers are under async readin (end_buffer_async_read
380  * completion) then there is a possibility that another thread of
381  * control could lock one of the buffers after it has completed
382  * but while some of the other buffers have not completed.  This
383  * locked buffer would confuse end_buffer_async_read() into not unlocking
384  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
385  * that this buffer is not under async I/O.
386  *
387  * The page comes unlocked when it has no locked buffer_async buffers
388  * left.
389  *
390  * PageLocked prevents anyone from starting new async I/O reads against any of
391  * the buffers.
392  *
393  * PageWriteback is used to prevent simultaneous writeout of the same
394  * page.
395  *
396  * PageLocked prevents anyone from starting writeback of a page which is
397  * under read I/O (PageWriteback is only ever set against a locked page).
398  */
399 static void mark_buffer_async_read(struct buffer_head *bh)
400 {
401         bh->b_end_io = end_buffer_async_read;
402         set_buffer_async_read(bh);
403 }
404
405 static void mark_buffer_async_write_endio(struct buffer_head *bh,
406                                           bh_end_io_t *handler)
407 {
408         bh->b_end_io = handler;
409         set_buffer_async_write(bh);
410 }
411
412 void mark_buffer_async_write(struct buffer_head *bh)
413 {
414         mark_buffer_async_write_endio(bh, end_buffer_async_write);
415 }
416 EXPORT_SYMBOL(mark_buffer_async_write);
417
418
419 /*
420  * fs/buffer.c contains helper functions for buffer-backed address space's
421  * fsync functions.  A common requirement for buffer-based filesystems is
422  * that certain data from the backing blockdev needs to be written out for
423  * a successful fsync().  For example, ext2 indirect blocks need to be
424  * written back and waited upon before fsync() returns.
425  *
426  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
427  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
428  * management of a list of dependent buffers at ->i_mapping->private_list.
429  *
430  * Locking is a little subtle: try_to_free_buffers() will remove buffers
431  * from their controlling inode's queue when they are being freed.  But
432  * try_to_free_buffers() will be operating against the *blockdev* mapping
433  * at the time, not against the S_ISREG file which depends on those buffers.
434  * So the locking for private_list is via the private_lock in the address_space
435  * which backs the buffers.  Which is different from the address_space 
436  * against which the buffers are listed.  So for a particular address_space,
437  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
438  * mapping->private_list will always be protected by the backing blockdev's
439  * ->private_lock.
440  *
441  * Which introduces a requirement: all buffers on an address_space's
442  * ->private_list must be from the same address_space: the blockdev's.
443  *
444  * address_spaces which do not place buffers at ->private_list via these
445  * utility functions are free to use private_lock and private_list for
446  * whatever they want.  The only requirement is that list_empty(private_list)
447  * be true at clear_inode() time.
448  *
449  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
450  * filesystems should do that.  invalidate_inode_buffers() should just go
451  * BUG_ON(!list_empty).
452  *
453  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
454  * take an address_space, not an inode.  And it should be called
455  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
456  * queued up.
457  *
458  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
459  * list if it is already on a list.  Because if the buffer is on a list,
460  * it *must* already be on the right one.  If not, the filesystem is being
461  * silly.  This will save a ton of locking.  But first we have to ensure
462  * that buffers are taken *off* the old inode's list when they are freed
463  * (presumably in truncate).  That requires careful auditing of all
464  * filesystems (do it inside bforget()).  It could also be done by bringing
465  * b_inode back.
466  */
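
/*
 * Illustrative sketch only (not part of the original file): a filesystem
 * whose fsync() depends on a metadata block might attach that block's buffer
 * to the inode's private_list as shown below.  The "myfs_" name and the
 * block number are hypothetical.
 *
 *	struct buffer_head *bh = sb_bread(inode->i_sb, meta_block);
 *
 *	if (bh) {
 *		myfs_update_metadata(bh->b_data);
 *		mark_buffer_dirty_inode(bh, inode);
 *		brelse(bh);
 *	}
 *
 * The buffer itself belongs to the blockdev mapping; only the list linkage
 * lives on this inode's mapping, and it is protected by the blockdev
 * mapping's private_lock as described above.
 */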
467
468 /*
469  * The buffer's backing address_space's private_lock must be held
470  */
471 static void __remove_assoc_queue(struct buffer_head *bh)
472 {
473         list_del_init(&bh->b_assoc_buffers);
474         WARN_ON(!bh->b_assoc_map);
475         if (buffer_write_io_error(bh))
476                 set_bit(AS_EIO, &bh->b_assoc_map->flags);
477         bh->b_assoc_map = NULL;
478 }
479
480 int inode_has_buffers(struct inode *inode)
481 {
482         return !list_empty(&inode->i_data.private_list);
483 }
484
485 /*
486  * osync is designed to support O_SYNC io.  It waits synchronously for
487  * all already-submitted IO to complete, but does not queue any new
488  * writes to the disk.
489  *
490  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
491  * you dirty the buffers, and then use osync_inode_buffers to wait for
492  * completion.  Any other dirty buffers which are not yet queued for
493  * write will not be flushed to disk by the osync.
494  */
495 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
496 {
497         struct buffer_head *bh;
498         struct list_head *p;
499         int err = 0;
500
501         spin_lock(lock);
502 repeat:
503         list_for_each_prev(p, list) {
504                 bh = BH_ENTRY(p);
505                 if (buffer_locked(bh)) {
506                         get_bh(bh);
507                         spin_unlock(lock);
508                         wait_on_buffer(bh);
509                         if (!buffer_uptodate(bh))
510                                 err = -EIO;
511                         brelse(bh);
512                         spin_lock(lock);
513                         goto repeat;
514                 }
515         }
516         spin_unlock(lock);
517         return err;
518 }
519
520 static void do_thaw_one(struct super_block *sb, void *unused)
521 {
522         char b[BDEVNAME_SIZE];
523         while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
524                 printk(KERN_WARNING "Emergency Thaw on %s\n",
525                        bdevname(sb->s_bdev, b));
526 }
527
528 static void do_thaw_all(struct work_struct *work)
529 {
530         iterate_supers(do_thaw_one, NULL);
531         kfree(work);
532         printk(KERN_WARNING "Emergency Thaw complete\n");
533 }
534
535 /**
536  * emergency_thaw_all -- forcibly thaw every frozen filesystem
537  *
538  * Used for emergency unfreeze of all filesystems via SysRq
539  */
540 void emergency_thaw_all(void)
541 {
542         struct work_struct *work;
543
544         work = kmalloc(sizeof(*work), GFP_ATOMIC);
545         if (work) {
546                 INIT_WORK(work, do_thaw_all);
547                 schedule_work(work);
548         }
549 }
550
551 /**
552  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
553  * @mapping: the mapping which wants those buffers written
554  *
555  * Starts I/O against the buffers at mapping->private_list, and waits upon
556  * that I/O.
557  *
558  * Basically, this is a convenience function for fsync().
559  * @mapping is a file or directory which needs those buffers to be written for
560  * a successful fsync().
561  */
562 int sync_mapping_buffers(struct address_space *mapping)
563 {
564         struct address_space *buffer_mapping = mapping->private_data;
565
566         if (buffer_mapping == NULL || list_empty(&mapping->private_list))
567                 return 0;
568
569         return fsync_buffers_list(&buffer_mapping->private_lock,
570                                         &mapping->private_list);
571 }
572 EXPORT_SYMBOL(sync_mapping_buffers);
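
/*
 * Illustrative sketch only: an fsync() implementation built on the
 * private_list machinery typically writes and waits on the data pages first
 * and then calls sync_mapping_buffers() for the associated metadata buffers.
 * The function name below is hypothetical, and locking is omitted.
 *
 *	int myfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 *	{
 *		struct inode *inode = file->f_mapping->host;
 *		int err;
 *
 *		err = filemap_write_and_wait_range(file->f_mapping, start, end);
 *		if (err)
 *			return err;
 *		return sync_mapping_buffers(inode->i_mapping);
 *	}
 */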
573
574 /*
575  * Called when we've recently written block `bblock', and it is known that
576  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
577  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
578  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
579  */
580 void write_boundary_block(struct block_device *bdev,
581                         sector_t bblock, unsigned blocksize)
582 {
583         struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
584         if (bh) {
585                 if (buffer_dirty(bh))
586                         ll_rw_block(WRITE, 1, &bh);
587                 put_bh(bh);
588         }
589 }
590
591 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
592 {
593         struct address_space *mapping = inode->i_mapping;
594         struct address_space *buffer_mapping = bh->b_page->mapping;
595
596         mark_buffer_dirty(bh);
597         if (!mapping->private_data) {
598                 mapping->private_data = buffer_mapping;
599         } else {
600                 BUG_ON(mapping->private_data != buffer_mapping);
601         }
602         if (!bh->b_assoc_map) {
603                 spin_lock(&buffer_mapping->private_lock);
604                 list_move_tail(&bh->b_assoc_buffers,
605                                 &mapping->private_list);
606                 bh->b_assoc_map = mapping;
607                 spin_unlock(&buffer_mapping->private_lock);
608         }
609 }
610 EXPORT_SYMBOL(mark_buffer_dirty_inode);
611
612 /*
613  * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
614  * dirty.
615  *
616  * If warn is true, then emit a warning if the page is not uptodate and has
617  * not been truncated.
618  */
619 static void __set_page_dirty(struct page *page,
620                 struct address_space *mapping, int warn)
621 {
622         unsigned long flags;
623
624         spin_lock_irqsave(&mapping->tree_lock, flags);
625         if (page->mapping) {    /* Race with truncate? */
626                 WARN_ON_ONCE(warn && !PageUptodate(page));
627                 account_page_dirtied(page, mapping);
628                 radix_tree_tag_set(&mapping->page_tree,
629                                 page_index(page), PAGECACHE_TAG_DIRTY);
630         }
631         spin_unlock_irqrestore(&mapping->tree_lock, flags);
632         __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
633 }
634
635 /*
636  * Add a page to the dirty page list.
637  *
638  * It is a sad fact of life that this function is called from several places
639  * deeply under spinlocking.  It may not sleep.
640  *
641  * If the page has buffers, the uptodate buffers are set dirty, to preserve
642  * dirty-state coherency between the page and the buffers.  If the page does
643  * not have buffers then when they are later attached they will all be set
644  * dirty.
645  *
646  * The buffers are dirtied before the page is dirtied.  There's a small race
647  * window in which a writepage caller may see the page cleanness but not the
648  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
649  * before the buffers, a concurrent writepage caller could clear the page dirty
650  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
651  * page on the dirty page list.
652  *
653  * We use private_lock to lock against try_to_free_buffers while using the
654  * page's buffer list.  Also use this to protect against clean buffers being
655  * added to the page after it was set dirty.
656  *
657  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
658  * address_space though.
659  */
660 int __set_page_dirty_buffers(struct page *page)
661 {
662         int newly_dirty;
663         struct address_space *mapping = page_mapping(page);
664
665         if (unlikely(!mapping))
666                 return !TestSetPageDirty(page);
667
668         spin_lock(&mapping->private_lock);
669         if (page_has_buffers(page)) {
670                 struct buffer_head *head = page_buffers(page);
671                 struct buffer_head *bh = head;
672
673                 do {
674                         set_buffer_dirty(bh);
675                         bh = bh->b_this_page;
676                 } while (bh != head);
677         }
678         newly_dirty = !TestSetPageDirty(page);
679         spin_unlock(&mapping->private_lock);
680
681         if (newly_dirty)
682                 __set_page_dirty(page, mapping, 1);
683         return newly_dirty;
684 }
685 EXPORT_SYMBOL(__set_page_dirty_buffers);
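
/*
 * Illustrative note: for a buffer-backed mapping without its own handler,
 * the VM's set_page_dirty() falls back to this function; a filesystem can
 * also name it explicitly in its address_space_operations.  The aops table
 * below is hypothetical.
 *
 *	static const struct address_space_operations myfs_aops = {
 *		.readpage	= myfs_readpage,
 *		.writepage	= myfs_writepage,
 *		.set_page_dirty	= __set_page_dirty_buffers,
 *	};
 */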
686
687 /*
688  * Write out and wait upon a list of buffers.
689  *
690  * We have conflicting pressures: we want to make sure that all
691  * initially dirty buffers get waited on, but that any subsequently
692  * dirtied buffers don't.  After all, we don't want fsync to last
693  * forever if somebody is actively writing to the file.
694  *
695  * Do this in two main stages: first we copy dirty buffers to a
696  * temporary inode list, queueing the writes as we go.  Then we clean
697  * up, waiting for those writes to complete.
698  * 
699  * During this second stage, any subsequent updates to the file may end
700  * up refiling the buffer on the original inode's dirty list again, so
701  * there is a chance we will end up with a buffer queued for write but
702  * not yet completed on that list.  So, as a final cleanup we go through
703  * the osync code to catch these locked, dirty buffers without requeuing
704  * any newly dirty buffers for write.
705  */
706 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
707 {
708         struct buffer_head *bh;
709         struct list_head tmp;
710         struct address_space *mapping;
711         int err = 0, err2;
712         struct blk_plug plug;
713
714         INIT_LIST_HEAD(&tmp);
715         blk_start_plug(&plug);
716
717         spin_lock(lock);
718         while (!list_empty(list)) {
719                 bh = BH_ENTRY(list->next);
720                 mapping = bh->b_assoc_map;
721                 __remove_assoc_queue(bh);
722                 /* Avoid race with mark_buffer_dirty_inode() which does
723                  * a lockless check and we rely on seeing the dirty bit */
724                 smp_mb();
725                 if (buffer_dirty(bh) || buffer_locked(bh)) {
726                         list_add(&bh->b_assoc_buffers, &tmp);
727                         bh->b_assoc_map = mapping;
728                         if (buffer_dirty(bh)) {
729                                 get_bh(bh);
730                                 spin_unlock(lock);
731                                 /*
732                                  * Ensure any pending I/O completes so that
733                                  * write_dirty_buffer() actually writes the
734                                  * current contents - it is a noop if I/O is
735                                  * still in flight on potentially older
736                                  * contents.
737                                  */
738                                 write_dirty_buffer(bh, WRITE_SYNC);
739
740                                 /*
741                                  * Kick off IO for the previous mapping. Note
742                                  * that we will not run the very last mapping,
743                                  * wait_on_buffer() will do that for us
744                                  * through sync_buffer().
745                                  */
746                                 brelse(bh);
747                                 spin_lock(lock);
748                         }
749                 }
750         }
751
752         spin_unlock(lock);
753         blk_finish_plug(&plug);
754         spin_lock(lock);
755
756         while (!list_empty(&tmp)) {
757                 bh = BH_ENTRY(tmp.prev);
758                 get_bh(bh);
759                 mapping = bh->b_assoc_map;
760                 __remove_assoc_queue(bh);
761                 /* Avoid race with mark_buffer_dirty_inode() which does
762                  * a lockless check and we rely on seeing the dirty bit */
763                 smp_mb();
764                 if (buffer_dirty(bh)) {
765                         list_add(&bh->b_assoc_buffers,
766                                  &mapping->private_list);
767                         bh->b_assoc_map = mapping;
768                 }
769                 spin_unlock(lock);
770                 wait_on_buffer(bh);
771                 if (!buffer_uptodate(bh))
772                         err = -EIO;
773                 brelse(bh);
774                 spin_lock(lock);
775         }
776         
777         spin_unlock(lock);
778         err2 = osync_buffers_list(lock, list);
779         if (err)
780                 return err;
781         else
782                 return err2;
783 }
784
785 /*
786  * Invalidate any and all dirty buffers on a given inode.  We are
787  * probably unmounting the fs, but that doesn't mean we have already
788  * done a sync().  Just drop the buffers from the inode list.
789  *
790  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
791  * assumes that all the buffers are against the blockdev.  Not true
792  * for reiserfs.
793  */
794 void invalidate_inode_buffers(struct inode *inode)
795 {
796         if (inode_has_buffers(inode)) {
797                 struct address_space *mapping = &inode->i_data;
798                 struct list_head *list = &mapping->private_list;
799                 struct address_space *buffer_mapping = mapping->private_data;
800
801                 spin_lock(&buffer_mapping->private_lock);
802                 while (!list_empty(list))
803                         __remove_assoc_queue(BH_ENTRY(list->next));
804                 spin_unlock(&buffer_mapping->private_lock);
805         }
806 }
807 EXPORT_SYMBOL(invalidate_inode_buffers);
808
809 /*
810  * Remove any clean buffers from the inode's buffer list.  This is called
811  * when we're trying to free the inode itself.  Those buffers can pin it.
812  *
813  * Returns true if all buffers were removed.
814  */
815 int remove_inode_buffers(struct inode *inode)
816 {
817         int ret = 1;
818
819         if (inode_has_buffers(inode)) {
820                 struct address_space *mapping = &inode->i_data;
821                 struct list_head *list = &mapping->private_list;
822                 struct address_space *buffer_mapping = mapping->private_data;
823
824                 spin_lock(&buffer_mapping->private_lock);
825                 while (!list_empty(list)) {
826                         struct buffer_head *bh = BH_ENTRY(list->next);
827                         if (buffer_dirty(bh)) {
828                                 ret = 0;
829                                 break;
830                         }
831                         __remove_assoc_queue(bh);
832                 }
833                 spin_unlock(&buffer_mapping->private_lock);
834         }
835         return ret;
836 }
837
838 /*
839  * Create the appropriate buffers when given a page for data area and
840  * the size of each buffer.. Use the bh->b_this_page linked list to
841  * follow the buffers created.  Return NULL if unable to create more
842  * buffers.
843  *
844  * The retry flag is used to differentiate async IO (paging, swapping)
845  * which may not fail, from ordinary buffer allocations.
846  */
847 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
848                 int retry)
849 {
850         struct buffer_head *bh, *head;
851         long offset;
852
853 try_again:
854         head = NULL;
855         offset = PAGE_SIZE;
856         while ((offset -= size) >= 0) {
857                 bh = alloc_buffer_head(GFP_NOFS);
858                 if (!bh)
859                         goto no_grow;
860
861                 bh->b_this_page = head;
862                 bh->b_blocknr = -1;
863                 head = bh;
864
865                 bh->b_size = size;
866
867                 /* Link the buffer to its page */
868                 set_bh_page(bh, page, offset);
869         }
870         return head;
871 /*
872  * In case anything failed, we just free everything we got.
873  */
874 no_grow:
875         if (head) {
876                 do {
877                         bh = head;
878                         head = head->b_this_page;
879                         free_buffer_head(bh);
880                 } while (head);
881         }
882
883         /*
884          * Return failure for non-async IO requests.  Async IO requests
885          * are not allowed to fail, so we have to wait until buffer heads
886          * become available.  But we don't want tasks sleeping with 
887          * partially complete buffers, so all were released above.
888          */
889         if (!retry)
890                 return NULL;
891
892         /* We're _really_ low on memory. Now we just
893          * wait for old buffer heads to become free due to
894          * finishing IO.  Since this is an async request and
895          * the reserve list is empty, we're sure there are 
896          * async buffer heads in use.
897          */
898         free_more_memory();
899         goto try_again;
900 }
901 EXPORT_SYMBOL_GPL(alloc_page_buffers);
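
/*
 * Illustrative note: the canonical caller is create_empty_buffers(), further
 * down in this file, which allocates the heads and then links them to the
 * page.  A minimal (hypothetical) call looks like:
 *
 *	struct buffer_head *head = alloc_page_buffers(page, blocksize, 1);
 *
 * where retry == 1 means "loop (via free_more_memory()) until buffer heads
 * become available" rather than returning NULL.
 */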
902
903 static inline void
904 link_dev_buffers(struct page *page, struct buffer_head *head)
905 {
906         struct buffer_head *bh, *tail;
907
908         bh = head;
909         do {
910                 tail = bh;
911                 bh = bh->b_this_page;
912         } while (bh);
913         tail->b_this_page = head;
914         attach_page_buffers(page, head);
915 }
916
917 static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
918 {
919         sector_t retval = ~((sector_t)0);
920         loff_t sz = i_size_read(bdev->bd_inode);
921
922         if (sz) {
923                 unsigned int sizebits = blksize_bits(size);
924                 retval = (sz >> sizebits);
925         }
926         return retval;
927 }
928
929 /*
930  * Initialise the state of a blockdev page's buffers.
931  */ 
932 static sector_t
933 init_page_buffers(struct page *page, struct block_device *bdev,
934                         sector_t block, int size)
935 {
936         struct buffer_head *head = page_buffers(page);
937         struct buffer_head *bh = head;
938         int uptodate = PageUptodate(page);
939         sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
940
941         do {
942                 if (!buffer_mapped(bh)) {
943                         init_buffer(bh, NULL, NULL);
944                         bh->b_bdev = bdev;
945                         bh->b_blocknr = block;
946                         if (uptodate)
947                                 set_buffer_uptodate(bh);
948                         if (block < end_block)
949                                 set_buffer_mapped(bh);
950                 }
951                 block++;
952                 bh = bh->b_this_page;
953         } while (bh != head);
954
955         /*
956          * Caller needs to validate requested block against end of device.
957          */
958         return end_block;
959 }
960
961 /*
962  * Create the page-cache page that contains the requested block.
963  *
964  * This is used purely for blockdev mappings.
965  */
966 static int
967 grow_dev_page(struct block_device *bdev, sector_t block,
968               pgoff_t index, int size, int sizebits, gfp_t gfp)
969 {
970         struct inode *inode = bdev->bd_inode;
971         struct page *page;
972         struct buffer_head *bh;
973         sector_t end_block;
974         int ret = 0;            /* Will call free_more_memory() */
975         gfp_t gfp_mask;
976
977         gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
978
979         /*
980          * XXX: __getblk_slow() can not really deal with failure and
981          * will endlessly loop on improvised global reclaim.  Prefer
982          * looping in the allocator rather than here, at least that
983          * code knows what it's doing.
984          */
985         gfp_mask |= __GFP_NOFAIL;
986
987         page = find_or_create_page(inode->i_mapping, index, gfp_mask);
988         if (!page)
989                 return ret;
990
991         BUG_ON(!PageLocked(page));
992
993         if (page_has_buffers(page)) {
994                 bh = page_buffers(page);
995                 if (bh->b_size == size) {
996                         end_block = init_page_buffers(page, bdev,
997                                                 (sector_t)index << sizebits,
998                                                 size);
999                         goto done;
1000                 }
1001                 if (!try_to_free_buffers(page))
1002                         goto failed;
1003         }
1004
1005         /*
1006          * Allocate some buffers for this page
1007          */
1008         bh = alloc_page_buffers(page, size, 0);
1009         if (!bh)
1010                 goto failed;
1011
1012         /*
1013          * Link the page to the buffers and initialise them.  Take the
1014          * lock to be atomic wrt __find_get_block(), which does not
1015          * run under the page lock.
1016          */
1017         spin_lock(&inode->i_mapping->private_lock);
1018         link_dev_buffers(page, bh);
1019         end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
1020                         size);
1021         spin_unlock(&inode->i_mapping->private_lock);
1022 done:
1023         ret = (block < end_block) ? 1 : -ENXIO;
1024 failed:
1025         unlock_page(page);
1026         page_cache_release(page);
1027         return ret;
1028 }
1029
1030 /*
1031  * Create buffers for the specified block device block's page.  If
1032  * that page was dirty, the buffers are set dirty also.
1033  */
1034 static int
1035 grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
1036 {
1037         pgoff_t index;
1038         int sizebits;
1039
1040         sizebits = -1;
1041         do {
1042                 sizebits++;
1043         } while ((size << sizebits) < PAGE_SIZE);
1044
1045         index = block >> sizebits;
1046
1047         /*
1048          * Check for a block which wants to lie outside our maximum possible
1049          * pagecache index.  (this comparison is done using sector_t types).
1050          */
1051         if (unlikely(index != block >> sizebits)) {
1052                 char b[BDEVNAME_SIZE];
1053
1054                 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1055                         "device %s\n",
1056                         __func__, (unsigned long long)block,
1057                         bdevname(bdev, b));
1058                 return -EIO;
1059         }
1060
1061         /* Create a page with the proper size buffers.. */
1062         return grow_dev_page(bdev, block, index, size, sizebits, gfp);
1063 }
1064
1065 struct buffer_head *
1066 __getblk_slow(struct block_device *bdev, sector_t block,
1067              unsigned size, gfp_t gfp)
1068 {
1069         /* Size must be multiple of hard sectorsize */
1070         if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1071                         (size < 512 || size > PAGE_SIZE))) {
1072                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1073                                         size);
1074                 printk(KERN_ERR "logical block size: %d\n",
1075                                         bdev_logical_block_size(bdev));
1076
1077                 dump_stack();
1078                 return NULL;
1079         }
1080
1081         for (;;) {
1082                 struct buffer_head *bh;
1083                 int ret;
1084
1085                 bh = __find_get_block(bdev, block, size);
1086                 if (bh)
1087                         return bh;
1088
1089                 ret = grow_buffers(bdev, block, size, gfp);
1090                 if (ret < 0)
1091                         return NULL;
1092                 if (ret == 0)
1093                         free_more_memory();
1094         }
1095 }
1096 EXPORT_SYMBOL(__getblk_slow);
1097
1098 /*
1099  * The relationship between dirty buffers and dirty pages:
1100  *
1101  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1102  * the page is tagged dirty in its radix tree.
1103  *
1104  * At all times, the dirtiness of the buffers represents the dirtiness of
1105  * subsections of the page.  If the page has buffers, the page dirty bit is
1106  * merely a hint about the true dirty state.
1107  *
1108  * When a page is set dirty in its entirety, all its buffers are marked dirty
1109  * (if the page has buffers).
1110  *
1111  * When a buffer is marked dirty, its page is dirtied, but the page's other
1112  * buffers are not.
1113  *
1114  * Also.  When blockdev buffers are explicitly read with bread(), they
1115  * individually become uptodate.  But their backing page remains not
1116  * uptodate - even if all of its buffers are uptodate.  A subsequent
1117  * block_read_full_page() against that page will discover all the uptodate
1118  * buffers, will set the page uptodate and will perform no I/O.
1119  */
1120
1121 /**
1122  * mark_buffer_dirty - mark a buffer_head as needing writeout
1123  * @bh: the buffer_head to mark dirty
1124  *
1125  * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1126  * backing page dirty, then tag the page as dirty in its address_space's radix
1127  * tree and then attach the address_space's inode to its superblock's dirty
1128  * inode list.
1129  *
1130  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
1131  * mapping->tree_lock and mapping->host->i_lock.
1132  */
1133 void mark_buffer_dirty(struct buffer_head *bh)
1134 {
1135         WARN_ON_ONCE(!buffer_uptodate(bh));
1136
1137         trace_block_dirty_buffer(bh);
1138
1139         /*
1140          * Very *carefully* optimize the it-is-already-dirty case.
1141          *
1142          * Don't let the final "is it dirty" escape to before we
1143          * perhaps modified the buffer.
1144          */
1145         if (buffer_dirty(bh)) {
1146                 smp_mb();
1147                 if (buffer_dirty(bh))
1148                         return;
1149         }
1150
1151         if (!test_set_buffer_dirty(bh)) {
1152                 struct page *page = bh->b_page;
1153                 if (!TestSetPageDirty(page)) {
1154                         struct address_space *mapping = page_mapping(page);
1155                         if (mapping)
1156                                 __set_page_dirty(page, mapping, 0);
1157                 }
1158         }
1159 }
1160 EXPORT_SYMBOL(mark_buffer_dirty);
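
/*
 * Illustrative sketch only: the usual read-modify-write pattern for a
 * metadata block.  "sb", "block", "offset", "src" and "len" are hypothetical.
 *
 *	struct buffer_head *bh = sb_bread(sb, block);
 *
 *	if (!bh)
 *		return -EIO;
 *	memcpy(bh->b_data + offset, src, len);
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 */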
1161
1162 /*
1163  * Decrement a buffer_head's reference count.  If all buffers against a page
1164  * have zero reference count, are clean and unlocked, and if the page is clean
1165  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1166  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1167  * a page but it ends up not being freed, and buffers may later be reattached).
1168  */
1169 void __brelse(struct buffer_head * buf)
1170 {
1171         if (atomic_read(&buf->b_count)) {
1172                 put_bh(buf);
1173                 return;
1174         }
1175         WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1176 }
1177 EXPORT_SYMBOL(__brelse);
1178
1179 /*
1180  * bforget() is like brelse(), except it discards any
1181  * potentially dirty data.
1182  */
1183 void __bforget(struct buffer_head *bh)
1184 {
1185         clear_buffer_dirty(bh);
1186         if (bh->b_assoc_map) {
1187                 struct address_space *buffer_mapping = bh->b_page->mapping;
1188
1189                 spin_lock(&buffer_mapping->private_lock);
1190                 list_del_init(&bh->b_assoc_buffers);
1191                 bh->b_assoc_map = NULL;
1192                 spin_unlock(&buffer_mapping->private_lock);
1193         }
1194         __brelse(bh);
1195 }
1196 EXPORT_SYMBOL(__bforget);
1197
1198 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1199 {
1200         lock_buffer(bh);
1201         if (buffer_uptodate(bh)) {
1202                 unlock_buffer(bh);
1203                 return bh;
1204         } else {
1205                 get_bh(bh);
1206                 bh->b_end_io = end_buffer_read_sync;
1207                 submit_bh(READ, bh);
1208                 wait_on_buffer(bh);
1209                 if (buffer_uptodate(bh))
1210                         return bh;
1211         }
1212         brelse(bh);
1213         return NULL;
1214 }
1215
1216 /*
1217  * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
1218  * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
1219  * refcount elevated by one when they're in an LRU.  A buffer can only appear
1220  * once in a particular CPU's LRU.  A single buffer can be present in multiple
1221  * CPU's LRUs at the same time.
1222  *
1223  * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1224  * sb_find_get_block().
1225  *
1226  * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
1227  * a local interrupt disable for that.
1228  */
1229
1230 #define BH_LRU_SIZE     16
1231
1232 struct bh_lru {
1233         struct buffer_head *bhs[BH_LRU_SIZE];
1234 };
1235
1236 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1237
1238 #ifdef CONFIG_SMP
1239 #define bh_lru_lock()   local_irq_disable()
1240 #define bh_lru_unlock() local_irq_enable()
1241 #else
1242 #define bh_lru_lock()   preempt_disable()
1243 #define bh_lru_unlock() preempt_enable()
1244 #endif
1245
1246 static inline void check_irqs_on(void)
1247 {
1248 #ifdef irqs_disabled
1249         BUG_ON(irqs_disabled());
1250 #endif
1251 }
1252
1253 /*
1254  * The LRU management algorithm is dopey-but-simple.  Sorry.
1255  */
1256 static void bh_lru_install(struct buffer_head *bh)
1257 {
1258         struct buffer_head *evictee = NULL;
1259
1260         check_irqs_on();
1261         bh_lru_lock();
1262         if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
1263                 struct buffer_head *bhs[BH_LRU_SIZE];
1264                 int in;
1265                 int out = 0;
1266
1267                 get_bh(bh);
1268                 bhs[out++] = bh;
1269                 for (in = 0; in < BH_LRU_SIZE; in++) {
1270                         struct buffer_head *bh2 =
1271                                 __this_cpu_read(bh_lrus.bhs[in]);
1272
1273                         if (bh2 == bh) {
1274                                 __brelse(bh2);
1275                         } else {
1276                                 if (out >= BH_LRU_SIZE) {
1277                                         BUG_ON(evictee != NULL);
1278                                         evictee = bh2;
1279                                 } else {
1280                                         bhs[out++] = bh2;
1281                                 }
1282                         }
1283                 }
1284                 while (out < BH_LRU_SIZE)
1285                         bhs[out++] = NULL;
1286                 memcpy(this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
1287         }
1288         bh_lru_unlock();
1289
1290         if (evictee)
1291                 __brelse(evictee);
1292 }
1293
1294 /*
1295  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1296  */
1297 static struct buffer_head *
1298 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1299 {
1300         struct buffer_head *ret = NULL;
1301         unsigned int i;
1302
1303         check_irqs_on();
1304         bh_lru_lock();
1305         for (i = 0; i < BH_LRU_SIZE; i++) {
1306                 struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1307
1308                 if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
1309                     bh->b_size == size) {
1310                         if (i) {
1311                                 while (i) {
1312                                         __this_cpu_write(bh_lrus.bhs[i],
1313                                                 __this_cpu_read(bh_lrus.bhs[i - 1]));
1314                                         i--;
1315                                 }
1316                                 __this_cpu_write(bh_lrus.bhs[0], bh);
1317                         }
1318                         get_bh(bh);
1319                         ret = bh;
1320                         break;
1321                 }
1322         }
1323         bh_lru_unlock();
1324         return ret;
1325 }
1326
1327 /*
1328  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1329  * it in the LRU and mark it as accessed.  If it is not present then return
1330  * NULL
1331  */
1332 struct buffer_head *
1333 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1334 {
1335         struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1336
1337         if (bh == NULL) {
1338                 /* __find_get_block_slow will mark the page accessed */
1339                 bh = __find_get_block_slow(bdev, block);
1340                 if (bh)
1341                         bh_lru_install(bh);
1342         } else
1343                 touch_buffer(bh);
1344
1345         return bh;
1346 }
1347 EXPORT_SYMBOL(__find_get_block);
1348
1349 /*
1350  * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
1351  * which corresponds to the passed block_device, block and size. The
1352  * returned buffer has its reference count incremented.
1353  *
1354  * __getblk_gfp() will lock up the machine if grow_dev_page's
1355  * try_to_free_buffers() attempt is failing.  FIXME, perhaps?
1356  */
1357 struct buffer_head *
1358 __getblk_gfp(struct block_device *bdev, sector_t block,
1359              unsigned size, gfp_t gfp)
1360 {
1361         struct buffer_head *bh = __find_get_block(bdev, block, size);
1362
1363         might_sleep();
1364         if (bh == NULL)
1365                 bh = __getblk_slow(bdev, block, size, gfp);
1366         return bh;
1367 }
1368 EXPORT_SYMBOL(__getblk_gfp);
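
/*
 * Illustrative sketch only: __getblk()/sb_getblk() are appropriate when the
 * caller will overwrite the whole block and so does not need to read its
 * current contents, e.g. initialising a freshly allocated block
 * (hypothetical caller):
 *
 *	struct buffer_head *bh = sb_getblk(sb, block);
 *
 *	if (!bh)
 *		return -ENOMEM;
 *	lock_buffer(bh);
 *	memset(bh->b_data, 0, bh->b_size);
 *	set_buffer_uptodate(bh);
 *	unlock_buffer(bh);
 *	mark_buffer_dirty(bh);
 *	brelse(bh);
 */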
1369
1370 /*
1371  * Do async read-ahead on a buffer..
1372  */
1373 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1374 {
1375         struct buffer_head *bh = __getblk(bdev, block, size);
1376         if (likely(bh)) {
1377                 ll_rw_block(READA, 1, &bh);
1378                 brelse(bh);
1379         }
1380 }
1381 EXPORT_SYMBOL(__breadahead);
1382
1383 /**
1384  *  __bread_gfp() - reads a specified block and returns the bh
1385  *  @bdev: the block_device to read from
1386  *  @block: number of block
1387  *  @size: size (in bytes) to read
1388  *  @gfp: page allocation flag
1389  *
1390  *  Reads a specified block, and returns the buffer head that contains it.
1391  *  If @gfp is zero, the page cache page is allocated from the non-movable
1392  *  area so that the long-lived buffer does not get in the way of page migration.
1393  *  It returns NULL if the block was unreadable.
1394  */
1395 struct buffer_head *
1396 __bread_gfp(struct block_device *bdev, sector_t block,
1397                    unsigned size, gfp_t gfp)
1398 {
1399         struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
1400
1401         if (likely(bh) && !buffer_uptodate(bh))
1402                 bh = __bread_slow(bh);
1403         return bh;
1404 }
1405 EXPORT_SYMBOL(__bread_gfp);
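
/*
 * Illustrative sketch only: reading a block through the wrappers built on
 * __bread_gfp() ("sb", "block" and "myfs_parse" are hypothetical):
 *
 *	struct buffer_head *bh = sb_bread(sb, block);
 *
 *	if (!bh)
 *		return -EIO;
 *	myfs_parse(bh->b_data, bh->b_size);
 *	brelse(bh);
 *
 * A NULL return means the block could not be read; otherwise the caller owns
 * a reference and must drop it with brelse().
 */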
1406
1407 /*
1408  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1409  * This doesn't race because it runs in each cpu either in irq
1410  * or with preempt disabled.
1411  */
1412 static void invalidate_bh_lru(void *arg)
1413 {
1414         struct bh_lru *b = &get_cpu_var(bh_lrus);
1415         int i;
1416
1417         for (i = 0; i < BH_LRU_SIZE; i++) {
1418                 brelse(b->bhs[i]);
1419                 b->bhs[i] = NULL;
1420         }
1421         put_cpu_var(bh_lrus);
1422 }
1423
1424 static bool has_bh_in_lru(int cpu, void *dummy)
1425 {
1426         struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
1427         int i;
1428         
1429         for (i = 0; i < BH_LRU_SIZE; i++) {
1430                 if (b->bhs[i])
1431                         return 1;
1432         }
1433
1434         return 0;
1435 }
1436
1437 void invalidate_bh_lrus(void)
1438 {
1439         on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL);
1440 }
1441 EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1442
1443 void set_bh_page(struct buffer_head *bh,
1444                 struct page *page, unsigned long offset)
1445 {
1446         bh->b_page = page;
1447         BUG_ON(offset >= PAGE_SIZE);
1448         if (PageHighMem(page))
1449                 /*
1450                  * This catches illegal uses and preserves the offset:
1451                  */
1452                 bh->b_data = (char *)(0 + offset);
1453         else
1454                 bh->b_data = page_address(page) + offset;
1455 }
1456 EXPORT_SYMBOL(set_bh_page);
1457
1458 /*
1459  * Called when truncating a buffer on a page completely.
1460  */
1461
1462 /* Bits that are cleared during an invalidate */
1463 #define BUFFER_FLAGS_DISCARD \
1464         (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
1465          1 << BH_Delay | 1 << BH_Unwritten)
1466
1467 static void discard_buffer(struct buffer_head * bh)
1468 {
1469         unsigned long b_state, b_state_old;
1470
1471         lock_buffer(bh);
1472         clear_buffer_dirty(bh);
1473         bh->b_bdev = NULL;
1474         b_state = bh->b_state;
1475         for (;;) {
1476                 b_state_old = cmpxchg(&bh->b_state, b_state,
1477                                       (b_state & ~BUFFER_FLAGS_DISCARD));
1478                 if (b_state_old == b_state)
1479                         break;
1480                 b_state = b_state_old;
1481         }
1482         unlock_buffer(bh);
1483 }
1484
1485 /**
1486  * block_invalidatepage - invalidate part or all of a buffer-backed page
1487  *
1488  * @page: the page which is affected
1489  * @offset: start of the range to invalidate
1490  * @length: length of the range to invalidate
1491  *
1492  * block_invalidatepage() is called when all or part of the page has become
1493  * invalidated by a truncate operation.
1494  *
1495  * block_invalidatepage() does not have to release all buffers, but it must
1496  * ensure that no dirty buffer is left outside @offset and that no I/O
1497  * is underway against any of the blocks which are outside the truncation
1498  * point.  Because the caller is about to free (and possibly reuse) those
1499  * blocks on-disk.
1500  */
1501 void block_invalidatepage(struct page *page, unsigned int offset,
1502                           unsigned int length)
1503 {
1504         struct buffer_head *head, *bh, *next;
1505         unsigned int curr_off = 0;
1506         unsigned int stop = length + offset;
1507
1508         BUG_ON(!PageLocked(page));
1509         if (!page_has_buffers(page))
1510                 goto out;
1511
1512         /*
1513          * Check for overflow
1514          */
1515         BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
1516
1517         head = page_buffers(page);
1518         bh = head;
1519         do {
1520                 unsigned int next_off = curr_off + bh->b_size;
1521                 next = bh->b_this_page;
1522
1523                 /*
1524                  * Are we still fully in range ?
1525                  */
1526                 if (next_off > stop)
1527                         goto out;
1528
1529                 /*
1530                  * is this block fully invalidated?
1531                  */
1532                 if (offset <= curr_off)
1533                         discard_buffer(bh);
1534                 curr_off = next_off;
1535                 bh = next;
1536         } while (bh != head);
1537
1538         /*
1539          * We release buffers only if the entire page is being invalidated.
1540          * The get_block cached value has been unconditionally invalidated,
1541          * so real IO is not possible anymore.
1542          */
1543         if (offset == 0)
1544                 try_to_release_page(page, 0);
1545 out:
1546         return;
1547 }
1548 EXPORT_SYMBOL(block_invalidatepage);
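
/*
 * Illustrative sketch, not part of the original file: a buffer-backed
 * filesystem can usually point its ->invalidatepage method straight at
 * block_invalidatepage().  The hypothetical myfs_invalidatepage() below only
 * shows where a filesystem would drop its own per-page state first; "myfs"
 * is an assumed name, not real kernel code.
 */
#if 0
static void myfs_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length)
{
	/* tear down any myfs-private bookkeeping for this range here ... */
	block_invalidatepage(page, offset, length);
}
#endif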
1549
1550
1551 /*
1552  * We attach and possibly dirty the buffers atomically wrt
1553  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1554  * is already excluded via the page lock.
1555  */
1556 void create_empty_buffers(struct page *page,
1557                         unsigned long blocksize, unsigned long b_state)
1558 {
1559         struct buffer_head *bh, *head, *tail;
1560
1561         head = alloc_page_buffers(page, blocksize, 1);
1562         bh = head;
1563         do {
1564                 bh->b_state |= b_state;
1565                 tail = bh;
1566                 bh = bh->b_this_page;
1567         } while (bh);
1568         tail->b_this_page = head;
1569
1570         spin_lock(&page->mapping->private_lock);
1571         if (PageUptodate(page) || PageDirty(page)) {
1572                 bh = head;
1573                 do {
1574                         if (PageDirty(page))
1575                                 set_buffer_dirty(bh);
1576                         if (PageUptodate(page))
1577                                 set_buffer_uptodate(bh);
1578                         bh = bh->b_this_page;
1579                 } while (bh != head);
1580         }
1581         attach_page_buffers(page, head);
1582         spin_unlock(&page->mapping->private_lock);
1583 }
1584 EXPORT_SYMBOL(create_empty_buffers);
1585
1586 /*
1587  * We are taking a block for data and we don't want any output from any
1588  * buffer-cache aliases from the moment this function returns until
1589  * something explicitly marks the buffer dirty (hopefully that will not
1590  * happen until we free that block ;-)
1591  * We don't even need to mark it not-uptodate - nobody can expect
1592  * anything from a newly allocated buffer anyway. We used to use
1593  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1594  * don't want to mark the alias unmapped, for example - it would confuse
1595  * anyone who might pick it with bread() afterwards...
1596  *
1597  * Also, note that bforget() doesn't lock the buffer.  So there can
1598  * be writeout I/O going on against recently-freed buffers.  We don't
1599  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1600  * only if we really need to.  That happens here.
1601  */
1602 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1603 {
1604         struct buffer_head *old_bh;
1605
1606         might_sleep();
1607
1608         old_bh = __find_get_block_slow(bdev, block);
1609         if (old_bh) {
1610                 clear_buffer_dirty(old_bh);
1611                 wait_on_buffer(old_bh);
1612                 clear_buffer_req(old_bh);
1613                 __brelse(old_bh);
1614         }
1615 }
1616 EXPORT_SYMBOL(unmap_underlying_metadata);
1617
1618 /*
1619  * Size is a power-of-two in the range 512..PAGE_SIZE,
1620  * and the case we care about most is PAGE_SIZE.
1621  *
1622  * So this *could* possibly be written with those
1623  * constraints in mind (relevant mostly if some
1624  * architecture has a slow bit-scan instruction)
1625  */
1626 static inline int block_size_bits(unsigned int blocksize)
1627 {
1628         return ilog2(blocksize);
1629 }
1630
1631 static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
1632 {
1633         BUG_ON(!PageLocked(page));
1634
1635         if (!page_has_buffers(page))
1636                 create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
1637         return page_buffers(page);
1638 }
1639
1640 /*
1641  * NOTE! All mapped/uptodate combinations are valid:
1642  *
1643  *      Mapped  Uptodate        Meaning
1644  *
1645  *      No      No              "unknown" - must do get_block()
1646  *      No      Yes             "hole" - zero-filled
1647  *      Yes     No              "allocated" - allocated on disk, not read in
1648  *      Yes     Yes             "valid" - allocated and up-to-date in memory.
1649  *
1650  * "Dirty" is valid only with the last case (mapped+uptodate).
1651  */
1652
1653 /*
1654  * While block_write_full_page is writing back the dirty buffers under
1655  * the page lock, whoever dirtied the buffers may decide to clean them
1656  * again at any time.  We handle that by only looking at the buffer
1657  * state inside lock_buffer().
1658  *
1659  * If block_write_full_page() is called for regular writeback
1660  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1661  * locked buffer.  This can only happen if someone has written the buffer
1662  * directly, with submit_bh().  At the address_space level PageWriteback
1663  * prevents this contention from occurring.
1664  *
1665  * If block_write_full_page() is called with wbc->sync_mode ==
1666  * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
1667  * causes the writes to be flagged as synchronous writes.
1668  */
1669 static int __block_write_full_page(struct inode *inode, struct page *page,
1670                         get_block_t *get_block, struct writeback_control *wbc,
1671                         bh_end_io_t *handler)
1672 {
1673         int err;
1674         sector_t block;
1675         sector_t last_block;
1676         struct buffer_head *bh, *head;
1677         unsigned int blocksize, bbits;
1678         int nr_underway = 0;
1679         int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
1680                         WRITE_SYNC : WRITE);
1681
1682         head = create_page_buffers(page, inode,
1683                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1684
1685         /*
1686          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1687          * here, and the (potentially unmapped) buffers may become dirty at
1688          * any time.  If a buffer becomes dirty here after we've inspected it
1689          * then we just miss that fact, and the page stays dirty.
1690          *
1691          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1692          * handle that here by just cleaning them.
1693          */
1694
1695         bh = head;
1696         blocksize = bh->b_size;
1697         bbits = block_size_bits(blocksize);
1698
1699         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1700         last_block = (i_size_read(inode) - 1) >> bbits;
1701
1702         /*
1703          * Get all the dirty buffers mapped to disk addresses and
1704          * handle any aliases from the underlying blockdev's mapping.
1705          */
1706         do {
1707                 if (block > last_block) {
1708                         /*
1709                          * mapped buffers outside i_size will occur, because
1710                          * this page can be outside i_size when there is a
1711                          * truncate in progress.
1712                          */
1713                         /*
1714                          * The buffer was zeroed by block_write_full_page()
1715                          */
1716                         clear_buffer_dirty(bh);
1717                         set_buffer_uptodate(bh);
1718                 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1719                            buffer_dirty(bh)) {
1720                         WARN_ON(bh->b_size != blocksize);
1721                         err = get_block(inode, block, bh, 1);
1722                         if (err)
1723                                 goto recover;
1724                         clear_buffer_delay(bh);
1725                         if (buffer_new(bh)) {
1726                                 /* blockdev mappings never come here */
1727                                 clear_buffer_new(bh);
1728                                 unmap_underlying_metadata(bh->b_bdev,
1729                                                         bh->b_blocknr);
1730                         }
1731                 }
1732                 bh = bh->b_this_page;
1733                 block++;
1734         } while (bh != head);
1735
1736         do {
1737                 if (!buffer_mapped(bh))
1738                         continue;
1739                 /*
1740                  * If it's a fully non-blocking write attempt and we cannot
1741                  * lock the buffer then redirty the page.  Note that this can
1742                  * potentially cause a busy-wait loop from writeback threads
1743                  * and kswapd activity, but those code paths have their own
1744                  * higher-level throttling.
1745                  */
1746                 if (wbc->sync_mode != WB_SYNC_NONE) {
1747                         lock_buffer(bh);
1748                 } else if (!trylock_buffer(bh)) {
1749                         redirty_page_for_writepage(wbc, page);
1750                         continue;
1751                 }
1752                 if (test_clear_buffer_dirty(bh)) {
1753                         mark_buffer_async_write_endio(bh, handler);
1754                 } else {
1755                         unlock_buffer(bh);
1756                 }
1757         } while ((bh = bh->b_this_page) != head);
1758
1759         /*
1760          * The page and its buffers are protected by PageWriteback(), so we can
1761          * drop the bh refcounts early.
1762          */
1763         BUG_ON(PageWriteback(page));
1764         set_page_writeback(page);
1765
1766         do {
1767                 struct buffer_head *next = bh->b_this_page;
1768                 if (buffer_async_write(bh)) {
1769                         submit_bh(write_op, bh);
1770                         nr_underway++;
1771                 }
1772                 bh = next;
1773         } while (bh != head);
1774         unlock_page(page);
1775
1776         err = 0;
1777 done:
1778         if (nr_underway == 0) {
1779                 /*
1780                  * The page was marked dirty, but the buffers were
1781                  * clean.  Someone wrote them back by hand with
1782                  * ll_rw_block/submit_bh.  A rare case.
1783                  */
1784                 end_page_writeback(page);
1785
1786                 /*
1787                  * The page and buffer_heads can be released at any time from
1788                  * here on.
1789                  */
1790         }
1791         return err;
1792
1793 recover:
1794         /*
1795          * ENOSPC, or some other error.  We may already have added some
1796          * blocks to the file, so we need to write these out to avoid
1797          * exposing stale data.
1798          * The page is currently locked and not marked for writeback
1799          */
1800         bh = head;
1801         /* Recovery: lock and submit the mapped buffers */
1802         do {
1803                 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1804                     !buffer_delay(bh)) {
1805                         lock_buffer(bh);
1806                         mark_buffer_async_write_endio(bh, handler);
1807                 } else {
1808                         /*
1809                          * The buffer may have been set dirty during
1810                          * attachment to a dirty page.
1811                          */
1812                         clear_buffer_dirty(bh);
1813                 }
1814         } while ((bh = bh->b_this_page) != head);
1815         SetPageError(page);
1816         BUG_ON(PageWriteback(page));
1817         mapping_set_error(page->mapping, err);
1818         set_page_writeback(page);
1819         do {
1820                 struct buffer_head *next = bh->b_this_page;
1821                 if (buffer_async_write(bh)) {
1822                         clear_buffer_dirty(bh);
1823                         submit_bh(write_op, bh);
1824                         nr_underway++;
1825                 }
1826                 bh = next;
1827         } while (bh != head);
1828         unlock_page(page);
1829         goto done;
1830 }
1831
1832 /*
1833  * If a page has any new buffers, zero them out here, and mark them uptodate
1834  * and dirty so they'll be written out (in order to prevent uninitialised
1835  * block data from leaking). And clear the new bit.
1836  */
1837 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1838 {
1839         unsigned int block_start, block_end;
1840         struct buffer_head *head, *bh;
1841
1842         BUG_ON(!PageLocked(page));
1843         if (!page_has_buffers(page))
1844                 return;
1845
1846         bh = head = page_buffers(page);
1847         block_start = 0;
1848         do {
1849                 block_end = block_start + bh->b_size;
1850
1851                 if (buffer_new(bh)) {
1852                         if (block_end > from && block_start < to) {
1853                                 if (!PageUptodate(page)) {
1854                                         unsigned start, size;
1855
1856                                         start = max(from, block_start);
1857                                         size = min(to, block_end) - start;
1858
1859                                         zero_user(page, start, size);
1860                                         set_buffer_uptodate(bh);
1861                                 }
1862
1863                                 clear_buffer_new(bh);
1864                                 mark_buffer_dirty(bh);
1865                         }
1866                 }
1867
1868                 block_start = block_end;
1869                 bh = bh->b_this_page;
1870         } while (bh != head);
1871 }
1872 EXPORT_SYMBOL(page_zero_new_buffers);
1873
1874 int __block_write_begin(struct page *page, loff_t pos, unsigned len,
1875                 get_block_t *get_block)
1876 {
1877         unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1878         unsigned to = from + len;
1879         struct inode *inode = page->mapping->host;
1880         unsigned block_start, block_end;
1881         sector_t block;
1882         int err = 0;
1883         unsigned blocksize, bbits;
1884         struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1885
1886         BUG_ON(!PageLocked(page));
1887         BUG_ON(from > PAGE_CACHE_SIZE);
1888         BUG_ON(to > PAGE_CACHE_SIZE);
1889         BUG_ON(from > to);
1890
1891         head = create_page_buffers(page, inode, 0);
1892         blocksize = head->b_size;
1893         bbits = block_size_bits(blocksize);
1894
1895         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1896
1897         for (bh = head, block_start = 0; bh != head || !block_start;
1898             block++, block_start = block_end, bh = bh->b_this_page) {
1899                 block_end = block_start + blocksize;
1900                 if (block_end <= from || block_start >= to) {
1901                         if (PageUptodate(page)) {
1902                                 if (!buffer_uptodate(bh))
1903                                         set_buffer_uptodate(bh);
1904                         }
1905                         continue;
1906                 }
1907                 if (buffer_new(bh))
1908                         clear_buffer_new(bh);
1909                 if (!buffer_mapped(bh)) {
1910                         WARN_ON(bh->b_size != blocksize);
1911                         err = get_block(inode, block, bh, 1);
1912                         if (err)
1913                                 break;
1914                         if (buffer_new(bh)) {
1915                                 unmap_underlying_metadata(bh->b_bdev,
1916                                                         bh->b_blocknr);
1917                                 if (PageUptodate(page)) {
1918                                         clear_buffer_new(bh);
1919                                         set_buffer_uptodate(bh);
1920                                         mark_buffer_dirty(bh);
1921                                         continue;
1922                                 }
1923                                 if (block_end > to || block_start < from)
1924                                         zero_user_segments(page,
1925                                                 to, block_end,
1926                                                 block_start, from);
1927                                 continue;
1928                         }
1929                 }
1930                 if (PageUptodate(page)) {
1931                         if (!buffer_uptodate(bh))
1932                                 set_buffer_uptodate(bh);
1933                         continue;
1934                 }
1935                 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1936                     !buffer_unwritten(bh) &&
1937                      (block_start < from || block_end > to)) {
1938                         ll_rw_block(READ, 1, &bh);
1939                         *wait_bh++ = bh;
1940                 }
1941         }
1942         /*
1943          * If we issued read requests - let them complete.
1944          */
1945         while (wait_bh > wait) {
1946                 wait_on_buffer(*--wait_bh);
1947                 if (!buffer_uptodate(*wait_bh))
1948                         err = -EIO;
1949         }
1950         if (unlikely(err))
1951                 page_zero_new_buffers(page, from, to);
1952         return err;
1953 }
1954 EXPORT_SYMBOL(__block_write_begin);
1955
1956 static int __block_commit_write(struct inode *inode, struct page *page,
1957                 unsigned from, unsigned to)
1958 {
1959         unsigned block_start, block_end;
1960         int partial = 0;
1961         unsigned blocksize;
1962         struct buffer_head *bh, *head;
1963
1964         bh = head = page_buffers(page);
1965         blocksize = bh->b_size;
1966
1967         block_start = 0;
1968         do {
1969                 block_end = block_start + blocksize;
1970                 if (block_end <= from || block_start >= to) {
1971                         if (!buffer_uptodate(bh))
1972                                 partial = 1;
1973                 } else {
1974                         set_buffer_uptodate(bh);
1975                         mark_buffer_dirty(bh);
1976                 }
1977                 clear_buffer_new(bh);
1978
1979                 block_start = block_end;
1980                 bh = bh->b_this_page;
1981         } while (bh != head);
1982
1983         /*
1984          * If this is a partial write which happened to make all buffers
1985          * uptodate then we can optimize away a bogus readpage() for
1986          * the next read(). Here we 'discover' whether the page went
1987          * uptodate as a result of this (potentially partial) write.
1988          */
1989         if (!partial)
1990                 SetPageUptodate(page);
1991         return 0;
1992 }
1993
1994 /*
1995  * block_write_begin takes care of the basic task of block allocation and
1996  * bringing partial write blocks uptodate first.
1997  *
1998  * The filesystem needs to handle block truncation upon failure.
1999  */
2000 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
2001                 unsigned flags, struct page **pagep, get_block_t *get_block)
2002 {
2003         pgoff_t index = pos >> PAGE_CACHE_SHIFT;
2004         struct page *page;
2005         int status;
2006
2007         page = grab_cache_page_write_begin(mapping, index, flags);
2008         if (!page)
2009                 return -ENOMEM;
2010
2011         status = __block_write_begin(page, pos, len, get_block);
2012         if (unlikely(status)) {
2013                 unlock_page(page);
2014                 page_cache_release(page);
2015                 page = NULL;
2016         }
2017
2018         *pagep = page;
2019         return status;
2020 }
2021 EXPORT_SYMBOL(block_write_begin);
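
/*
 * Illustrative sketch, not part of the original file: a typical ->write_begin
 * implementation is a thin wrapper around block_write_begin().
 * myfs_get_block() and myfs_write_failed() are hypothetical; the latter
 * stands in for the block-truncation-on-failure handling that the comment
 * above says the filesystem must provide.
 */
#if 0
static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	int ret;

	ret = block_write_begin(mapping, pos, len, flags, pagep,
				myfs_get_block);
	if (ret < 0)
		myfs_write_failed(mapping, pos + len); /* undo allocation past EOF */
	return ret;
}
#endif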
2022
2023 int block_write_end(struct file *file, struct address_space *mapping,
2024                         loff_t pos, unsigned len, unsigned copied,
2025                         struct page *page, void *fsdata)
2026 {
2027         struct inode *inode = mapping->host;
2028         unsigned start;
2029
2030         start = pos & (PAGE_CACHE_SIZE - 1);
2031
2032         if (unlikely(copied < len)) {
2033                 /*
2034                  * The buffers that were written will now be uptodate, so we
2035                  * don't have to worry about a readpage reading them and
2036                  * overwriting a partial write. However if we have encountered
2037                  * a short write and only partially written into a buffer, it
2038                  * will not be marked uptodate, so a readpage might come in and
2039                  * destroy our partial write.
2040                  *
2041                  * Do the simplest thing, and just treat any short write to a
2042  * non-uptodate page as a zero-length write, and force the
2043                  * caller to redo the whole thing.
2044                  */
2045                 if (!PageUptodate(page))
2046                         copied = 0;
2047
2048                 page_zero_new_buffers(page, start+copied, start+len);
2049         }
2050         flush_dcache_page(page);
2051
2052         /* This could be a short (even 0-length) commit */
2053         __block_commit_write(inode, page, start, start+copied);
2054
2055         return copied;
2056 }
2057 EXPORT_SYMBOL(block_write_end);
2058
2059 int generic_write_end(struct file *file, struct address_space *mapping,
2060                         loff_t pos, unsigned len, unsigned copied,
2061                         struct page *page, void *fsdata)
2062 {
2063         struct inode *inode = mapping->host;
2064         loff_t old_size = inode->i_size;
2065         int i_size_changed = 0;
2066
2067         copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2068
2069         /*
2070          * No need to use i_size_read() here, the i_size
2071          * cannot change under us because we hold i_mutex.
2072          *
2073          * But it's important to update i_size while still holding page lock:
2074          * page writeout could otherwise come in and zero beyond i_size.
2075          */
2076         if (pos+copied > inode->i_size) {
2077                 i_size_write(inode, pos+copied);
2078                 i_size_changed = 1;
2079         }
2080
2081         unlock_page(page);
2082         page_cache_release(page);
2083
2084         if (old_size < pos)
2085                 pagecache_isize_extended(inode, old_size, pos);
2086         /*
2087          * Don't mark the inode dirty under page lock. First, it unnecessarily
2088          * makes the holding time of page lock longer. Second, it forces lock
2089          * ordering of page lock and transaction start for journaling
2090          * filesystems.
2091          */
2092         if (i_size_changed)
2093                 mark_inode_dirty(inode);
2094
2095         return copied;
2096 }
2097 EXPORT_SYMBOL(generic_write_end);
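
/*
 * Illustrative sketch, not part of the original file: filesystems that build
 * on block_write_begin() commonly install generic_write_end() directly as
 * their ->write_end, since it already handles the i_size update shown above.
 * myfs_write_begin() refers to the hypothetical wrapper sketched after
 * block_write_begin(); myfs_aops is an assumed name.
 */
#if 0
static const struct address_space_operations myfs_aops = {
	.write_begin	= myfs_write_begin,
	.write_end	= generic_write_end,
};
#endif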
2098
2099 /*
2100  * block_is_partially_uptodate checks whether buffers within a page are
2101  * uptodate or not.
2102  *
2103  * Returns true if all buffers which correspond to a file portion
2104  * we want to read are uptodate.
2105  */
2106 int block_is_partially_uptodate(struct page *page, unsigned long from,
2107                                         unsigned long count)
2108 {
2109         unsigned block_start, block_end, blocksize;
2110         unsigned to;
2111         struct buffer_head *bh, *head;
2112         int ret = 1;
2113
2114         if (!page_has_buffers(page))
2115                 return 0;
2116
2117         head = page_buffers(page);
2118         blocksize = head->b_size;
2119         to = min_t(unsigned, PAGE_CACHE_SIZE - from, count);
2120         to = from + to;
2121         if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2122                 return 0;
2123
2124         bh = head;
2125         block_start = 0;
2126         do {
2127                 block_end = block_start + blocksize;
2128                 if (block_end > from && block_start < to) {
2129                         if (!buffer_uptodate(bh)) {
2130                                 ret = 0;
2131                                 break;
2132                         }
2133                         if (block_end >= to)
2134                                 break;
2135                 }
2136                 block_start = block_end;
2137                 bh = bh->b_this_page;
2138         } while (bh != head);
2139
2140         return ret;
2141 }
2142 EXPORT_SYMBOL(block_is_partially_uptodate);
2143
2144 /*
2145  * Generic "read page" function for block devices that have the normal
2146  * get_block functionality. This is most of the block device filesystems.
2147  * Reads the page asynchronously --- the unlock_buffer() and
2148  * set/clear_buffer_uptodate() functions propagate buffer state into the
2149  * page struct once IO has completed.
2150  */
2151 int block_read_full_page(struct page *page, get_block_t *get_block)
2152 {
2153         struct inode *inode = page->mapping->host;
2154         sector_t iblock, lblock;
2155         struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2156         unsigned int blocksize, bbits;
2157         int nr, i;
2158         int fully_mapped = 1;
2159
2160         head = create_page_buffers(page, inode, 0);
2161         blocksize = head->b_size;
2162         bbits = block_size_bits(blocksize);
2163
2164         iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
2165         lblock = (i_size_read(inode)+blocksize-1) >> bbits;
2166         bh = head;
2167         nr = 0;
2168         i = 0;
2169
2170         do {
2171                 if (buffer_uptodate(bh))
2172                         continue;
2173
2174                 if (!buffer_mapped(bh)) {
2175                         int err = 0;
2176
2177                         fully_mapped = 0;
2178                         if (iblock < lblock) {
2179                                 WARN_ON(bh->b_size != blocksize);
2180                                 err = get_block(inode, iblock, bh, 0);
2181                                 if (err)
2182                                         SetPageError(page);
2183                         }
2184                         if (!buffer_mapped(bh)) {
2185                                 zero_user(page, i * blocksize, blocksize);
2186                                 if (!err)
2187                                         set_buffer_uptodate(bh);
2188                                 continue;
2189                         }
2190                         /*
2191                          * get_block() might have updated the buffer
2192                          * synchronously
2193                          */
2194                         if (buffer_uptodate(bh))
2195                                 continue;
2196                 }
2197                 arr[nr++] = bh;
2198         } while (i++, iblock++, (bh = bh->b_this_page) != head);
2199
2200         if (fully_mapped)
2201                 SetPageMappedToDisk(page);
2202
2203         if (!nr) {
2204                 /*
2205                  * All buffers are uptodate - we can set the page uptodate
2206                  * as well. But not if get_block() returned an error.
2207                  */
2208                 if (!PageError(page))
2209                         SetPageUptodate(page);
2210                 unlock_page(page);
2211                 return 0;
2212         }
2213
2214         /* Stage two: lock the buffers */
2215         for (i = 0; i < nr; i++) {
2216                 bh = arr[i];
2217                 lock_buffer(bh);
2218                 mark_buffer_async_read(bh);
2219         }
2220
2221         /*
2222          * Stage 3: start the IO.  Check for uptodateness
2223          * inside the buffer lock in case another process reading
2224          * the underlying blockdev brought it uptodate (the sct fix).
2225          */
2226         for (i = 0; i < nr; i++) {
2227                 bh = arr[i];
2228                 if (buffer_uptodate(bh))
2229                         end_buffer_async_read(bh, 1);
2230                 else
2231                         submit_bh(READ, bh);
2232         }
2233         return 0;
2234 }
2235 EXPORT_SYMBOL(block_read_full_page);
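
/*
 * Illustrative sketch, not part of the original file: the usual ->readpage
 * for a buffer-backed filesystem simply forwards to block_read_full_page()
 * with the filesystem's block-mapping callback.  myfs_get_block() is a
 * hypothetical get_block_t implementation.
 */
#if 0
static int myfs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, myfs_get_block);
}
#endif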
2236
2237 /* utility function for filesystems that need to do work on expanding
2238  * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2239  * deal with the hole.
2240  */
2241 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2242 {
2243         struct address_space *mapping = inode->i_mapping;
2244         struct page *page;
2245         void *fsdata;
2246         int err;
2247
2248         err = inode_newsize_ok(inode, size);
2249         if (err)
2250                 goto out;
2251
2252         err = pagecache_write_begin(NULL, mapping, size, 0,
2253                                 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2254                                 &page, &fsdata);
2255         if (err)
2256                 goto out;
2257
2258         err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2259         BUG_ON(err > 0);
2260
2261 out:
2262         return err;
2263 }
2264 EXPORT_SYMBOL(generic_cont_expand_simple);
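
/*
 * Illustrative sketch, not part of the original file: generic_cont_expand_simple()
 * is typically called from a filesystem's ->setattr path when the size is
 * being increased, so the new tail is zeroed through the page cache.
 * myfs_setattr() and the surrounding filesystem are assumptions.
 */
#if 0
static int myfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	int err;

	if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) {
		err = generic_cont_expand_simple(inode, attr->ia_size);
		if (err)
			return err;
	}
	/* ... the rest of the hypothetical setattr (setattr_copy() etc.) ... */
	return 0;
}
#endif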
2265
2266 static int cont_expand_zero(struct file *file, struct address_space *mapping,
2267                             loff_t pos, loff_t *bytes)
2268 {
2269         struct inode *inode = mapping->host;
2270         unsigned blocksize = 1 << inode->i_blkbits;
2271         struct page *page;
2272         void *fsdata;
2273         pgoff_t index, curidx;
2274         loff_t curpos;
2275         unsigned zerofrom, offset, len;
2276         int err = 0;
2277
2278         index = pos >> PAGE_CACHE_SHIFT;
2279         offset = pos & ~PAGE_CACHE_MASK;
2280
2281         while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2282                 zerofrom = curpos & ~PAGE_CACHE_MASK;
2283                 if (zerofrom & (blocksize-1)) {
2284                         *bytes |= (blocksize-1);
2285                         (*bytes)++;
2286                 }
2287                 len = PAGE_CACHE_SIZE - zerofrom;
2288
2289                 err = pagecache_write_begin(file, mapping, curpos, len,
2290                                                 AOP_FLAG_UNINTERRUPTIBLE,
2291                                                 &page, &fsdata);
2292                 if (err)
2293                         goto out;
2294                 zero_user(page, zerofrom, len);
2295                 err = pagecache_write_end(file, mapping, curpos, len, len,
2296                                                 page, fsdata);
2297                 if (err < 0)
2298                         goto out;
2299                 BUG_ON(err != len);
2300                 err = 0;
2301
2302                 balance_dirty_pages_ratelimited(mapping);
2303
2304                 if (unlikely(fatal_signal_pending(current))) {
2305                         err = -EINTR;
2306                         goto out;
2307                 }
2308         }
2309
2310         /* page covers the boundary, find the boundary offset */
2311         if (index == curidx) {
2312                 zerofrom = curpos & ~PAGE_CACHE_MASK;
2313                 /* if we expand the file, the last block will be filled */
2314                 if (offset <= zerofrom) {
2315                         goto out;
2316                 }
2317                 if (zerofrom & (blocksize-1)) {
2318                         *bytes |= (blocksize-1);
2319                         (*bytes)++;
2320                 }
2321                 len = offset - zerofrom;
2322
2323                 err = pagecache_write_begin(file, mapping, curpos, len,
2324                                                 AOP_FLAG_UNINTERRUPTIBLE,
2325                                                 &page, &fsdata);
2326                 if (err)
2327                         goto out;
2328                 zero_user(page, zerofrom, len);
2329                 err = pagecache_write_end(file, mapping, curpos, len, len,
2330                                                 page, fsdata);
2331                 if (err < 0)
2332                         goto out;
2333                 BUG_ON(err != len);
2334                 err = 0;
2335         }
2336 out:
2337         return err;
2338 }
2339
2340 /*
2341  * For moronic filesystems that do not allow holes in files.
2342  * We may have to extend the file.
2343  */
2344 int cont_write_begin(struct file *file, struct address_space *mapping,
2345                         loff_t pos, unsigned len, unsigned flags,
2346                         struct page **pagep, void **fsdata,
2347                         get_block_t *get_block, loff_t *bytes)
2348 {
2349         struct inode *inode = mapping->host;
2350         unsigned blocksize = 1 << inode->i_blkbits;
2351         unsigned zerofrom;
2352         int err;
2353
2354         err = cont_expand_zero(file, mapping, pos, bytes);
2355         if (err)
2356                 return err;
2357
2358         zerofrom = *bytes & ~PAGE_CACHE_MASK;
2359         if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2360                 *bytes |= (blocksize-1);
2361                 (*bytes)++;
2362         }
2363
2364         return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2365 }
2366 EXPORT_SYMBOL(cont_write_begin);
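
/*
 * Illustrative sketch, not part of the original file: a hole-less (FAT-like)
 * filesystem would call cont_write_begin() from its ->write_begin, passing a
 * pointer to its per-inode loff_t recording how far the file has been
 * allocated and zeroed.  MYFS_I(), the ->i_alloced field and
 * myfs_get_block() are hypothetical names.
 */
#if 0
static int myfs_cont_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	*pagep = NULL;
	return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block,
				&MYFS_I(mapping->host)->i_alloced);
}
#endif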
2367
2368 int block_commit_write(struct page *page, unsigned from, unsigned to)
2369 {
2370         struct inode *inode = page->mapping->host;
2371         __block_commit_write(inode, page, from, to);
2372         return 0;
2373 }
2374 EXPORT_SYMBOL(block_commit_write);
2375
2376 /*
2377  * block_page_mkwrite() is not allowed to change the file size as it gets
2378  * called from a page fault handler when a page is first dirtied. Hence we must
2379  * be careful to check for EOF conditions here. We set the page up correctly
2380  * for a written page which means we get ENOSPC checking when writing into
2381  * holes and correct delalloc and unwritten extent mapping on filesystems that
2382  * support these features.
2383  *
2384  * We are not allowed to take the i_mutex here so we have to play games to
2385  * protect against truncate races as the page could now be beyond EOF.  Because
2386  * truncate writes the inode size before removing pages, once we have the
2387  * page lock we can determine safely if the page is beyond EOF. If it is not
2388  * beyond EOF, then the page is guaranteed safe against truncation until we
2389  * unlock the page.
2390  *
2391  * Direct callers of this function should protect against filesystem freezing
2392  * using sb_start_write() - sb_end_write() functions.
2393  */
2394 int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2395                          get_block_t get_block)
2396 {
2397         struct page *page = vmf->page;
2398         struct inode *inode = file_inode(vma->vm_file);
2399         unsigned long end;
2400         loff_t size;
2401         int ret;
2402
2403         lock_page(page);
2404         size = i_size_read(inode);
2405         if ((page->mapping != inode->i_mapping) ||
2406             (page_offset(page) > size)) {
2407                 /* We overload EFAULT to mean page got truncated */
2408                 ret = -EFAULT;
2409                 goto out_unlock;
2410         }
2411
2412         /* page is wholly or partially inside EOF */
2413         if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2414                 end = size & ~PAGE_CACHE_MASK;
2415         else
2416                 end = PAGE_CACHE_SIZE;
2417
2418         ret = __block_write_begin(page, 0, end, get_block);
2419         if (!ret)
2420                 ret = block_commit_write(page, 0, end);
2421
2422         if (unlikely(ret < 0))
2423                 goto out_unlock;
2424         set_page_dirty(page);
2425         wait_for_stable_page(page);
2426         return 0;
2427 out_unlock:
2428         unlock_page(page);
2429         return ret;
2430 }
2431 EXPORT_SYMBOL(__block_page_mkwrite);
2432
2433 int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2434                    get_block_t get_block)
2435 {
2436         int ret;
2437         struct super_block *sb = file_inode(vma->vm_file)->i_sb;
2438
2439         sb_start_pagefault(sb);
2440
2441         /*
2442          * Update file times before taking page lock. We may end up failing the
2443          * fault so this update may be superfluous but who really cares...
2444          */
2445         file_update_time(vma->vm_file);
2446
2447         ret = __block_page_mkwrite(vma, vmf, get_block);
2448         sb_end_pagefault(sb);
2449         return block_page_mkwrite_return(ret);
2450 }
2451 EXPORT_SYMBOL(block_page_mkwrite);
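
/*
 * Illustrative sketch, not part of the original file: wiring
 * block_page_mkwrite() into a file's mmap path.  myfs_page_mkwrite(),
 * myfs_get_block() and myfs_file_vm_ops are hypothetical names.
 */
#if 0
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	return block_page_mkwrite(vma, vmf, myfs_get_block);
}

static const struct vm_operations_struct myfs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= myfs_page_mkwrite,
};
#endif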
2452
2453 /*
2454  * nobh_write_begin()'s prereads are special: the buffer_heads are freed
2455  * immediately, while under the page lock.  So it needs a special end_io
2456  * handler which does not touch the bh after unlocking it.
2457  */
2458 static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
2459 {
2460         __end_buffer_read_notouch(bh, uptodate);
2461 }
2462
2463 /*
2464  * Attach the singly-linked list of buffers created by nobh_write_begin, to
2465  * the page (converting it to circular linked list and taking care of page
2466  * dirty races).
2467  */
2468 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2469 {
2470         struct buffer_head *bh;
2471
2472         BUG_ON(!PageLocked(page));
2473
2474         spin_lock(&page->mapping->private_lock);
2475         bh = head;
2476         do {
2477                 if (PageDirty(page))
2478                         set_buffer_dirty(bh);
2479                 if (!bh->b_this_page)
2480                         bh->b_this_page = head;
2481                 bh = bh->b_this_page;
2482         } while (bh != head);
2483         attach_page_buffers(page, head);
2484         spin_unlock(&page->mapping->private_lock);
2485 }
2486
2487 /*
2488  * On entry, the page is fully not uptodate.
2489  * On exit the page is fully uptodate in the areas outside (from,to)
2490  * The filesystem needs to handle block truncation upon failure.
2491  */
2492 int nobh_write_begin(struct address_space *mapping,
2493                         loff_t pos, unsigned len, unsigned flags,
2494                         struct page **pagep, void **fsdata,
2495                         get_block_t *get_block)
2496 {
2497         struct inode *inode = mapping->host;
2498         const unsigned blkbits = inode->i_blkbits;
2499         const unsigned blocksize = 1 << blkbits;
2500         struct buffer_head *head, *bh;
2501         struct page *page;
2502         pgoff_t index;
2503         unsigned from, to;
2504         unsigned block_in_page;
2505         unsigned block_start, block_end;
2506         sector_t block_in_file;
2507         int nr_reads = 0;
2508         int ret = 0;
2509         int is_mapped_to_disk = 1;
2510
2511         index = pos >> PAGE_CACHE_SHIFT;
2512         from = pos & (PAGE_CACHE_SIZE - 1);
2513         to = from + len;
2514
2515         page = grab_cache_page_write_begin(mapping, index, flags);
2516         if (!page)
2517                 return -ENOMEM;
2518         *pagep = page;
2519         *fsdata = NULL;
2520
2521         if (page_has_buffers(page)) {
2522                 ret = __block_write_begin(page, pos, len, get_block);
2523                 if (unlikely(ret))
2524                         goto out_release;
2525                 return ret;
2526         }
2527
2528         if (PageMappedToDisk(page))
2529                 return 0;
2530
2531         /*
2532          * Allocate buffers so that we can keep track of state, and potentially
2533          * attach them to the page if an error occurs. In the common case of
2534          * no error, they will just be freed again without ever being attached
2535          * to the page (which is all OK, because we're under the page lock).
2536          *
2537          * Be careful: the buffer linked list is a NULL terminated one, rather
2538          * than the circular one we're used to.
2539          */
2540         head = alloc_page_buffers(page, blocksize, 0);
2541         if (!head) {
2542                 ret = -ENOMEM;
2543                 goto out_release;
2544         }
2545
2546         block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
2547
2548         /*
2549          * We loop across all blocks in the page, whether or not they are
2550          * part of the affected region.  This is so we can discover if the
2551          * page is fully mapped-to-disk.
2552          */
2553         for (block_start = 0, block_in_page = 0, bh = head;
2554                   block_start < PAGE_CACHE_SIZE;
2555                   block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
2556                 int create;
2557
2558                 block_end = block_start + blocksize;
2559                 bh->b_state = 0;
2560                 create = 1;
2561                 if (block_start >= to)
2562                         create = 0;
2563                 ret = get_block(inode, block_in_file + block_in_page,
2564                                         bh, create);
2565                 if (ret)
2566                         goto failed;
2567                 if (!buffer_mapped(bh))
2568                         is_mapped_to_disk = 0;
2569                 if (buffer_new(bh))
2570                         unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2571                 if (PageUptodate(page)) {
2572                         set_buffer_uptodate(bh);
2573                         continue;
2574                 }
2575                 if (buffer_new(bh) || !buffer_mapped(bh)) {
2576                         zero_user_segments(page, block_start, from,
2577                                                         to, block_end);
2578                         continue;
2579                 }
2580                 if (buffer_uptodate(bh))
2581                         continue;       /* reiserfs does this */
2582                 if (block_start < from || block_end > to) {
2583                         lock_buffer(bh);
2584                         bh->b_end_io = end_buffer_read_nobh;
2585                         submit_bh(READ, bh);
2586                         nr_reads++;
2587                 }
2588         }
2589
2590         if (nr_reads) {
2591                 /*
2592                  * The page is locked, so these buffers are protected from
2593                  * any VM or truncate activity.  Hence we don't need to care
2594                  * for the buffer_head refcounts.
2595                  */
2596                 for (bh = head; bh; bh = bh->b_this_page) {
2597                         wait_on_buffer(bh);
2598                         if (!buffer_uptodate(bh))
2599                                 ret = -EIO;
2600                 }
2601                 if (ret)
2602                         goto failed;
2603         }
2604
2605         if (is_mapped_to_disk)
2606                 SetPageMappedToDisk(page);
2607
2608         *fsdata = head; /* to be released by nobh_write_end */
2609
2610         return 0;
2611
2612 failed:
2613         BUG_ON(!ret);
2614         /*
2615          * Error recovery is a bit difficult. We need to zero out blocks that
2616          * were newly allocated, and dirty them to ensure they get written out.
2617          * Buffers need to be attached to the page at this point, otherwise
2618          * the handling of potential IO errors during writeout would be hard
2619          * (could try doing synchronous writeout, but what if that fails too?)
2620          */
2621         attach_nobh_buffers(page, head);
2622         page_zero_new_buffers(page, from, to);
2623
2624 out_release:
2625         unlock_page(page);
2626         page_cache_release(page);
2627         *pagep = NULL;
2628
2629         return ret;
2630 }
2631 EXPORT_SYMBOL(nobh_write_begin);
2632
2633 int nobh_write_end(struct file *file, struct address_space *mapping,
2634                         loff_t pos, unsigned len, unsigned copied,
2635                         struct page *page, void *fsdata)
2636 {
2637         struct inode *inode = page->mapping->host;
2638         struct buffer_head *head = fsdata;
2639         struct buffer_head *bh;
2640         BUG_ON(fsdata != NULL && page_has_buffers(page));
2641
2642         if (unlikely(copied < len) && head)
2643                 attach_nobh_buffers(page, head);
2644         if (page_has_buffers(page))
2645                 return generic_write_end(file, mapping, pos, len,
2646                                         copied, page, fsdata);
2647
2648         SetPageUptodate(page);
2649         set_page_dirty(page);
2650         if (pos+copied > inode->i_size) {
2651                 i_size_write(inode, pos+copied);
2652                 mark_inode_dirty(inode);
2653         }
2654
2655         unlock_page(page);
2656         page_cache_release(page);
2657
2658         while (head) {
2659                 bh = head;
2660                 head = head->b_this_page;
2661                 free_buffer_head(bh);
2662         }
2663
2664         return copied;
2665 }
2666 EXPORT_SYMBOL(nobh_write_end);
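
/*
 * Illustrative sketch, not part of the original file: the nobh helpers are
 * paired much like block_write_begin()/generic_write_end(), except that
 * nobh_write_end() is installed directly and nobh_write_begin() is wrapped
 * to supply the filesystem's get_block.  The "myfs" names are hypothetical.
 */
#if 0
static int myfs_nobh_write_begin(struct file *file,
				 struct address_space *mapping,
				 loff_t pos, unsigned len, unsigned flags,
				 struct page **pagep, void **fsdata)
{
	return nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block);
}

static const struct address_space_operations myfs_nobh_aops = {
	.write_begin	= myfs_nobh_write_begin,
	.write_end	= nobh_write_end,
};
#endif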
2667
2668 /*
2669  * nobh_writepage() - based on block_write_full_page() except
2670  * that it tries to operate without attaching bufferheads to
2671  * the page.
2672  */
2673 int nobh_writepage(struct page *page, get_block_t *get_block,
2674                         struct writeback_control *wbc)
2675 {
2676         struct inode * const inode = page->mapping->host;
2677         loff_t i_size = i_size_read(inode);
2678         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2679         unsigned offset;
2680         int ret;
2681
2682         /* Is the page fully inside i_size? */
2683         if (page->index < end_index)
2684                 goto out;
2685
2686         /* Is the page fully outside i_size? (truncate in progress) */
2687         offset = i_size & (PAGE_CACHE_SIZE-1);
2688         if (page->index >= end_index+1 || !offset) {
2689                 /*
2690                  * The page may have dirty, unmapped buffers.  For example,
2691                  * they may have been added in ext3_writepage().  Make them
2692                  * freeable here, so the page does not leak.
2693                  */
2694 #if 0
2695                 /* Not really sure about this  - do we need this ? */
2696                 if (page->mapping->a_ops->invalidatepage)
2697                         page->mapping->a_ops->invalidatepage(page, offset);
2698 #endif
2699                 unlock_page(page);
2700                 return 0; /* don't care */
2701         }
2702
2703         /*
2704          * The page straddles i_size.  It must be zeroed out on each and every
2705          * writepage invocation because it may be mmapped.  "A file is mapped
2706          * in multiples of the page size.  For a file that is not a multiple of
2707          * the  page size, the remaining memory is zeroed when mapped, and
2708          * writes to that region are not written out to the file."
2709          */
2710         zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2711 out:
2712         ret = mpage_writepage(page, get_block, wbc);
2713         if (ret == -EAGAIN)
2714                 ret = __block_write_full_page(inode, page, get_block, wbc,
2715                                               end_buffer_async_write);
2716         return ret;
2717 }
2718 EXPORT_SYMBOL(nobh_writepage);
2719
2720 int nobh_truncate_page(struct address_space *mapping,
2721                         loff_t from, get_block_t *get_block)
2722 {
2723         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2724         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2725         unsigned blocksize;
2726         sector_t iblock;
2727         unsigned length, pos;
2728         struct inode *inode = mapping->host;
2729         struct page *page;
2730         struct buffer_head map_bh;
2731         int err;
2732
2733         blocksize = 1 << inode->i_blkbits;
2734         length = offset & (blocksize - 1);
2735
2736         /* Block boundary? Nothing to do */
2737         if (!length)
2738                 return 0;
2739
2740         length = blocksize - length;
2741         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2742
2743         page = grab_cache_page(mapping, index);
2744         err = -ENOMEM;
2745         if (!page)
2746                 goto out;
2747
2748         if (page_has_buffers(page)) {
2749 has_buffers:
2750                 unlock_page(page);
2751                 page_cache_release(page);
2752                 return block_truncate_page(mapping, from, get_block);
2753         }
2754
2755         /* Find the buffer that contains "offset" */
2756         pos = blocksize;
2757         while (offset >= pos) {
2758                 iblock++;
2759                 pos += blocksize;
2760         }
2761
2762         map_bh.b_size = blocksize;
2763         map_bh.b_state = 0;
2764         err = get_block(inode, iblock, &map_bh, 0);
2765         if (err)
2766                 goto unlock;
2767         /* unmapped? It's a hole - nothing to do */
2768         if (!buffer_mapped(&map_bh))
2769                 goto unlock;
2770
2771         /* Ok, it's mapped. Make sure it's up-to-date */
2772         if (!PageUptodate(page)) {
2773                 err = mapping->a_ops->readpage(NULL, page);
2774                 if (err) {
2775                         page_cache_release(page);
2776                         goto out;
2777                 }
2778                 lock_page(page);
2779                 if (!PageUptodate(page)) {
2780                         err = -EIO;
2781                         goto unlock;
2782                 }
2783                 if (page_has_buffers(page))
2784                         goto has_buffers;
2785         }
2786         zero_user(page, offset, length);
2787         set_page_dirty(page);
2788         err = 0;
2789
2790 unlock:
2791         unlock_page(page);
2792         page_cache_release(page);
2793 out:
2794         return err;
2795 }
2796 EXPORT_SYMBOL(nobh_truncate_page);
2797
2798 int block_truncate_page(struct address_space *mapping,
2799                         loff_t from, get_block_t *get_block)
2800 {
2801         pgoff_t index = from >> PAGE_CACHE_SHIFT;
2802         unsigned offset = from & (PAGE_CACHE_SIZE-1);
2803         unsigned blocksize;
2804         sector_t iblock;
2805         unsigned length, pos;
2806         struct inode *inode = mapping->host;
2807         struct page *page;
2808         struct buffer_head *bh;
2809         int err;
2810
2811         blocksize = 1 << inode->i_blkbits;
2812         length = offset & (blocksize - 1);
2813
2814         /* Block boundary? Nothing to do */
2815         if (!length)
2816                 return 0;
2817
2818         length = blocksize - length;
2819         iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2820         
2821         page = grab_cache_page(mapping, index);
2822         err = -ENOMEM;
2823         if (!page)
2824                 goto out;
2825
2826         if (!page_has_buffers(page))
2827                 create_empty_buffers(page, blocksize, 0);
2828
2829         /* Find the buffer that contains "offset" */
2830         bh = page_buffers(page);
2831         pos = blocksize;
2832         while (offset >= pos) {
2833                 bh = bh->b_this_page;
2834                 iblock++;
2835                 pos += blocksize;
2836         }
2837
2838         err = 0;
2839         if (!buffer_mapped(bh)) {
2840                 WARN_ON(bh->b_size != blocksize);
2841                 err = get_block(inode, iblock, bh, 0);
2842                 if (err)
2843                         goto unlock;
2844                 /* unmapped? It's a hole - nothing to do */
2845                 if (!buffer_mapped(bh))
2846                         goto unlock;
2847         }
2848
2849         /* Ok, it's mapped. Make sure it's up-to-date */
2850         if (PageUptodate(page))
2851                 set_buffer_uptodate(bh);
2852
2853         if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
2854                 err = -EIO;
2855                 ll_rw_block(READ, 1, &bh);
2856                 wait_on_buffer(bh);
2857                 /* Uhhuh. Read error. Complain and punt. */
2858                 if (!buffer_uptodate(bh))
2859                         goto unlock;
2860         }
2861
2862         zero_user(page, offset, length);
2863         mark_buffer_dirty(bh);
2864         err = 0;
2865
2866 unlock:
2867         unlock_page(page);
2868         page_cache_release(page);
2869 out:
2870         return err;
2871 }
2872 EXPORT_SYMBOL(block_truncate_page);
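
/*
 * Illustrative sketch, not part of the original file: block_truncate_page()
 * is normally called from the filesystem's truncate path to zero the tail of
 * the last (partial) block before the on-disk blocks beyond the new size are
 * freed.  myfs_truncate() and myfs_get_block() are hypothetical.
 */
#if 0
static int myfs_truncate(struct inode *inode, loff_t newsize)
{
	int err;

	err = block_truncate_page(inode->i_mapping, newsize, myfs_get_block);
	if (err)
		return err;
	truncate_setsize(inode, newsize);
	/* ... free the filesystem's on-disk blocks beyond newsize here ... */
	return 0;
}
#endif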
2873
2874 /*
2875  * The generic ->writepage function for buffer-backed address_spaces
2876  */
2877 int block_write_full_page(struct page *page, get_block_t *get_block,
2878                         struct writeback_control *wbc)
2879 {
2880         struct inode * const inode = page->mapping->host;
2881         loff_t i_size = i_size_read(inode);
2882         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2883         unsigned offset;
2884
2885         /* Is the page fully inside i_size? */
2886         if (page->index < end_index)
2887                 return __block_write_full_page(inode, page, get_block, wbc,
2888                                                end_buffer_async_write);
2889
2890         /* Is the page fully outside i_size? (truncate in progress) */
2891         offset = i_size & (PAGE_CACHE_SIZE-1);
2892         if (page->index >= end_index+1 || !offset) {
2893                 /*
2894                  * The page may have dirty, unmapped buffers.  For example,
2895                  * they may have been added in ext3_writepage().  Make them
2896                  * freeable here, so the page does not leak.
2897                  */
2898                 do_invalidatepage(page, 0, PAGE_CACHE_SIZE);
2899                 unlock_page(page);
2900                 return 0; /* don't care */
2901         }
2902
2903         /*
2904          * The page straddles i_size.  It must be zeroed out on each and every
2905          * writepage invocation because it may be mmapped.  "A file is mapped
2906          * in multiples of the page size.  For a file that is not a multiple of
2907          * the  page size, the remaining memory is zeroed when mapped, and
2908          * writes to that region are not written out to the file."
2909          */
2910         zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2911         return __block_write_full_page(inode, page, get_block, wbc,
2912                                                         end_buffer_async_write);
2913 }
2914 EXPORT_SYMBOL(block_write_full_page);
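
/*
 * Illustrative sketch, not part of the original file: the matching
 * ->writepage for a buffer-backed filesystem just forwards to
 * block_write_full_page() with its get_block callback; myfs_get_block() is
 * hypothetical.
 */
#if 0
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}
#endif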
2915
2916 sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2917                             get_block_t *get_block)
2918 {
2919         struct buffer_head tmp;
2920         struct inode *inode = mapping->host;
2921         tmp.b_state = 0;
2922         tmp.b_blocknr = 0;
2923         tmp.b_size = 1 << inode->i_blkbits;
2924         get_block(inode, block, &tmp, 0);
2925         return tmp.b_blocknr;
2926 }
2927 EXPORT_SYMBOL(generic_block_bmap);
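
/*
 * Illustrative sketch, not part of the original file: ->bmap is usually a
 * one-line wrapper around generic_block_bmap(); myfs_get_block() is again a
 * hypothetical get_block_t.
 */
#if 0
static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}
#endif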
2928
2929 static void end_bio_bh_io_sync(struct bio *bio, int err)
2930 {
2931         struct buffer_head *bh = bio->bi_private;
2932
2933         if (err == -EOPNOTSUPP) {
2934                 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2935         }
2936
2937         if (unlikely(test_bit(BIO_QUIET, &bio->bi_flags)))
2938                 set_bit(BH_Quiet, &bh->b_state);
2939
2940         bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2941         bio_put(bio);
2942 }
2943
2944 /*
2945  * This allows us to do IO even on the odd last sectors
2946  * of a device, even if the block size is some multiple
2947  * of the physical sector size.
2948  *
2949  * We'll just truncate the bio to the size of the device,
2950  * and clear the end of the buffer head manually.
2951  *
2952  * Truly out-of-range accesses will turn into actual IO
2953  * errors; this only handles the "we need to be able to
2954  * do IO at the final sector" case.
2955  */
2956 void guard_bio_eod(int rw, struct bio *bio)
2957 {
2958         sector_t maxsector;
2959         struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
2960         unsigned truncated_bytes;
2961
2962         maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
2963         if (!maxsector)
2964                 return;
2965
2966         /*
2967          * If the *whole* IO is past the end of the device,
2968          * let it through, and the IO layer will turn it into
2969          * an EIO.
2970          */
2971         if (unlikely(bio->bi_iter.bi_sector >= maxsector))
2972                 return;
2973
2974         maxsector -= bio->bi_iter.bi_sector;
2975         if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
2976                 return;
2977
2978         /* Uhhuh. We've got a bio that straddles the device size! */
2979         truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
2980
2981         /* Truncate the bio.. */
2982         bio->bi_iter.bi_size -= truncated_bytes;
2983         bvec->bv_len -= truncated_bytes;
2984
2985         /* ..and clear the end of the buffer for reads */
2986         if ((rw & RW_MASK) == READ) {
2987                 zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
2988                                 truncated_bytes);
2989         }
2990 }
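
/*
 * Worked example (hypothetical numbers): a device with a 4KB block size
 * reports 8191 512-byte sectors, so maxsector starts out as 8191.  A read
 * of block 1023 arrives as a bio with bi_sector = 1023 * 8 = 8184 and
 * bi_size = 4096 (8 sectors).  Only 8191 - 8184 = 7 sectors remain on the
 * device, so truncated_bytes = 4096 - 7 * 512 = 512: the bio shrinks to
 * 3584 bytes and, because this is a read, the final 512 bytes of the
 * buffer are zeroed instead of being read from beyond the end of the
 * device.
 */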
2991
2992 int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
2993 {
2994         struct bio *bio;
2995         int ret = 0;
2996
2997         BUG_ON(!buffer_locked(bh));
2998         BUG_ON(!buffer_mapped(bh));
2999         BUG_ON(!bh->b_end_io);
3000         BUG_ON(buffer_delay(bh));
3001         BUG_ON(buffer_unwritten(bh));
3002
3003         /*
3004          * Only clear out a write error when rewriting
3005          */
3006         if (test_set_buffer_req(bh) && (rw & WRITE))
3007                 clear_buffer_write_io_error(bh);
3008
3009         /*
3010          * from here on down, it's all bio -- do the initial mapping,
3011          * submit_bio -> generic_make_request may further map this bio around
3012          */
3013         bio = bio_alloc(GFP_NOIO, 1);
3014
3015         bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
3016         bio->bi_bdev = bh->b_bdev;
3017         bio->bi_io_vec[0].bv_page = bh->b_page;
3018         bio->bi_io_vec[0].bv_len = bh->b_size;
3019         bio->bi_io_vec[0].bv_offset = bh_offset(bh);
3020
3021         bio->bi_vcnt = 1;
3022         bio->bi_iter.bi_size = bh->b_size;
3023
3024         bio->bi_end_io = end_bio_bh_io_sync;
3025         bio->bi_private = bh;
3026         bio->bi_flags |= bio_flags;
3027
3028         /* Take care of bh's that straddle the end of the device */
3029         guard_bio_eod(rw, bio);
3030
3031         if (buffer_meta(bh))
3032                 rw |= REQ_META;
3033         if (buffer_prio(bh))
3034                 rw |= REQ_PRIO;
3035
3036         bio_get(bio);
3037         submit_bio(rw, bio);
3038
3039         if (bio_flagged(bio, BIO_EOPNOTSUPP))
3040                 ret = -EOPNOTSUPP;
3041
3042         bio_put(bio);
3043         return ret;
3044 }
3045 EXPORT_SYMBOL_GPL(_submit_bh);
3046
3047 int submit_bh(int rw, struct buffer_head *bh)
3048 {
3049         return _submit_bh(rw, bh, 0);
3050 }
3051 EXPORT_SYMBOL(submit_bh);
3052
3053 /**
3054  * ll_rw_block: low-level access to block devices (DEPRECATED)
3055  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
3056  * @nr: number of &struct buffer_heads in the array
3057  * @bhs: array of pointers to &struct buffer_head
3058  *
3059  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
3060  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
3061  * %READA option is described in the documentation for generic_make_request()
3062  * which ll_rw_block() calls.
3063  *
3064  * This function drops any buffer that it cannot get a lock on (with the
3065  * BH_Lock state bit), any buffer that appears to be clean when doing a write
3066  * request, and any buffer that appears to be up-to-date when doing a read
3067  * request.  Further, it marks as clean the buffers that are processed for
3068  * writing (the buffer cache won't assume that they are actually clean
3069  * until the buffer gets unlocked).
3070  *
3071  * ll_rw_block sets b_end_io to a simple completion handler that marks
3072  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
3073  * any waiters.
3074  *
3075  * All of the buffers must be for the same device, and must also be a
3076  * multiple of the current approved size for the device.
3077  */
3078 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
3079 {
3080         int i;
3081
3082         for (i = 0; i < nr; i++) {
3083                 struct buffer_head *bh = bhs[i];
3084
3085                 if (!trylock_buffer(bh))
3086                         continue;
3087                 if (rw == WRITE) {
3088                         if (test_clear_buffer_dirty(bh)) {
3089                                 bh->b_end_io = end_buffer_write_sync;
3090                                 get_bh(bh);
3091                                 submit_bh(WRITE, bh);
3092                                 continue;
3093                         }
3094                 } else {
3095                         if (!buffer_uptodate(bh)) {
3096                                 bh->b_end_io = end_buffer_read_sync;
3097                                 get_bh(bh);
3098                                 submit_bh(rw, bh);
3099                                 continue;
3100                         }
3101                 }
3102                 unlock_buffer(bh);
3103         }
3104 }
3105 EXPORT_SYMBOL(ll_rw_block);
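
/*
 * A minimal sketch of the usual calling pattern for the helper above:
 * kick off the reads as a batch, then wait on each buffer and check the
 * result.  The function name and the caller-supplied array are
 * illustrative only.
 */
static int example_read_block_batch(struct buffer_head *bhs[], int nr)
{
        int i;
        int err = 0;

        ll_rw_block(READ, nr, bhs);
        for (i = 0; i < nr; i++) {
                wait_on_buffer(bhs[i]);
                if (!buffer_uptodate(bhs[i]))
                        err = -EIO;
        }
        return err;
}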
3106
3107 void write_dirty_buffer(struct buffer_head *bh, int rw)
3108 {
3109         lock_buffer(bh);
3110         if (!test_clear_buffer_dirty(bh)) {
3111                 unlock_buffer(bh);
3112                 return;
3113         }
3114         bh->b_end_io = end_buffer_write_sync;
3115         get_bh(bh);
3116         submit_bh(rw, bh);
3117 }
3118 EXPORT_SYMBOL(write_dirty_buffer);
3119
3120 /*
3121  * For a data-integrity writeout, we need to wait upon any in-progress I/O
3122  * and then start new I/O and then wait upon it.  The caller must have a ref on
3123  * the buffer_head.
3124  */
3125 int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3126 {
3127         int ret = 0;
3128
3129         WARN_ON(atomic_read(&bh->b_count) < 1);
3130         lock_buffer(bh);
3131         if (test_clear_buffer_dirty(bh)) {
3132                 get_bh(bh);
3133                 bh->b_end_io = end_buffer_write_sync;
3134                 ret = submit_bh(rw, bh);
3135                 wait_on_buffer(bh);
3136                 if (!ret && !buffer_uptodate(bh))
3137                         ret = -EIO;
3138         } else {
3139                 unlock_buffer(bh);
3140         }
3141         return ret;
3142 }
3143 EXPORT_SYMBOL(__sync_dirty_buffer);
3144
3145 int sync_dirty_buffer(struct buffer_head *bh)
3146 {
3147         return __sync_dirty_buffer(bh, WRITE_SYNC);
3148 }
3149 EXPORT_SYMBOL(sync_dirty_buffer);
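
/*
 * A minimal sketch of a data-integrity update of a single metadata block:
 * modify the buffer, dirty it, then force it to disk and propagate any
 * write error.  As the comment above requires, the (hypothetical) caller
 * is assumed to hold a reference on bh.
 */
static int example_update_and_sync(struct buffer_head *bh)
{
        lock_buffer(bh);
        /* ... modify bh->b_data here ... */
        set_buffer_uptodate(bh);
        unlock_buffer(bh);

        mark_buffer_dirty(bh);
        return sync_dirty_buffer(bh);   /* 0 on success, -EIO on write error */
}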
3150
3151 /*
3152  * try_to_free_buffers() checks if all the buffers on this particular page
3153  * are unused, and releases them if so.
3154  *
3155  * Exclusion against try_to_free_buffers may be obtained by either
3156  * locking the page or by holding its mapping's private_lock.
3157  *
3158  * If the page is dirty but all the buffers are clean then we need to
3159  * be sure to mark the page clean as well.  This is because the page
3160  * may be against a block device, and a later reattachment of buffers
3161  * to a dirty page will set *all* buffers dirty, which would corrupt
3162  * filesystem data on the same device.
3163  *
3164  * The same applies to regular filesystem pages: if all the buffers are
3165  * clean then we set the page clean and proceed.  To do that, we require
3166  * total exclusion from __set_page_dirty_buffers().  That is obtained with
3167  * private_lock.
3168  *
3169  * try_to_free_buffers() is non-blocking.
3170  */
3171 static inline int buffer_busy(struct buffer_head *bh)
3172 {
3173         return atomic_read(&bh->b_count) |
3174                 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3175 }
3176
3177 static int
3178 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3179 {
3180         struct buffer_head *head = page_buffers(page);
3181         struct buffer_head *bh;
3182
3183         bh = head;
3184         do {
3185                 if (buffer_write_io_error(bh) && page->mapping)
3186                         set_bit(AS_EIO, &page->mapping->flags);
3187                 if (buffer_busy(bh))
3188                         goto failed;
3189                 bh = bh->b_this_page;
3190         } while (bh != head);
3191
3192         do {
3193                 struct buffer_head *next = bh->b_this_page;
3194
3195                 if (bh->b_assoc_map)
3196                         __remove_assoc_queue(bh);
3197                 bh = next;
3198         } while (bh != head);
3199         *buffers_to_free = head;
3200         __clear_page_buffers(page);
3201         return 1;
3202 failed:
3203         return 0;
3204 }
3205
3206 int try_to_free_buffers(struct page *page)
3207 {
3208         struct address_space * const mapping = page->mapping;
3209         struct buffer_head *buffers_to_free = NULL;
3210         int ret = 0;
3211
3212         BUG_ON(!PageLocked(page));
3213         if (PageWriteback(page))
3214                 return 0;
3215
3216         if (mapping == NULL) {          /* can this still happen? */
3217                 ret = drop_buffers(page, &buffers_to_free);
3218                 goto out;
3219         }
3220
3221         spin_lock(&mapping->private_lock);
3222         ret = drop_buffers(page, &buffers_to_free);
3223
3224         /*
3225          * If the filesystem writes its buffers by hand (eg ext3)
3226          * then we can have clean buffers against a dirty page.  We
3227          * clean the page here; otherwise the VM will never notice
3228          * that the filesystem did any IO at all.
3229          *
3230          * Also, during truncate, discard_buffer will have marked all
3231          * the page's buffers clean.  We discover that here and clean
3232          * the page also.
3233          *
3234          * private_lock must be held over this entire operation in order
3235          * to synchronise against __set_page_dirty_buffers and prevent the
3236          * dirty bit from being lost.
3237          */
3238         if (ret && TestClearPageDirty(page))
3239                 account_page_cleaned(page, mapping);
3240         spin_unlock(&mapping->private_lock);
3241 out:
3242         if (buffers_to_free) {
3243                 struct buffer_head *bh = buffers_to_free;
3244
3245                 do {
3246                         struct buffer_head *next = bh->b_this_page;
3247                         free_buffer_head(bh);
3248                         bh = next;
3249                 } while (bh != buffers_to_free);
3250         }
3251         return ret;
3252 }
3253 EXPORT_SYMBOL(try_to_free_buffers);
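
/*
 * A minimal sketch of a ->releasepage implementation built on the helper
 * above; "myfs" is hypothetical.  The VM invokes ->releasepage with the
 * page locked, which provides the exclusion that try_to_free_buffers()
 * documents.
 */
static int myfs_releasepage(struct page *page, gfp_t gfp_mask)
{
        /* a real filesystem might first refuse pages it still needs */
        return try_to_free_buffers(page);
}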
3254
3255 /*
3256  * There are no bdflush tunables left.  But distributions are
3257  * still running obsolete flush daemons, so we terminate them here.
3258  *
3259  * Use of bdflush() is deprecated and will be removed in a future kernel.
3260  * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3261  */
3262 SYSCALL_DEFINE2(bdflush, int, func, long, data)
3263 {
3264         static int msg_count;
3265
3266         if (!capable(CAP_SYS_ADMIN))
3267                 return -EPERM;
3268
3269         if (msg_count < 5) {
3270                 msg_count++;
3271                 printk(KERN_INFO
3272                         "warning: process `%s' used the obsolete bdflush"
3273                         " system call\n", current->comm);
3274                 printk(KERN_INFO "Fix your initscripts?\n");
3275         }
3276
3277         if (func == 1)
3278                 do_exit(0);
3279         return 0;
3280 }
3281
3282 /*
3283  * Buffer-head allocation
3284  */
3285 static struct kmem_cache *bh_cachep __read_mostly;
3286
3287 /*
3288  * Once the number of bh's in the machine exceeds this level, we start
3289  * stripping them in writeback.
3290  */
3291 static unsigned long max_buffer_heads;
3292
3293 int buffer_heads_over_limit;
3294
3295 struct bh_accounting {
3296         int nr;                 /* Number of live bh's */
3297         int ratelimit;          /* Limit cacheline bouncing */
3298 };
3299
3300 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3301
3302 static void recalc_bh_state(void)
3303 {
3304         int i;
3305         int tot = 0;
3306
3307         if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3308                 return;
3309         __this_cpu_write(bh_accounting.ratelimit, 0);
3310         for_each_online_cpu(i)
3311                 tot += per_cpu(bh_accounting, i).nr;
3312         buffer_heads_over_limit = (tot > max_buffer_heads);
3313 }
3314
3315 struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
3316 {
3317         struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
3318         if (ret) {
3319                 INIT_LIST_HEAD(&ret->b_assoc_buffers);
3320                 buffer_head_init_locks(ret);
3321                 preempt_disable();
3322                 __this_cpu_inc(bh_accounting.nr);
3323                 recalc_bh_state();
3324                 preempt_enable();
3325         }
3326         return ret;
3327 }
3328 EXPORT_SYMBOL(alloc_buffer_head);
3329
3330 void free_buffer_head(struct buffer_head *bh)
3331 {
3332         BUG_ON(!list_empty(&bh->b_assoc_buffers));
3333         kmem_cache_free(bh_cachep, bh);
3334         preempt_disable();
3335         __this_cpu_dec(bh_accounting.nr);
3336         recalc_bh_state();
3337         preempt_enable();
3338 }
3339 EXPORT_SYMBOL(free_buffer_head);
3340
3341 static void buffer_exit_cpu(int cpu)
3342 {
3343         int i;
3344         struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3345
3346         for (i = 0; i < BH_LRU_SIZE; i++) {
3347                 brelse(b->bhs[i]);
3348                 b->bhs[i] = NULL;
3349         }
3350         this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3351         per_cpu(bh_accounting, cpu).nr = 0;
3352 }
3353
3354 static int buffer_cpu_notify(struct notifier_block *self,
3355                               unsigned long action, void *hcpu)
3356 {
3357         if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3358                 buffer_exit_cpu((unsigned long)hcpu);
3359         return NOTIFY_OK;
3360 }
3361
3362 /**
3363  * bh_uptodate_or_lock - Test whether the buffer is uptodate
3364  * @bh: struct buffer_head
3365  *
3366  * Return true if the buffer is up-to-date; otherwise return
3367  * false with the buffer locked.
3368  */
3369 int bh_uptodate_or_lock(struct buffer_head *bh)
3370 {
3371         if (!buffer_uptodate(bh)) {
3372                 lock_buffer(bh);
3373                 if (!buffer_uptodate(bh))
3374                         return 0;
3375                 unlock_buffer(bh);
3376         }
3377         return 1;
3378 }
3379 EXPORT_SYMBOL(bh_uptodate_or_lock);
3380
3381 /**
3382  * bh_submit_read - Submit a locked buffer for reading
3383  * @bh: struct buffer_head
3384  *
3385  * Returns zero on success and -EIO on error.
3386  */
3387 int bh_submit_read(struct buffer_head *bh)
3388 {
3389         BUG_ON(!buffer_locked(bh));
3390
3391         if (buffer_uptodate(bh)) {
3392                 unlock_buffer(bh);
3393                 return 0;
3394         }
3395
3396         get_bh(bh);
3397         bh->b_end_io = end_buffer_read_sync;
3398         submit_bh(READ, bh);
3399         wait_on_buffer(bh);
3400         if (buffer_uptodate(bh))
3401                 return 0;
3402         return -EIO;
3403 }
3404 EXPORT_SYMBOL(bh_submit_read);
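
/*
 * A minimal sketch combining the two helpers above: the common pattern for
 * reading a metadata buffer only when it is not already up to date (the
 * function name is illustrative).
 */
static int example_read_bh_if_needed(struct buffer_head *bh)
{
        if (bh_uptodate_or_lock(bh))
                return 0;               /* already up to date */

        /* buffer is now locked and stale: read it and wait for completion */
        return bh_submit_read(bh);      /* 0 on success, -EIO on error */
}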
3405
3406 void __init buffer_init(void)
3407 {
3408         unsigned long nrpages;
3409
3410         bh_cachep = kmem_cache_create("buffer_head",
3411                         sizeof(struct buffer_head), 0,
3412                                 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3413                                 SLAB_MEM_SPREAD),
3414                                 NULL);
3415
3416         /*
3417          * Limit the bh occupancy to 10% of ZONE_NORMAL
3418          */
3419         nrpages = (nr_free_buffer_pages() * 10) / 100;
3420         max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3421         hotcpu_notifier(buffer_cpu_notify, 0);
3422 }
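
/*
 * Worked example (hypothetical numbers): if nr_free_buffer_pages() reports
 * 1,000,000 4KB pages, nrpages comes to 100,000; with a struct buffer_head
 * of roughly 100 bytes, each page holds about 40 of them, so
 * max_buffer_heads lands on the order of four million before
 * buffer_heads_over_limit trips in recalc_bh_state().
 */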