Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / fs / xfs / xfs_extent_busy.c
1 /*
2  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3  * Copyright (c) 2010 David Chinner.
4  * Copyright (c) 2011 Christoph Hellwig.
5  * All Rights Reserved.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License as
9  * published by the Free Software Foundation.
10  *
11  * This program is distributed in the hope that it would be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write the Free Software Foundation,
18  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
19  */
20 #include "xfs.h"
21 #include "xfs_fs.h"
22 #include "xfs_format.h"
23 #include "xfs_log_format.h"
24 #include "xfs_shared.h"
25 #include "xfs_trans_resv.h"
26 #include "xfs_sb.h"
27 #include "xfs_mount.h"
28 #include "xfs_alloc.h"
29 #include "xfs_extent_busy.h"
30 #include "xfs_trace.h"
31 #include "xfs_trans.h"
32 #include "xfs_log.h"
33
34 void
35 xfs_extent_busy_insert(
36         struct xfs_trans        *tp,
37         xfs_agnumber_t          agno,
38         xfs_agblock_t           bno,
39         xfs_extlen_t            len,
40         unsigned int            flags)
41 {
42         struct xfs_extent_busy  *new;
43         struct xfs_extent_busy  *busyp;
44         struct xfs_perag        *pag;
45         struct rb_node          **rbp;
46         struct rb_node          *parent = NULL;
47
48         new = kmem_zalloc(sizeof(struct xfs_extent_busy), KM_MAYFAIL);
49         if (!new) {
50                 /*
51                  * No Memory!  Since it is now not possible to track the free
52                  * block, make this a synchronous transaction to insure that
53                  * the block is not reused before this transaction commits.
54                  */
55                 trace_xfs_extent_busy_enomem(tp->t_mountp, agno, bno, len);
56                 xfs_trans_set_sync(tp);
57                 return;
58         }
59
60         new->agno = agno;
61         new->bno = bno;
62         new->length = len;
63         INIT_LIST_HEAD(&new->list);
64         new->flags = flags;
65
66         /* trace before insert to be able to see failed inserts */
67         trace_xfs_extent_busy(tp->t_mountp, agno, bno, len);
68
69         pag = xfs_perag_get(tp->t_mountp, new->agno);
70         spin_lock(&pag->pagb_lock);
71         rbp = &pag->pagb_tree.rb_node;
72         while (*rbp) {
73                 parent = *rbp;
74                 busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);
75
76                 if (new->bno < busyp->bno) {
77                         rbp = &(*rbp)->rb_left;
78                         ASSERT(new->bno + new->length <= busyp->bno);
79                 } else if (new->bno > busyp->bno) {
80                         rbp = &(*rbp)->rb_right;
81                         ASSERT(bno >= busyp->bno + busyp->length);
82                 } else {
83                         ASSERT(0);
84                 }
85         }
86
87         rb_link_node(&new->rb_node, parent, rbp);
88         rb_insert_color(&new->rb_node, &pag->pagb_tree);
89
90         list_add(&new->list, &tp->t_busy);
91         spin_unlock(&pag->pagb_lock);
92         xfs_perag_put(pag);
93 }
94
95 /*
96  * Search for a busy extent within the range of the extent we are about to
97  * allocate.  You need to be holding the busy extent tree lock when calling
98  * xfs_extent_busy_search(). This function returns 0 for no overlapping busy
99  * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
100  * match. This is done so that a non-zero return indicates an overlap that
101  * will require a synchronous transaction, but it can still be
102  * used to distinguish between a partial or exact match.
103  */
104 int
105 xfs_extent_busy_search(
106         struct xfs_mount        *mp,
107         xfs_agnumber_t          agno,
108         xfs_agblock_t           bno,
109         xfs_extlen_t            len)
110 {
111         struct xfs_perag        *pag;
112         struct rb_node          *rbp;
113         struct xfs_extent_busy  *busyp;
114         int                     match = 0;
115
116         pag = xfs_perag_get(mp, agno);
117         spin_lock(&pag->pagb_lock);
118
119         rbp = pag->pagb_tree.rb_node;
120
121         /* find closest start bno overlap */
122         while (rbp) {
123                 busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
124                 if (bno < busyp->bno) {
125                         /* may overlap, but exact start block is lower */
126                         if (bno + len > busyp->bno)
127                                 match = -1;
128                         rbp = rbp->rb_left;
129                 } else if (bno > busyp->bno) {
130                         /* may overlap, but exact start block is higher */
131                         if (bno < busyp->bno + busyp->length)
132                                 match = -1;
133                         rbp = rbp->rb_right;
134                 } else {
135                         /* bno matches busyp, length determines exact match */
136                         match = (busyp->length == len) ? 1 : -1;
137                         break;
138                 }
139         }
140         spin_unlock(&pag->pagb_lock);
141         xfs_perag_put(pag);
142         return match;
143 }
144
145 /*
146  * The found free extent [fbno, fend] overlaps part or all of the given busy
147  * extent.  If the overlap covers the beginning, the end, or all of the busy
148  * extent, the overlapping portion can be made unbusy and used for the
149  * allocation.  We can't split a busy extent because we can't modify a
150  * transaction/CIL context busy list, but we can update an entry's block
151  * number or length.
152  *
153  * Returns true if the extent can safely be reused, or false if the search
154  * needs to be restarted.
155  */
156 STATIC bool
157 xfs_extent_busy_update_extent(
158         struct xfs_mount        *mp,
159         struct xfs_perag        *pag,
160         struct xfs_extent_busy  *busyp,
161         xfs_agblock_t           fbno,
162         xfs_extlen_t            flen,
163         bool                    userdata) __releases(&pag->pagb_lock)
164                                           __acquires(&pag->pagb_lock)
165 {
166         xfs_agblock_t           fend = fbno + flen;
167         xfs_agblock_t           bbno = busyp->bno;
168         xfs_agblock_t           bend = bbno + busyp->length;
169
170         /*
171          * This extent is currently being discarded.  Give the thread
172          * performing the discard a chance to mark the extent unbusy
173          * and retry.
174          */
175         if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
176                 spin_unlock(&pag->pagb_lock);
177                 delay(1);
178                 spin_lock(&pag->pagb_lock);
179                 return false;
180         }
181
182         /*
183          * If there is a busy extent overlapping a user allocation, we have
184          * no choice but to force the log and retry the search.
185          *
186          * Fortunately this does not happen during normal operation, but
187          * only if the filesystem is very low on space and has to dip into
188          * the AGFL for normal allocations.
189          */
190         if (userdata)
191                 goto out_force_log;
192
193         if (bbno < fbno && bend > fend) {
194                 /*
195                  * Case 1:
196                  *    bbno           bend
197                  *    +BBBBBBBBBBBBBBBBB+
198                  *        +---------+
199                  *        fbno   fend
200                  */
201
202                 /*
203                  * We would have to split the busy extent to be able to track
204                  * it correct, which we cannot do because we would have to
205                  * modify the list of busy extents attached to the transaction
206                  * or CIL context, which is immutable.
207                  *
208                  * Force out the log to clear the busy extent and retry the
209                  * search.
210                  */
211                 goto out_force_log;
212         } else if (bbno >= fbno && bend <= fend) {
213                 /*
214                  * Case 2:
215                  *    bbno           bend
216                  *    +BBBBBBBBBBBBBBBBB+
217                  *    +-----------------+
218                  *    fbno           fend
219                  *
220                  * Case 3:
221                  *    bbno           bend
222                  *    +BBBBBBBBBBBBBBBBB+
223                  *    +--------------------------+
224                  *    fbno                    fend
225                  *
226                  * Case 4:
227                  *             bbno           bend
228                  *             +BBBBBBBBBBBBBBBBB+
229                  *    +--------------------------+
230                  *    fbno                    fend
231                  *
232                  * Case 5:
233                  *             bbno           bend
234                  *             +BBBBBBBBBBBBBBBBB+
235                  *    +-----------------------------------+
236                  *    fbno                             fend
237                  *
238                  */
239
240                 /*
241                  * The busy extent is fully covered by the extent we are
242                  * allocating, and can simply be removed from the rbtree.
243                  * However we cannot remove it from the immutable list
244                  * tracking busy extents in the transaction or CIL context,
245                  * so set the length to zero to mark it invalid.
246                  *
247                  * We also need to restart the busy extent search from the
248                  * tree root, because erasing the node can rearrange the
249                  * tree topology.
250                  */
251                 rb_erase(&busyp->rb_node, &pag->pagb_tree);
252                 busyp->length = 0;
253                 return false;
254         } else if (fend < bend) {
255                 /*
256                  * Case 6:
257                  *              bbno           bend
258                  *             +BBBBBBBBBBBBBBBBB+
259                  *             +---------+
260                  *             fbno   fend
261                  *
262                  * Case 7:
263                  *             bbno           bend
264                  *             +BBBBBBBBBBBBBBBBB+
265                  *    +------------------+
266                  *    fbno            fend
267                  *
268                  */
269                 busyp->bno = fend;
270         } else if (bbno < fbno) {
271                 /*
272                  * Case 8:
273                  *    bbno           bend
274                  *    +BBBBBBBBBBBBBBBBB+
275                  *        +-------------+
276                  *        fbno       fend
277                  *
278                  * Case 9:
279                  *    bbno           bend
280                  *    +BBBBBBBBBBBBBBBBB+
281                  *        +----------------------+
282                  *        fbno                fend
283                  */
284                 busyp->length = fbno - busyp->bno;
285         } else {
286                 ASSERT(0);
287         }
288
289         trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
290         return true;
291
292 out_force_log:
293         spin_unlock(&pag->pagb_lock);
294         xfs_log_force(mp, XFS_LOG_SYNC);
295         trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
296         spin_lock(&pag->pagb_lock);
297         return false;
298 }
299
300
301 /*
302  * For a given extent [fbno, flen], make sure we can reuse it safely.
303  */
304 void
305 xfs_extent_busy_reuse(
306         struct xfs_mount        *mp,
307         xfs_agnumber_t          agno,
308         xfs_agblock_t           fbno,
309         xfs_extlen_t            flen,
310         bool                    userdata)
311 {
312         struct xfs_perag        *pag;
313         struct rb_node          *rbp;
314
315         ASSERT(flen > 0);
316
317         pag = xfs_perag_get(mp, agno);
318         spin_lock(&pag->pagb_lock);
319 restart:
320         rbp = pag->pagb_tree.rb_node;
321         while (rbp) {
322                 struct xfs_extent_busy *busyp =
323                         rb_entry(rbp, struct xfs_extent_busy, rb_node);
324                 xfs_agblock_t   bbno = busyp->bno;
325                 xfs_agblock_t   bend = bbno + busyp->length;
326
327                 if (fbno + flen <= bbno) {
328                         rbp = rbp->rb_left;
329                         continue;
330                 } else if (fbno >= bend) {
331                         rbp = rbp->rb_right;
332                         continue;
333                 }
334
335                 if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
336                                                   userdata))
337                         goto restart;
338         }
339         spin_unlock(&pag->pagb_lock);
340         xfs_perag_put(pag);
341 }
342
343 /*
344  * For a given extent [fbno, flen], search the busy extent list to find a
345  * subset of the extent that is not busy.  If *rlen is smaller than
346  * args->minlen no suitable extent could be found, and the higher level
347  * code needs to force out the log and retry the allocation.
348  */
349 void
350 xfs_extent_busy_trim(
351         struct xfs_alloc_arg    *args,
352         xfs_agblock_t           bno,
353         xfs_extlen_t            len,
354         xfs_agblock_t           *rbno,
355         xfs_extlen_t            *rlen)
356 {
357         xfs_agblock_t           fbno;
358         xfs_extlen_t            flen;
359         struct rb_node          *rbp;
360
361         ASSERT(len > 0);
362
363         spin_lock(&args->pag->pagb_lock);
364 restart:
365         fbno = bno;
366         flen = len;
367         rbp = args->pag->pagb_tree.rb_node;
368         while (rbp && flen >= args->minlen) {
369                 struct xfs_extent_busy *busyp =
370                         rb_entry(rbp, struct xfs_extent_busy, rb_node);
371                 xfs_agblock_t   fend = fbno + flen;
372                 xfs_agblock_t   bbno = busyp->bno;
373                 xfs_agblock_t   bend = bbno + busyp->length;
374
375                 if (fend <= bbno) {
376                         rbp = rbp->rb_left;
377                         continue;
378                 } else if (fbno >= bend) {
379                         rbp = rbp->rb_right;
380                         continue;
381                 }
382
383                 /*
384                  * If this is a metadata allocation, try to reuse the busy
385                  * extent instead of trimming the allocation.
386                  */
387                 if (!args->userdata &&
388                     !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
389                         if (!xfs_extent_busy_update_extent(args->mp, args->pag,
390                                                           busyp, fbno, flen,
391                                                           false))
392                                 goto restart;
393                         continue;
394                 }
395
396                 if (bbno <= fbno) {
397                         /* start overlap */
398
399                         /*
400                          * Case 1:
401                          *    bbno           bend
402                          *    +BBBBBBBBBBBBBBBBB+
403                          *        +---------+
404                          *        fbno   fend
405                          *
406                          * Case 2:
407                          *    bbno           bend
408                          *    +BBBBBBBBBBBBBBBBB+
409                          *    +-------------+
410                          *    fbno       fend
411                          *
412                          * Case 3:
413                          *    bbno           bend
414                          *    +BBBBBBBBBBBBBBBBB+
415                          *        +-------------+
416                          *        fbno       fend
417                          *
418                          * Case 4:
419                          *    bbno           bend
420                          *    +BBBBBBBBBBBBBBBBB+
421                          *    +-----------------+
422                          *    fbno           fend
423                          *
424                          * No unbusy region in extent, return failure.
425                          */
426                         if (fend <= bend)
427                                 goto fail;
428
429                         /*
430                          * Case 5:
431                          *    bbno           bend
432                          *    +BBBBBBBBBBBBBBBBB+
433                          *        +----------------------+
434                          *        fbno                fend
435                          *
436                          * Case 6:
437                          *    bbno           bend
438                          *    +BBBBBBBBBBBBBBBBB+
439                          *    +--------------------------+
440                          *    fbno                    fend
441                          *
442                          * Needs to be trimmed to:
443                          *                       +-------+
444                          *                       fbno fend
445                          */
446                         fbno = bend;
447                 } else if (bend >= fend) {
448                         /* end overlap */
449
450                         /*
451                          * Case 7:
452                          *             bbno           bend
453                          *             +BBBBBBBBBBBBBBBBB+
454                          *    +------------------+
455                          *    fbno            fend
456                          *
457                          * Case 8:
458                          *             bbno           bend
459                          *             +BBBBBBBBBBBBBBBBB+
460                          *    +--------------------------+
461                          *    fbno                    fend
462                          *
463                          * Needs to be trimmed to:
464                          *    +-------+
465                          *    fbno fend
466                          */
467                         fend = bbno;
468                 } else {
469                         /* middle overlap */
470
471                         /*
472                          * Case 9:
473                          *             bbno           bend
474                          *             +BBBBBBBBBBBBBBBBB+
475                          *    +-----------------------------------+
476                          *    fbno                             fend
477                          *
478                          * Can be trimmed to:
479                          *    +-------+        OR         +-------+
480                          *    fbno fend                   fbno fend
481                          *
482                          * Backward allocation leads to significant
483                          * fragmentation of directories, which degrades
484                          * directory performance, therefore we always want to
485                          * choose the option that produces forward allocation
486                          * patterns.
487                          * Preferring the lower bno extent will make the next
488                          * request use "fend" as the start of the next
489                          * allocation;  if the segment is no longer busy at
490                          * that point, we'll get a contiguous allocation, but
491                          * even if it is still busy, we will get a forward
492                          * allocation.
493                          * We try to avoid choosing the segment at "bend",
494                          * because that can lead to the next allocation
495                          * taking the segment at "fbno", which would be a
496                          * backward allocation.  We only use the segment at
497                          * "fbno" if it is much larger than the current
498                          * requested size, because in that case there's a
499                          * good chance subsequent allocations will be
500                          * contiguous.
501                          */
502                         if (bbno - fbno >= args->maxlen) {
503                                 /* left candidate fits perfect */
504                                 fend = bbno;
505                         } else if (fend - bend >= args->maxlen * 4) {
506                                 /* right candidate has enough free space */
507                                 fbno = bend;
508                         } else if (bbno - fbno >= args->minlen) {
509                                 /* left candidate fits minimum requirement */
510                                 fend = bbno;
511                         } else {
512                                 goto fail;
513                         }
514                 }
515
516                 flen = fend - fbno;
517         }
518         spin_unlock(&args->pag->pagb_lock);
519
520         if (fbno != bno || flen != len) {
521                 trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
522                                           fbno, flen);
523         }
524         *rbno = fbno;
525         *rlen = flen;
526         return;
527 fail:
528         /*
529          * Return a zero extent length as failure indications.  All callers
530          * re-check if the trimmed extent satisfies the minlen requirement.
531          */
532         spin_unlock(&args->pag->pagb_lock);
533         trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
534         *rbno = fbno;
535         *rlen = 0;
536 }
537
538 STATIC void
539 xfs_extent_busy_clear_one(
540         struct xfs_mount        *mp,
541         struct xfs_perag        *pag,
542         struct xfs_extent_busy  *busyp)
543 {
544         if (busyp->length) {
545                 trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
546                                                 busyp->length);
547                 rb_erase(&busyp->rb_node, &pag->pagb_tree);
548         }
549
550         list_del_init(&busyp->list);
551         kmem_free(busyp);
552 }
553
554 /*
555  * Remove all extents on the passed in list from the busy extents tree.
556  * If do_discard is set skip extents that need to be discarded, and mark
557  * these as undergoing a discard operation instead.
558  */
559 void
560 xfs_extent_busy_clear(
561         struct xfs_mount        *mp,
562         struct list_head        *list,
563         bool                    do_discard)
564 {
565         struct xfs_extent_busy  *busyp, *n;
566         struct xfs_perag        *pag = NULL;
567         xfs_agnumber_t          agno = NULLAGNUMBER;
568
569         list_for_each_entry_safe(busyp, n, list, list) {
570                 if (busyp->agno != agno) {
571                         if (pag) {
572                                 spin_unlock(&pag->pagb_lock);
573                                 xfs_perag_put(pag);
574                         }
575                         pag = xfs_perag_get(mp, busyp->agno);
576                         spin_lock(&pag->pagb_lock);
577                         agno = busyp->agno;
578                 }
579
580                 if (do_discard && busyp->length &&
581                     !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD))
582                         busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
583                 else
584                         xfs_extent_busy_clear_one(mp, pag, busyp);
585         }
586
587         if (pag) {
588                 spin_unlock(&pag->pagb_lock);
589                 xfs_perag_put(pag);
590         }
591 }
592
593 /*
594  * Callback for list_sort to sort busy extents by the AG they reside in.
595  */
596 int
597 xfs_extent_busy_ag_cmp(
598         void                    *priv,
599         struct list_head        *a,
600         struct list_head        *b)
601 {
602         return container_of(a, struct xfs_extent_busy, list)->agno -
603                 container_of(b, struct xfs_extent_busy, list)->agno;
604 }