These changes are the raw update to QEMU 2.6.
[kvmfornfv.git] qemu/block/qcow2-refcount.c
/*
 * Block driver for the QCOW version 2 format
 *
 * Copyright (c) 2004-2006 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qemu-common.h"
#include "block/block_int.h"
#include "block/qcow2.h"
#include "qemu/range.h"

static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size);
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                            int64_t offset, int64_t length, uint64_t addend,
                            bool decrease, enum qcow2_discard_type type);

static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index);
static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index);

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value);
static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value);


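/* The refcount width in bits is 1 << refcount_order; orders 0 through 6
 * cover 1-bit up to 64-bit refcounts, and the tables below map the order
 * directly to the matching accessor pair. */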
static Qcow2GetRefcountFunc *const get_refcount_funcs[] = {
    &get_refcount_ro0,
    &get_refcount_ro1,
    &get_refcount_ro2,
    &get_refcount_ro3,
    &get_refcount_ro4,
    &get_refcount_ro5,
    &get_refcount_ro6
};

static Qcow2SetRefcountFunc *const set_refcount_funcs[] = {
    &set_refcount_ro0,
    &set_refcount_ro1,
    &set_refcount_ro2,
    &set_refcount_ro3,
    &set_refcount_ro4,
    &set_refcount_ro5,
    &set_refcount_ro6
};


/*********************************************************/
/* refcount handling */

int qcow2_refcount_init(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_size2, i;
    int ret;

    assert(s->refcount_order >= 0 && s->refcount_order <= 6);

    s->get_refcount = get_refcount_funcs[s->refcount_order];
    s->set_refcount = set_refcount_funcs[s->refcount_order];

    assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t));
    refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
    s->refcount_table = g_try_malloc(refcount_table_size2);

    if (s->refcount_table_size > 0) {
        if (s->refcount_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
        ret = bdrv_pread(bs->file->bs, s->refcount_table_offset,
                         s->refcount_table, refcount_table_size2);
        if (ret < 0) {
            goto fail;
        }
        for (i = 0; i < s->refcount_table_size; i++) {
            be64_to_cpus(&s->refcount_table[i]);
        }
    }
    return 0;
 fail:
    return ret;
}

void qcow2_refcount_close(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    g_free(s->refcount_table);
}


static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1;
}

static void set_refcount_ro0(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 1));
    ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8));
    ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8);
}

static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4)))
           & 0x3;
}

static void set_refcount_ro1(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 2));
    ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4)));
    ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4));
}

static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index)
{
    return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2)))
           & 0xf;
}

static void set_refcount_ro2(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 4));
    ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2)));
    ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2));
}
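
/* For these sub-byte widths several refcounts share one byte: with
 * refcount_order = 0, for example, cluster index 11 lands in byte
 * 11 / 8 = 1 at bit 11 % 8 = 3, which is why the setters clear the old
 * bits before OR-ing in the new value. */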

static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index)
{
    return ((const uint8_t *)refcount_array)[index];
}

static void set_refcount_ro3(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 8));
    ((uint8_t *)refcount_array)[index] = value;
}

static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index)
{
    return be16_to_cpu(((const uint16_t *)refcount_array)[index]);
}

static void set_refcount_ro4(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 16));
    ((uint16_t *)refcount_array)[index] = cpu_to_be16(value);
}

static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index)
{
    return be32_to_cpu(((const uint32_t *)refcount_array)[index]);
}

static void set_refcount_ro5(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    assert(!(value >> 32));
    ((uint32_t *)refcount_array)[index] = cpu_to_be32(value);
}

static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index)
{
    return be64_to_cpu(((const uint64_t *)refcount_array)[index]);
}

static void set_refcount_ro6(void *refcount_array, uint64_t index,
                             uint64_t value)
{
    ((uint64_t *)refcount_array)[index] = cpu_to_be64(value);
}


static int load_refcount_block(BlockDriverState *bs,
                               int64_t refcount_block_offset,
                               void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
        refcount_block);

    return ret;
}

/*
 * Retrieves the refcount of the cluster given by its index and stores it in
 * *refcount. Returns 0 on success and -errno on failure.
 */
int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index,
                       uint64_t *refcount)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t refcount_table_index, block_index;
    int64_t refcount_block_offset;
    int ret;
    void *refcount_block;

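    /* Two-level lookup: the upper bits of cluster_index select the refcount
     * block via the refcount table, the lower bits index into that block.
     * As an illustration (assuming the common defaults, not a requirement):
     * with 64 KiB clusters and 16-bit refcounts each block holds 32768
     * entries, so refcount_block_bits is 15. */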
    refcount_table_index = cluster_index >> s->refcount_block_bits;
    if (refcount_table_index >= s->refcount_table_size) {
        *refcount = 0;
        return 0;
    }
    refcount_block_offset =
        s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;
    if (!refcount_block_offset) {
        *refcount = 0;
        return 0;
    }

    if (offset_into_cluster(s, refcount_block_offset)) {
        qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64
                                " unaligned (reftable index: %#" PRIx64 ")",
                                refcount_block_offset, refcount_table_index);
        return -EIO;
    }

    ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset,
                          &refcount_block);
    if (ret < 0) {
        return ret;
    }

    block_index = cluster_index & (s->refcount_block_size - 1);
    *refcount = s->get_refcount(refcount_block, block_index);

    qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);

    return 0;
}

/*
 * Rounds the refcount table size up to avoid growing the table for each single
 * refcount block that is allocated.
 */
static unsigned int next_refcount_table_size(BDRVQcow2State *s,
    unsigned int min_size)
{
    unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
    unsigned int refcount_table_clusters =
        MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));

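    /* Growing by roughly 1.5x per step keeps repeated refblock allocations
     * from rewriting the table every time. */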
    while (min_clusters > refcount_table_clusters) {
        refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
    }

    return refcount_table_clusters << (s->cluster_bits - 3);
}


/* Checks if two offsets are described by the same refcount block */
static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a,
    uint64_t offset_b)
{
    uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits);
    uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits);

    return (block_a == block_b);
}

/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns 0 on success or -errno in error case
 */
static int alloc_refcount_block(BlockDriverState *bs,
                                int64_t cluster_index, void **refcount_block)
{
    BDRVQcow2State *s = bs->opaque;
    unsigned int refcount_table_index;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> s->refcount_block_bits;

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK;

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (offset_into_cluster(s, refcount_block_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
                                        PRIx64 " unaligned (reftable index: "
                                        "%#x)", refcount_block_offset,
                                        refcount_table_index);
                return -EIO;
            }

            return load_refcount_block(bs, refcount_block_offset,
                                       refcount_block);
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(); it would try to
     *   increase the refcount and we would very likely end up in an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they describe themselves.
     *
     * - We need to consider that at this point we are inside update_refcount()
     *   and potentially doing an initial refcount increase. This means that
     *   some clusters have already been allocated by the caller, but their
     *   refcount isn't accurate yet. If we allocate clusters for metadata, we
     *   need to return -EAGAIN to signal the caller that it needs to restart
     *   the search for free clusters.
     *
     * - alloc_clusters_noref() and qcow2_free_clusters() may load a different
     *   refcount block into the cache.
     */

    *refcount_block = NULL;

    /* We write to the refcount table, so we might depend on L2 tables */
    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret < 0) {
        return ret;
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

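    /* Two cases follow: if new_block falls into the range that the new
     * refblock itself will cover, the block has to mark itself as used
     * directly in the cache; otherwise an existing refblock can track it
     * through update_refcount(). */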
    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            (s->refcount_block_size - 1);
        s->set_refcount(*refcount_block, block_index, 1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1, false,
                              QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            goto fail_block;
        }

        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
        if (ret < 0) {
            goto fail_block;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block,
                                    refcount_block);
        if (ret < 0) {
            goto fail_block;
        }

        memset(*refcount_block, 0, s->cluster_size);
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block);
    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret < 0) {
        goto fail_block;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file->bs,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail_block;
        }

        s->refcount_table[refcount_table_index] = new_block;

        /* The new refcount block may be where the caller intended to put its
         * data, so let it restart the search. */
        return -EAGAIN;
    }

    qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far; this will be the
     * basis for calculating the index of the first cluster used for the
     * self-describing refcount structures which we are about to create.
     *
     * Because we reached this point, there cannot be any refcount entries for
     * cluster_index or higher indices yet. However, because new_block has been
     * allocated to describe that cluster (and it will assume this role later
     * on), we cannot use that index; also, new_block may actually have a higher
     * cluster index than cluster_index, so it needs to be taken into account
     * here (and 1 needs to be added to its value because that cluster is used).
     */
    uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1,
                                            (new_block >> s->cluster_bits) + 1),
                                        s->refcount_block_size);

    if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
        return -EFBIG;
    }

    /* And now we need at least one block more for the new metadata */
    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
    uint64_t last_table_size;
    uint64_t blocks_clusters;
    do {
        uint64_t table_clusters =
            size_to_clusters(s, table_size * sizeof(uint64_t));
        blocks_clusters = 1 +
            ((table_clusters + s->refcount_block_size - 1)
            / s->refcount_block_size);
        uint64_t meta_clusters = table_clusters + blocks_clusters;

        last_table_size = table_size;
        table_size = next_refcount_table_size(s, blocks_used +
            ((meta_clusters + s->refcount_block_size - 1)
            / s->refcount_block_size));

    } while (last_table_size != table_size);
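    /* The loop above is a fixed-point iteration: enlarging the table may
     * require more refcount blocks, which may enlarge the table again; it
     * stops once a size is reached that describes itself. */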

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
        s->refcount_table_size, table_size);
#endif

    /* Create the new refcount table and blocks */
    uint64_t meta_offset = (blocks_used * s->refcount_block_size) *
        s->cluster_size;
    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
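    /* The new metadata is placed just past the range covered by the first
     * blocks_used refblocks: blocks_clusters refblocks at meta_offset,
     * immediately followed by the new refcount table at table_offset. */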
    uint64_t *new_table = g_try_new0(uint64_t, table_size);
    void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size);

    assert(table_size > 0 && blocks_clusters > 0);
    if (new_table == NULL || new_blocks == NULL) {
        ret = -ENOMEM;
        goto fail_table;
    }

    /* Fill the new refcount table */
    memcpy(new_table, s->refcount_table,
        s->refcount_table_size * sizeof(uint64_t));
    new_table[refcount_table_index] = new_block;

    int i;
    for (i = 0; i < blocks_clusters; i++) {
        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
    }

    /* Fill the refcount blocks */
    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
    int block = 0;
    for (i = 0; i < table_clusters + blocks_clusters; i++) {
        s->set_refcount(new_blocks, block++, 1);
    }

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks,
        blocks_clusters * s->cluster_size);
    g_free(new_blocks);
    new_blocks = NULL;
    if (ret < 0) {
        goto fail_table;
    }

    /* Write refcount table to disk */
    for (i = 0; i < table_size; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail_table;
    }

    for (i = 0; i < table_size; i++) {
        be64_to_cpus(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header */
    struct QEMU_PACKED {
        uint64_t d64;
        uint32_t d32;
    } data;
    cpu_to_be64w(&data.d64, table_offset);
    cpu_to_be32w(&data.d32, table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file->bs,
                           offsetof(QCowHeader, refcount_table_offset),
                           &data, sizeof(data));
    if (ret < 0) {
        goto fail_table;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    g_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;

    /* Free old table. */
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t),
                        QCOW2_DISCARD_OTHER);

    ret = load_refcount_block(bs, new_block, refcount_block);
    if (ret < 0) {
        return ret;
    }

    /* If we were trying to do the initial refcount update for some cluster
     * allocation, we might have used the same clusters to store newly
     * allocated metadata. Make the caller search some new space. */
    return -EAGAIN;

fail_table:
    g_free(new_blocks);
    g_free(new_table);
fail_block:
    if (*refcount_block != NULL) {
        qcow2_cache_put(bs, s->refcount_block_cache, refcount_block);
    }
    return ret;
}

void qcow2_process_discards(BlockDriverState *bs, int ret)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *next;

    QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) {
        QTAILQ_REMOVE(&s->discards, d, next);

        /* Discard is optional, ignore the return value */
        if (ret >= 0) {
            bdrv_discard(bs->file->bs,
                         d->offset >> BDRV_SECTOR_BITS,
                         d->bytes >> BDRV_SECTOR_BITS);
        }

        g_free(d);
    }
}

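/* Queue [offset, offset + length) for later discard, merging it with an
 * already queued region whenever the result stays contiguous; the ranges
 * themselves can never overlap because each cluster is freed only once. */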
static void update_refcount_discard(BlockDriverState *bs,
                                    uint64_t offset, uint64_t length)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2DiscardRegion *d, *p, *next;

    QTAILQ_FOREACH(d, &s->discards, next) {
        uint64_t new_start = MIN(offset, d->offset);
        uint64_t new_end = MAX(offset + length, d->offset + d->bytes);

        if (new_end - new_start <= length + d->bytes) {
            /* There can't be any overlap, areas ending up here have no
             * references any more and therefore shouldn't get freed another
             * time. */
            assert(d->bytes + length == new_end - new_start);
            d->offset = new_start;
            d->bytes = new_end - new_start;
            goto found;
        }
    }

    d = g_malloc(sizeof(*d));
    *d = (Qcow2DiscardRegion) {
        .bs     = bs,
        .offset = offset,
        .bytes  = length,
    };
    QTAILQ_INSERT_TAIL(&s->discards, d, next);

found:
    /* Merge discard requests if they are adjacent now */
    QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) {
        if (p == d
            || p->offset > d->offset + d->bytes
            || d->offset > p->offset + p->bytes)
        {
            continue;
        }

        /* Still no overlap possible */
        assert(p->offset == d->offset + d->bytes
            || d->offset == p->offset + p->bytes);

        QTAILQ_REMOVE(&s->discards, p, next);
        d->offset = MIN(d->offset, p->offset);
        d->bytes += p->bytes;
        g_free(p);
    }
}

/* XXX: cache several refcount block clusters ? */
/* @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
                                                   int64_t offset,
                                                   int64_t length,
                                                   uint64_t addend,
                                                   bool decrease,
                                                   enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t start, last, cluster_offset;
    void *refcount_block = NULL;
    int64_t old_table_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64
            " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "",
            addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    if (decrease) {
        qcow2_cache_set_dependency(bs, s->refcount_block_cache,
            s->l2_table_cache);
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + length - 1);
    for (cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index;
        uint64_t refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t table_index = cluster_index >> s->refcount_block_bits;

        /* Load the refcount block and allocate it if needed */
        if (table_index != old_table_index) {
            if (refcount_block) {
                qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
            }
            ret = alloc_refcount_block(bs, cluster_index, &refcount_block);
            if (ret < 0) {
                goto fail;
            }
        }
        old_table_index = table_index;

        qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache,
                                     refcount_block);

        /* we can update the count and save it */
        block_index = cluster_index & (s->refcount_block_size - 1);

        refcount = s->get_refcount(refcount_block, block_index);
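        /* Unsigned wraparound flags out-of-range results: an underflow makes
         * refcount - addend compare larger than refcount, an overflow makes
         * refcount + addend compare smaller. */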
        if (decrease ? (refcount - addend > refcount)
                     : (refcount + addend < refcount ||
                        refcount + addend > s->refcount_max))
        {
            ret = -EINVAL;
            goto fail;
        }
        if (decrease) {
            refcount -= addend;
        } else {
            refcount += addend;
        }
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->set_refcount(refcount_block, block_index, refcount);

        if (refcount == 0 && s->discard_passthrough[type]) {
            update_refcount_discard(bs, cluster_offset, s->cluster_size);
        }
    }

    ret = 0;
fail:
    if (!s->cache_discards) {
        qcow2_process_discards(bs, ret);
    }

    /* Write last changed block to disk */
    if (refcount_block) {
        qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block);
    }

    /*
     * Try to undo any updates if an error is returned (this may succeed in
     * some cases, like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        dummy = update_refcount(bs, offset, cluster_offset - offset, addend,
                                !decrease, QCOW2_DISCARD_NEVER);
        (void)dummy;
    }

    return ret;
}

/*
 * Increases or decreases the refcount of a given cluster.
 *
 * @addend is the absolute value of the addend; if @decrease is set, @addend
 * will be subtracted from the current refcount, otherwise it will be added.
 *
 * On success 0 is returned; on failure -errno is returned.
 */
int qcow2_update_cluster_refcount(BlockDriverState *bs,
                                  int64_t cluster_index,
                                  uint64_t addend, bool decrease,
                                  enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;
    int ret;

    ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend,
                          decrease, type);
    if (ret < 0) {
        return ret;
    }

    return 0;
}



/*********************************************************/
/* cluster allocation functions */



/* return < 0 if error */
static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t i, nb_clusters, refcount;
    int ret;

    /* We can't allocate clusters if they may still be queued for discard. */
    if (s->cache_discards) {
        qcow2_process_discards(bs, 0);
    }

    nb_clusters = size_to_clusters(s, size);
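    /* Scan forward from free_cluster_index; hitting an in-use cluster inside
     * the candidate range restarts the scan just past it, so on exit there
     * are nb_clusters contiguous free clusters ending at
     * free_cluster_index - 1. */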
retry:
    for (i = 0; i < nb_clusters; i++) {
        uint64_t next_cluster_index = s->free_cluster_index++;
        ret = qcow2_get_refcount(bs, next_cluster_index, &refcount);

        if (ret < 0) {
            return ret;
        } else if (refcount != 0) {
            goto retry;
        }
    }

    /* Make sure that all offsets in the "allocated" range are representable
     * in an int64_t */
    if (s->free_cluster_index > 0 &&
        s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits))
    {
        return -EFBIG;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
            size,
            (s->free_cluster_index - nb_clusters) << s->cluster_bits);
#endif
    return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
}

int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size)
{
    int64_t offset;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
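    /* update_refcount() returns -EAGAIN when it had to allocate refcount
     * metadata itself, because that allocation may have taken the clusters
     * picked above; in that case simply search again. */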
    do {
        offset = alloc_clusters_noref(bs, size);
        if (offset < 0) {
            return offset;
        }

        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return offset;
}

int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset,
                                int64_t nb_clusters)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t cluster_index, refcount;
    uint64_t i;
    int ret;

    assert(nb_clusters >= 0);
    if (nb_clusters == 0) {
        return 0;
    }

    do {
        /* Check how many clusters there are free */
        cluster_index = offset >> s->cluster_bits;
        for (i = 0; i < nb_clusters; i++) {
            ret = qcow2_get_refcount(bs, cluster_index++, &refcount);
            if (ret < 0) {
                return ret;
            } else if (refcount != 0) {
                break;
            }
        }

        /* And then allocate them */
        ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false,
                              QCOW2_DISCARD_NEVER);
    } while (ret == -EAGAIN);

    if (ret < 0) {
        return ret;
    }

    return i;
}

/* only used to allocate compressed sectors. We try to allocate
   contiguous sectors. size must be <= cluster_size */
int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
{
    BDRVQcow2State *s = bs->opaque;
    int64_t offset;
    size_t free_in_cluster;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    assert(size > 0 && size <= s->cluster_size);
    assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset));

    offset = s->free_byte_offset;

    if (offset) {
        uint64_t refcount;
        ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount);
        if (ret < 0) {
            return ret;
        }

        if (refcount == s->refcount_max) {
            offset = 0;
        }
    }

    free_in_cluster = s->cluster_size - offset_into_cluster(s, offset);
    do {
        if (!offset || free_in_cluster < size) {
            int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size);
            if (new_cluster < 0) {
                return new_cluster;
            }

            if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) {
                offset = new_cluster;
                free_in_cluster = s->cluster_size;
            } else {
                free_in_cluster += s->cluster_size;
            }
        }

        assert(offset);
        ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER);
        if (ret < 0) {
            offset = 0;
        }
    } while (ret == -EAGAIN);
    if (ret < 0) {
        return ret;
    }

    /* The cluster refcount was incremented; refcount blocks must be flushed
     * before the caller's L2 table updates. */
    qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache);

    s->free_byte_offset = offset + size;
    if (!offset_into_cluster(s, s->free_byte_offset)) {
        s->free_byte_offset = 0;
    }

    return offset;
}

void qcow2_free_clusters(BlockDriverState *bs,
                          int64_t offset, int64_t size,
                          enum qcow2_discard_type type)
{
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    ret = update_refcount(bs, offset, size, 1, true, type);
    if (ret < 0) {
        fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
        /* TODO Remember the clusters to free them later and avoid leaking */
    }
}

/*
 * Free a cluster using its L2 entry (handles clusters of all types, e.g.
 * normal cluster, compressed cluster, etc.)
 */
void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry,
                             int nb_clusters, enum qcow2_discard_type type)
{
    BDRVQcow2State *s = bs->opaque;

    switch (qcow2_get_cluster_type(l2_entry)) {
    case QCOW2_CLUSTER_COMPRESSED:
        {
            int nb_csectors;
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            qcow2_free_clusters(bs,
                (l2_entry & s->cluster_offset_mask) & ~511,
                nb_csectors * 512, type);
        }
        break;
    case QCOW2_CLUSTER_NORMAL:
    case QCOW2_CLUSTER_ZERO:
        if (l2_entry & L2E_OFFSET_MASK) {
            if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) {
                qcow2_signal_corruption(bs, false, -1, -1,
                                        "Cannot free unaligned cluster %#llx",
                                        l2_entry & L2E_OFFSET_MASK);
            } else {
                qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK,
                                    nb_clusters << s->cluster_bits, type);
            }
        }
        break;
    case QCOW2_CLUSTER_UNALLOCATED:
        break;
    default:
        abort();
    }
}



/*********************************************************/
/* snapshots and image creation */



/* update the refcounts of snapshots and the copied flag */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount;
    bool l1_allocated = false;
    int64_t old_offset, old_l2_offset;
    int i, j, l1_modified = 0, nb_csectors;
    int ret;

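    /* addend may be -1, 0 or +1; with 0 the refcounts are left untouched and
     * only the QCOW_OFLAG_COPIED flags are recomputed from them. */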
    assert(addend >= -1 && addend <= 1);

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);

    s->cache_discards = true;

    /* WARNING: qcow2_snapshot_goto relies on this function not using the
     * l1_table_offset when it is the current s->l1_table_offset! Be careful
     * when changing this! */
    if (l1_table_offset != s->l1_table_offset) {
        l1_table = g_try_malloc0(align_offset(l1_size2, 512));
        if (l1_size2 && l1_table == NULL) {
            ret = -ENOMEM;
            goto fail;
        }
        l1_allocated = true;

        ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
        if (ret < 0) {
            goto fail;
        }

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    } else {
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = false;
    }

    for (i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            old_l2_offset = l2_offset;
            l2_offset &= L1E_OFFSET_MASK;

            if (offset_into_cluster(s, l2_offset)) {
                qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#"
                                        PRIx64 " unaligned (L1 index: %#x)",
                                        l2_offset, i);
                ret = -EIO;
                goto fail;
            }

            ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset,
                (void**) &l2_table);
            if (ret < 0) {
                goto fail;
            }

            for (j = 0; j < s->l2_size; j++) {
                uint64_t cluster_index;

                offset = be64_to_cpu(l2_table[j]);
                old_offset = offset;
                offset &= ~QCOW_OFLAG_COPIED;

                switch (qcow2_get_cluster_type(offset)) {
                    case QCOW2_CLUSTER_COMPRESSED:
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
                            ret = update_refcount(bs,
                                (offset & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, abs(addend), addend < 0,
                                QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                        break;

                    case QCOW2_CLUSTER_NORMAL:
                    case QCOW2_CLUSTER_ZERO:
                        if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) {
                            qcow2_signal_corruption(bs, true, -1, -1, "Data "
                                                    "cluster offset %#llx "
                                                    "unaligned (L2 offset: %#"
                                                    PRIx64 ", L2 index: %#x)",
                                                    offset & L2E_OFFSET_MASK,
                                                    l2_offset, j);
                            ret = -EIO;
                            goto fail;
                        }

                        cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits;
                        if (!cluster_index) {
                            /* unallocated */
                            refcount = 0;
                            break;
                        }
                        if (addend != 0) {
                            ret = qcow2_update_cluster_refcount(bs,
                                    cluster_index, abs(addend), addend < 0,
                                    QCOW2_DISCARD_SNAPSHOT);
                            if (ret < 0) {
                                goto fail;
                            }
                        }

                        ret = qcow2_get_refcount(bs, cluster_index, &refcount);
                        if (ret < 0) {
                            goto fail;
                        }
                        break;

                    case QCOW2_CLUSTER_UNALLOCATED:
                        refcount = 0;
                        break;

                    default:
                        abort();
                }

                if (refcount == 1) {
                    offset |= QCOW_OFLAG_COPIED;
                }
                if (offset != old_offset) {
                    if (addend > 0) {
                        qcow2_cache_set_dependency(bs, s->l2_table_cache,
                            s->refcount_block_cache);
                    }
                    l2_table[j] = cpu_to_be64(offset);
                    qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache,
                                                 l2_table);
                }
            }

            qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

            if (addend != 0) {
                ret = qcow2_update_cluster_refcount(bs, l2_offset >>
                                                        s->cluster_bits,
                                                    abs(addend), addend < 0,
                                                    QCOW2_DISCARD_SNAPSHOT);
                if (ret < 0) {
                    goto fail;
                }
            }
            ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
                                     &refcount);
            if (ret < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }

    ret = bdrv_flush(bs);
fail:
    if (l2_table) {
        qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table);
    }

    s->cache_discards = false;
    qcow2_process_discards(bs, ret);

    /* Update L1 only if it isn't deleted anyway (addend = -1) */
    if (ret == 0 && addend >= 0 && l1_modified) {
        for (i = 0; i < l1_size; i++) {
            cpu_to_be64s(&l1_table[i]);
        }

        ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset,
                               l1_table, l1_size2);

        for (i = 0; i < l1_size; i++) {
            be64_to_cpus(&l1_table[i]);
        }
    }
    if (l1_allocated) {
        g_free(l1_table);
    }
    return ret;
}




/*********************************************************/
/* refcount checking functions */


static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries)
{
    /* This assertion holds because there is no way we can address more than
     * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because
     * offsets have to be representable in bytes); due to every cluster
     * corresponding to one refcount entry, we are well below that limit */
    assert(entries < (UINT64_C(1) << (64 - 9)));

    /* Thanks to the assertion this will not overflow, because
     * s->refcount_order < 7.
     * (note: x << s->refcount_order == x * s->refcount_bits) */
    return DIV_ROUND_UP(entries << s->refcount_order, 8);
}

/**
 * Reallocates *array so that it can hold new_size entries. *size must contain
 * the current number of entries in *array. If the reallocation fails, *array
 * and *size will not be modified and -errno will be returned. If the
 * reallocation is successful, *array will be set to the new buffer, *size
 * will be set to new_size and 0 will be returned. The size of the reallocated
 * refcount array buffer will be aligned to a cluster boundary, and the newly
 * allocated area will be zeroed.
 */
static int realloc_refcount_array(BDRVQcow2State *s, void **array,
                                  int64_t *size, int64_t new_size)
{
    int64_t old_byte_size, new_byte_size;
    void *new_ptr;

    /* Round to clusters so the array can be directly written to disk */
    old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size))
                    * s->cluster_size;
    new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size))
                    * s->cluster_size;

    if (new_byte_size == old_byte_size) {
        *size = new_size;
        return 0;
    }

    assert(new_byte_size > 0);

    if (new_byte_size > SIZE_MAX) {
        return -ENOMEM;
    }

    new_ptr = g_try_realloc(*array, new_byte_size);
    if (!new_ptr) {
        return -ENOMEM;
    }

    if (new_byte_size > old_byte_size) {
        memset((char *)new_ptr + old_byte_size, 0,
               new_byte_size - old_byte_size);
    }

    *array = new_ptr;
    *size  = new_size;

    return 0;
}

/*
 * Increases the refcount for a range of clusters in a given refcount table.
 * This is used to construct a temporary refcount table out of L1 and L2 tables
 * which can be compared to the refcount table saved in the image.
 *
 * Modifies the number of errors in res.
 */
static int inc_refcounts(BlockDriverState *bs,
                         BdrvCheckResult *res,
                         void **refcount_table,
                         int64_t *refcount_table_size,
                         int64_t offset, int64_t size)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t start, last, cluster_offset, k, refcount;
    int ret;

    if (size <= 0) {
        return 0;
    }

    start = start_of_cluster(s, offset);
    last = start_of_cluster(s, offset + size - 1);
    for (cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size) {
        k = cluster_offset >> s->cluster_bits;
        if (k >= *refcount_table_size) {
            ret = realloc_refcount_array(s, refcount_table,
                                         refcount_table_size, k + 1);
            if (ret < 0) {
                res->check_errors++;
                return ret;
            }
        }

        refcount = s->get_refcount(*refcount_table, k);
        if (refcount == s->refcount_max) {
            fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
                    "\n", cluster_offset);
            fprintf(stderr, "Use qemu-img amend to increase the refcount entry "
                    "width or qemu-img convert to create a clean copy if the "
                    "image cannot be opened for writing\n");
            res->corruptions++;
            continue;
        }
        s->set_refcount(*refcount_table, k, refcount + 1);
    }

    return 0;
}

/* Flags for check_refcounts_l1() and check_refcounts_l2() */
enum {
    CHECK_FRAG_INFO = 0x2,      /* update BlockFragInfo counters */
};

/*
 * Increases the refcount in the given refcount table for all clusters
 * referenced in the L2 table. While doing so, performs some checks on L2
 * entries.
 *
 * Returns the number of errors found by the checks or -errno if an internal
 * error occurred.
 */
static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
                              void **refcount_table,
                              int64_t *refcount_table_size, int64_t l2_offset,
                              int flags)
{
    BDRVQcow2State *s = bs->opaque;
    uint64_t *l2_table, l2_entry;
    uint64_t next_contiguous_offset = 0;
    int i, l2_size, nb_csectors, ret;

    /* Read L2 table from disk */
    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = g_malloc(l2_size);

    ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size);
    if (ret < 0) {
        fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
        res->check_errors++;
        goto fail;
    }

    /* Do the actual checks */
    for (i = 0; i < s->l2_size; i++) {
        l2_entry = be64_to_cpu(l2_table[i]);

        switch (qcow2_get_cluster_type(l2_entry)) {
        case QCOW2_CLUSTER_COMPRESSED:
            /* Compressed clusters don't have QCOW_OFLAG_COPIED */
            if (l2_entry & QCOW_OFLAG_COPIED) {
                fprintf(stderr, "ERROR: cluster %" PRId64 ": "
                    "copied flag must never be set for compressed "
                    "clusters\n", l2_entry >> s->cluster_bits);
                l2_entry &= ~QCOW_OFLAG_COPIED;
                res->corruptions++;
            }

            /* Mark cluster as used */
            nb_csectors = ((l2_entry >> s->csize_shift) &
                           s->csize_mask) + 1;
            l2_entry &= s->cluster_offset_mask;
            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
                                l2_entry & ~511, nb_csectors * 512);
            if (ret < 0) {
                goto fail;
            }

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                res->bfi.compressed_clusters++;

                /* Compressed clusters are fragmented by nature.  Since they
                 * take up sub-sector space but we only have sector granularity
                 * I/O we need to re-read the same sectors even for adjacent
                 * compressed clusters.
                 */
                res->bfi.fragmented_clusters++;
            }
            break;

        case QCOW2_CLUSTER_ZERO:
            if ((l2_entry & L2E_OFFSET_MASK) == 0) {
                break;
            }
            /* fall through */

        case QCOW2_CLUSTER_NORMAL:
        {
            uint64_t offset = l2_entry & L2E_OFFSET_MASK;

            if (flags & CHECK_FRAG_INFO) {
                res->bfi.allocated_clusters++;
                if (next_contiguous_offset &&
                    offset != next_contiguous_offset) {
                    res->bfi.fragmented_clusters++;
                }
                next_contiguous_offset = offset + s->cluster_size;
            }

            /* Mark cluster as used */
            ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
                                offset, s->cluster_size);
            if (ret < 0) {
                goto fail;
            }

            /* Correct offsets are cluster aligned */
            if (offset_into_cluster(s, offset)) {
                fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
                    "properly aligned; L2 entry corrupted.\n", offset);
                res->corruptions++;
            }
            break;
        }

        case QCOW2_CLUSTER_UNALLOCATED:
            break;

        default:
            abort();
        }
    }

    g_free(l2_table);
    return 0;

fail:
    g_free(l2_table);
    return ret;
}
1484
1485 /*
1486  * Increases the refcount for the L1 table, its L2 tables and all referenced
1487  * clusters in the given refcount table. While doing so, performs some checks
1488  * on L1 and L2 entries.
1489  *
1490  * Returns the number of errors found by the checks or -errno if an internal
1491  * error occurred.
1492  */
1493 static int check_refcounts_l1(BlockDriverState *bs,
1494                               BdrvCheckResult *res,
1495                               void **refcount_table,
1496                               int64_t *refcount_table_size,
1497                               int64_t l1_table_offset, int l1_size,
1498                               int flags)
1499 {
1500     BDRVQcow2State *s = bs->opaque;
1501     uint64_t *l1_table = NULL, l2_offset, l1_size2;
1502     int i, ret;
1503
1504     l1_size2 = l1_size * sizeof(uint64_t);
1505
1506     /* Mark L1 table as used */
1507     ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1508                         l1_table_offset, l1_size2);
1509     if (ret < 0) {
1510         goto fail;
1511     }
1512
1513     /* Read L1 table entries from disk */
1514     if (l1_size2 > 0) {
1515         l1_table = g_try_malloc(l1_size2);
1516         if (l1_table == NULL) {
1517             ret = -ENOMEM;
1518             res->check_errors++;
1519             goto fail;
1520         }
1521         ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2);
1522         if (ret < 0) {
1523             fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
1524             res->check_errors++;
1525             goto fail;
1526         }
1527         for (i = 0; i < l1_size; i++)
1528             be64_to_cpus(&l1_table[i]);
1529     }
1530
1531     /* Do the actual checks */
1532     for (i = 0; i < l1_size; i++) {
1533         l2_offset = l1_table[i];
1534         if (l2_offset) {
1535             /* Mark L2 table as used */
1536             l2_offset &= L1E_OFFSET_MASK;
1537             ret = inc_refcounts(bs, res, refcount_table, refcount_table_size,
1538                                 l2_offset, s->cluster_size);
1539             if (ret < 0) {
1540                 goto fail;
1541             }
1542
1543             /* L2 tables are cluster aligned */
1544             if (offset_into_cluster(s, l2_offset)) {
1545                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
1546                     "cluster aligned; L1 entry corrupted\n", l2_offset);
1547                 res->corruptions++;
1548             }
1549
1550             /* Process and check L2 entries */
1551             ret = check_refcounts_l2(bs, res, refcount_table,
1552                                      refcount_table_size, l2_offset, flags);
1553             if (ret < 0) {
1554                 goto fail;
1555             }
1556         }
1557     }
1558     g_free(l1_table);
1559     return 0;
1560
1561 fail:
1562     g_free(l1_table);
1563     return ret;
1564 }
1565
1566 /*
1567  * Checks the OFLAG_COPIED flag for all L1 and L2 entries.
1568  *
1569  * This function does not print an error message nor does it increment
1570  * check_errors if qcow2_get_refcount fails (this is because such an error will
1571  * have already been detected and sufficiently signaled by the calling function
1572  * (qcow2_check_refcounts) by the time this function is called).
1573  */
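/*
 * The invariant being checked is: an L1/L2 entry must have QCOW_OFLAG_COPIED
 * set if and only if the refcount of the cluster it points to is exactly 1.
 * As an illustrative sketch of the per-entry test applied below:
 *
 *     bool copied = (entry & QCOW_OFLAG_COPIED) != 0;
 *     if ((refcount == 1) != copied) {
 *         // mismatch: repair the flag if BDRV_FIX_ERRORS is set, otherwise
 *         // count it as a corruption
 *     }
 */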
1574 static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res,
1575                               BdrvCheckMode fix)
1576 {
1577     BDRVQcow2State *s = bs->opaque;
1578     uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size);
1579     int ret;
1580     uint64_t refcount;
1581     int i, j;
1582
1583     for (i = 0; i < s->l1_size; i++) {
1584         uint64_t l1_entry = s->l1_table[i];
1585         uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
1586         bool l2_dirty = false;
1587
1588         if (!l2_offset) {
1589             continue;
1590         }
1591
1592         ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits,
1593                                  &refcount);
1594         if (ret < 0) {
1595             /* don't print message nor increment check_errors */
1596             continue;
1597         }
1598         if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
1599             fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d "
1600                     "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1601                     fix & BDRV_FIX_ERRORS ? "Repairing" :
1602                                             "ERROR",
1603                     i, l1_entry, refcount);
1604             if (fix & BDRV_FIX_ERRORS) {
1605                 s->l1_table[i] = refcount == 1
1606                                ? l1_entry |  QCOW_OFLAG_COPIED
1607                                : l1_entry & ~QCOW_OFLAG_COPIED;
1608                 ret = qcow2_write_l1_entry(bs, i);
1609                 if (ret < 0) {
1610                     res->check_errors++;
1611                     goto fail;
1612                 }
1613                 res->corruptions_fixed++;
1614             } else {
1615                 res->corruptions++;
1616             }
1617         }
1618
1619         ret = bdrv_pread(bs->file->bs, l2_offset, l2_table,
1620                          s->l2_size * sizeof(uint64_t));
1621         if (ret < 0) {
1622             fprintf(stderr, "ERROR: Could not read L2 table: %s\n",
1623                     strerror(-ret));
1624             res->check_errors++;
1625             goto fail;
1626         }
1627
1628         for (j = 0; j < s->l2_size; j++) {
1629             uint64_t l2_entry = be64_to_cpu(l2_table[j]);
1630             uint64_t data_offset = l2_entry & L2E_OFFSET_MASK;
1631             int cluster_type = qcow2_get_cluster_type(l2_entry);
1632
1633             if ((cluster_type == QCOW2_CLUSTER_NORMAL) ||
1634                 ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) {
1635                 ret = qcow2_get_refcount(bs,
1636                                          data_offset >> s->cluster_bits,
1637                                          &refcount);
1638                 if (ret < 0) {
1639                     /* don't print message nor increment check_errors */
1640                     continue;
1641                 }
1642                 if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
1643                     fprintf(stderr, "%s OFLAG_COPIED data cluster: "
1644                             "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n",
1645                             fix & BDRV_FIX_ERRORS ? "Repairing" :
1646                                                     "ERROR",
1647                             l2_entry, refcount);
1648                     if (fix & BDRV_FIX_ERRORS) {
1649                         l2_table[j] = cpu_to_be64(refcount == 1
1650                                     ? l2_entry |  QCOW_OFLAG_COPIED
1651                                     : l2_entry & ~QCOW_OFLAG_COPIED);
1652                         l2_dirty = true;
1653                         res->corruptions_fixed++;
1654                     } else {
1655                         res->corruptions++;
1656                     }
1657                 }
1658             }
1659         }
1660
1661         if (l2_dirty) {
1662             ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
1663                                                 l2_offset, s->cluster_size);
1664             if (ret < 0) {
1665                 fprintf(stderr, "ERROR: Could not write L2 table; metadata "
1666                         "overlap check failed: %s\n", strerror(-ret));
1667                 res->check_errors++;
1668                 goto fail;
1669             }
1670
1671             ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table,
1672                               s->cluster_size);
1673             if (ret < 0) {
1674                 fprintf(stderr, "ERROR: Could not write L2 table: %s\n",
1675                         strerror(-ret));
1676                 res->check_errors++;
1677                 goto fail;
1678             }
1679         }
1680     }
1681
1682     ret = 0;
1683
1684 fail:
1685     qemu_vfree(l2_table);
1686     return ret;
1687 }
1688
1689 /*
1690  * Checks consistency of refblocks and accounts for each refblock in
1691  * *refcount_table.
1692  */
1693 static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res,
1694                            BdrvCheckMode fix, bool *rebuild,
1695                            void **refcount_table, int64_t *nb_clusters)
1696 {
1697     BDRVQcow2State *s = bs->opaque;
1698     int64_t i, size;
1699     int ret;
1700
1701     for (i = 0; i < s->refcount_table_size; i++) {
1702         uint64_t offset, cluster;
1703         offset = s->refcount_table[i];
1704         cluster = offset >> s->cluster_bits;
1705
1706         /* Refcount blocks are cluster aligned */
1707         if (offset_into_cluster(s, offset)) {
1708             fprintf(stderr, "ERROR refcount block %" PRId64 " is not "
1709                 "cluster aligned; refcount table entry corrupted\n", i);
1710             res->corruptions++;
1711             *rebuild = true;
1712             continue;
1713         }
1714
1715         if (cluster >= *nb_clusters) {
1716             fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n",
1717                     fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i);
1718
1719             if (fix & BDRV_FIX_ERRORS) {
1720                 int64_t new_nb_clusters;
1721
1722                 if (offset > INT64_MAX - s->cluster_size) {
1723                     ret = -EINVAL;
1724                     goto resize_fail;
1725                 }
1726
1727                 ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size);
1728                 if (ret < 0) {
1729                     goto resize_fail;
1730                 }
1731                 size = bdrv_getlength(bs->file->bs);
1732                 if (size < 0) {
1733                     ret = size;
1734                     goto resize_fail;
1735                 }
1736
1737                 new_nb_clusters = size_to_clusters(s, size);
1738                 assert(new_nb_clusters >= *nb_clusters);
1739
1740                 ret = realloc_refcount_array(s, refcount_table,
1741                                              nb_clusters, new_nb_clusters);
1742                 if (ret < 0) {
1743                     res->check_errors++;
1744                     return ret;
1745                 }
1746
1747                 if (cluster >= *nb_clusters) {
1748                     ret = -EINVAL;
1749                     goto resize_fail;
1750                 }
1751
1752                 res->corruptions_fixed++;
1753                 ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1754                                     offset, s->cluster_size);
1755                 if (ret < 0) {
1756                     return ret;
1757                 }
1758                 /* No need to check whether the refcount is now greater than 1:
1759                  * This area was just allocated and zeroed, so it can only be
1760                  * exactly 1 after inc_refcounts() */
1761                 continue;
1762
1763 resize_fail:
1764                 res->corruptions++;
1765                 *rebuild = true;
1766                 fprintf(stderr, "ERROR could not resize image: %s\n",
1767                         strerror(-ret));
1768             } else {
1769                 res->corruptions++;
1770             }
1771             continue;
1772         }
1773
1774         if (offset != 0) {
1775             ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1776                                 offset, s->cluster_size);
1777             if (ret < 0) {
1778                 return ret;
1779             }
1780             if (s->get_refcount(*refcount_table, cluster) != 1) {
1781                 fprintf(stderr, "ERROR refcount block %" PRId64
1782                         " refcount=%" PRIu64 "\n", i,
1783                         s->get_refcount(*refcount_table, cluster));
1784                 res->corruptions++;
1785                 *rebuild = true;
1786             }
1787         }
1788     }
1789
1790     return 0;
1791 }
1792
1793 /*
1794  * Calculates an in-memory refcount table.
1795  */
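/*
 * A hedged usage sketch (this mirrors the call made from
 * qcow2_check_refcounts() further below); the variable setup is illustrative:
 *
 *     void *refcount_table = NULL;
 *     int64_t nb_clusters = size_to_clusters(s, size);
 *     bool rebuild = false;
 *
 *     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
 *                               &nb_clusters);
 *     // on success, entry i of refcount_table holds the recalculated
 *     // refcount of cluster i, encoded according to s->refcount_order
 */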
1796 static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1797                                BdrvCheckMode fix, bool *rebuild,
1798                                void **refcount_table, int64_t *nb_clusters)
1799 {
1800     BDRVQcow2State *s = bs->opaque;
1801     int64_t i;
1802     QCowSnapshot *sn;
1803     int ret;
1804
1805     if (!*refcount_table) {
1806         int64_t old_size = 0;
1807         ret = realloc_refcount_array(s, refcount_table,
1808                                      &old_size, *nb_clusters);
1809         if (ret < 0) {
1810             res->check_errors++;
1811             return ret;
1812         }
1813     }
1814
1815     /* header */
1816     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1817                         0, s->cluster_size);
1818     if (ret < 0) {
1819         return ret;
1820     }
1821
1822     /* current L1 table */
1823     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1824                              s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO);
1825     if (ret < 0) {
1826         return ret;
1827     }
1828
1829     /* snapshots */
1830     for (i = 0; i < s->nb_snapshots; i++) {
1831         sn = s->snapshots + i;
1832         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
1833                                  sn->l1_table_offset, sn->l1_size, 0);
1834         if (ret < 0) {
1835             return ret;
1836         }
1837     }
1838     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1839                         s->snapshots_offset, s->snapshots_size);
1840     if (ret < 0) {
1841         return ret;
1842     }
1843
1844     /* refcount data */
1845     ret = inc_refcounts(bs, res, refcount_table, nb_clusters,
1846                         s->refcount_table_offset,
1847                         s->refcount_table_size * sizeof(uint64_t));
1848     if (ret < 0) {
1849         return ret;
1850     }
1851
1852     return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters);
1853 }
1854
1855 /*
1856  * Compares the actual reference count for each cluster in the image against the
1857  * refcount as reported by the refcount structures on-disk.
1858  */
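/*
 * How a mismatch is classified below (refcount1 is the value read from the
 * on-disk refcount structures, refcount2 the recalculated reference count):
 *
 *     refcount1 == 0, refcount2 > 0  ->  refcount structures are unusable;
 *                                        flag the image for a rebuild
 *     refcount1 > refcount2          ->  leak (repairable with BDRV_FIX_LEAKS)
 *     refcount1 < refcount2          ->  corruption (repairable with
 *                                        BDRV_FIX_ERRORS)
 */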
1859 static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
1860                               BdrvCheckMode fix, bool *rebuild,
1861                               int64_t *highest_cluster,
1862                               void *refcount_table, int64_t nb_clusters)
1863 {
1864     BDRVQcow2State *s = bs->opaque;
1865     int64_t i;
1866     uint64_t refcount1, refcount2;
1867     int ret;
1868
1869     for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) {
1870         ret = qcow2_get_refcount(bs, i, &refcount1);
1871         if (ret < 0) {
1872             fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n",
1873                     i, strerror(-ret));
1874             res->check_errors++;
1875             continue;
1876         }
1877
1878         refcount2 = s->get_refcount(refcount_table, i);
1879
1880         if (refcount1 > 0 || refcount2 > 0) {
1881             *highest_cluster = i;
1882         }
1883
1884         if (refcount1 != refcount2) {
1885             /* Check if we're allowed to fix the mismatch */
1886             int *num_fixed = NULL;
1887             if (refcount1 == 0) {
1888                 *rebuild = true;
1889             } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) {
1890                 num_fixed = &res->leaks_fixed;
1891             } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) {
1892                 num_fixed = &res->corruptions_fixed;
1893             }
1894
1895             fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64
1896                     " reference=%" PRIu64 "\n",
1897                    num_fixed != NULL     ? "Repairing" :
1898                    refcount1 < refcount2 ? "ERROR" :
1899                                            "Leaked",
1900                    i, refcount1, refcount2);
1901
1902             if (num_fixed) {
1903                 ret = update_refcount(bs, i << s->cluster_bits, 1,
1904                                       refcount_diff(refcount1, refcount2),
1905                                       refcount1 > refcount2,
1906                                       QCOW2_DISCARD_ALWAYS);
1907                 if (ret >= 0) {
1908                     (*num_fixed)++;
1909                     continue;
1910                 }
1911             }
1912
1913             /* And if we couldn't, print an error */
1914             if (refcount1 < refcount2) {
1915                 res->corruptions++;
1916             } else {
1917                 res->leaks++;
1918             }
1919         }
1920     }
1921 }
1922
1923 /*
1924  * Allocates clusters using an in-memory refcount table (IMRT) in contrast to
1925  * the on-disk refcount structures.
1926  *
1927  * On input, *first_free_cluster tells where to start looking, and need not
1928  * actually be a free cluster; the returned offset will not be before that
1929  * cluster.  On output, *first_free_cluster points to the first gap found, even
1930  * if that gap was too small to be used as the returned offset.
1931  *
1932  * Note that *first_free_cluster is a cluster index whereas the return value is
1933  * an offset.
1934  */
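/*
 * Illustrative call (this is how rebuild_refcount_structure() below uses it);
 * the surrounding variables are assumed to exist:
 *
 *     int64_t first_free_cluster = 0;
 *     int64_t offset = alloc_clusters_imrt(bs, 1, refcount_table, nb_clusters,
 *                                          &first_free_cluster);
 *     if (offset < 0) {
 *         // -errno, e.g. the IMRT could not be grown (error handling elided)
 *     }
 *     // the returned offset is always aligned to the cluster size:
 *     assert(offset_into_cluster(s, offset) == 0);
 */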
1935 static int64_t alloc_clusters_imrt(BlockDriverState *bs,
1936                                    int cluster_count,
1937                                    void **refcount_table,
1938                                    int64_t *imrt_nb_clusters,
1939                                    int64_t *first_free_cluster)
1940 {
1941     BDRVQcow2State *s = bs->opaque;
1942     int64_t cluster = *first_free_cluster, i;
1943     bool first_gap = true;
1944     int contiguous_free_clusters;
1945     int ret;
1946
1947     /* Starting at *first_free_cluster, find a range of at least cluster_count
1948      * contiguous free clusters */
1949     for (contiguous_free_clusters = 0;
1950          cluster < *imrt_nb_clusters &&
1951          contiguous_free_clusters < cluster_count;
1952          cluster++)
1953     {
1954         if (!s->get_refcount(*refcount_table, cluster)) {
1955             contiguous_free_clusters++;
1956             if (first_gap) {
1957                 /* If this is the first free cluster found, update
1958                  * *first_free_cluster accordingly */
1959                 *first_free_cluster = cluster;
1960                 first_gap = false;
1961             }
1962         } else if (contiguous_free_clusters) {
1963             contiguous_free_clusters = 0;
1964         }
1965     }
1966
1967     /* If contiguous_free_clusters is greater than zero, it contains the number
1968      * of contiguous free clusters up to the current cluster; the first free
1969      * cluster in the current "gap" is therefore
1970      * cluster - contiguous_free_clusters */
1971
1972     /* If no such range could be found, grow the in-memory refcount table
1973      * accordingly to append free clusters at the end of the image */
1974     if (contiguous_free_clusters < cluster_count) {
1975         /* contiguous_free_clusters clusters are already empty at the image end;
1976          * we need cluster_count clusters; therefore, we have to allocate
1977          * cluster_count - contiguous_free_clusters new clusters at the end of
1978          * the image (which is the current value of cluster; note that cluster
1979          * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond
1980          * the image end) */
1981         ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters,
1982                                      cluster + cluster_count
1983                                      - contiguous_free_clusters);
1984         if (ret < 0) {
1985             return ret;
1986         }
1987     }
1988
1989     /* Go back to the first free cluster */
1990     cluster -= contiguous_free_clusters;
1991     for (i = 0; i < cluster_count; i++) {
1992         s->set_refcount(*refcount_table, cluster + i, 1);
1993     }
1994
1995     return cluster << s->cluster_bits;
1996 }
1997
1998 /*
1999  * Creates a new refcount structure based solely on the in-memory information
2000  * given through *refcount_table. All necessary allocations will be reflected
2001  * in that array.
2002  *
2003  * On success, the old refcount structure is leaked (it will be covered by the
2004  * new refcount structure).
2005  */
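/*
 * Worked example of the cluster-to-refblock mapping used below, assuming
 * (hypothetically) 64 kB clusters (cluster_bits = 16) and 16-bit refcounts:
 * refcount_block_bits is then 16 - (4 - 3) = 15, i.e. 32768 entries per
 * refblock, so for cluster 100000:
 *
 *     refblock_index = cluster >> s->refcount_block_bits;         // == 3
 *     refblock_start = refblock_index << s->refcount_block_bits;  // == 98304
 *
 * meaning that cluster is described by entry 3 of the reftable.
 */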
2006 static int rebuild_refcount_structure(BlockDriverState *bs,
2007                                       BdrvCheckResult *res,
2008                                       void **refcount_table,
2009                                       int64_t *nb_clusters)
2010 {
2011     BDRVQcow2State *s = bs->opaque;
2012     int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0;
2013     int64_t refblock_offset, refblock_start, refblock_index;
2014     uint32_t reftable_size = 0;
2015     uint64_t *on_disk_reftable = NULL;
2016     void *on_disk_refblock;
2017     int ret = 0;
2018     struct {
2019         uint64_t reftable_offset;
2020         uint32_t reftable_clusters;
2021     } QEMU_PACKED reftable_offset_and_clusters;
2022
2023     qcow2_cache_empty(bs, s->refcount_block_cache);
2024
2025 write_refblocks:
2026     for (; cluster < *nb_clusters; cluster++) {
2027         if (!s->get_refcount(*refcount_table, cluster)) {
2028             continue;
2029         }
2030
2031         refblock_index = cluster >> s->refcount_block_bits;
2032         refblock_start = refblock_index << s->refcount_block_bits;
2033
2034         /* Don't allocate a cluster in a refblock already written to disk */
2035         if (first_free_cluster < refblock_start) {
2036             first_free_cluster = refblock_start;
2037         }
2038         refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table,
2039                                               nb_clusters, &first_free_cluster);
2040         if (refblock_offset < 0) {
2041             fprintf(stderr, "ERROR allocating refblock: %s\n",
2042                     strerror(-refblock_offset));
2043             res->check_errors++;
2044             ret = refblock_offset;
2045             goto fail;
2046         }
2047
2048         if (reftable_size <= refblock_index) {
2049             uint32_t old_reftable_size = reftable_size;
2050             uint64_t *new_on_disk_reftable;
2051
2052             reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t),
2053                                      s->cluster_size) / sizeof(uint64_t);
2054             new_on_disk_reftable = g_try_realloc(on_disk_reftable,
2055                                                  reftable_size *
2056                                                  sizeof(uint64_t));
2057             if (!new_on_disk_reftable) {
2058                 res->check_errors++;
2059                 ret = -ENOMEM;
2060                 goto fail;
2061             }
2062             on_disk_reftable = new_on_disk_reftable;
2063
2064             memset(on_disk_reftable + old_reftable_size, 0,
2065                    (reftable_size - old_reftable_size) * sizeof(uint64_t));
2066
2067             /* The offset we have for the reftable is now no longer valid;
2068              * this will leak that range, but we can easily fix that by running
2069              * a leak-fixing check after this rebuild operation */
2070             reftable_offset = -1;
2071         }
2072         on_disk_reftable[refblock_index] = refblock_offset;
2073
2074         /* If this is apparently the last refblock (for now), try to squeeze the
2075          * reftable in */
2076         if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits &&
2077             reftable_offset < 0)
2078         {
2079             uint64_t reftable_clusters = size_to_clusters(s, reftable_size *
2080                                                           sizeof(uint64_t));
2081             reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2082                                                   refcount_table, nb_clusters,
2083                                                   &first_free_cluster);
2084             if (reftable_offset < 0) {
2085                 fprintf(stderr, "ERROR allocating reftable: %s\n",
2086                         strerror(-reftable_offset));
2087                 res->check_errors++;
2088                 ret = reftable_offset;
2089                 goto fail;
2090             }
2091         }
2092
2093         ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset,
2094                                             s->cluster_size);
2095         if (ret < 0) {
2096             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2097             goto fail;
2098         }
2099
2100         /* The size of *refcount_table is always cluster-aligned, therefore the
2101          * write operation will not overflow */
2102         on_disk_refblock = (void *)((char *) *refcount_table +
2103                                     refblock_index * s->cluster_size);
2104
2105         ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE,
2106                          on_disk_refblock, s->cluster_sectors);
2107         if (ret < 0) {
2108             fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret));
2109             goto fail;
2110         }
2111
2112         /* Go to the end of this refblock */
2113         cluster = refblock_start + s->refcount_block_size - 1;
2114     }
2115
2116     if (reftable_offset < 0) {
2117         uint64_t post_refblock_start, reftable_clusters;
2118
2119         post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size);
2120         reftable_clusters = size_to_clusters(s,
2121                                              reftable_size * sizeof(uint64_t));
2122         /* Not pretty but simple */
2123         if (first_free_cluster < post_refblock_start) {
2124             first_free_cluster = post_refblock_start;
2125         }
2126         reftable_offset = alloc_clusters_imrt(bs, reftable_clusters,
2127                                               refcount_table, nb_clusters,
2128                                               &first_free_cluster);
2129         if (reftable_offset < 0) {
2130             fprintf(stderr, "ERROR allocating reftable: %s\n",
2131                     strerror(-reftable_offset));
2132             res->check_errors++;
2133             ret = reftable_offset;
2134             goto fail;
2135         }
2136
2137         goto write_refblocks;
2138     }
2139
2140     assert(on_disk_reftable);
2141
2142     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2143         cpu_to_be64s(&on_disk_reftable[refblock_index]);
2144     }
2145
2146     ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset,
2147                                         reftable_size * sizeof(uint64_t));
2148     if (ret < 0) {
2149         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2150         goto fail;
2151     }
2152
2153     assert(reftable_size < INT_MAX / sizeof(uint64_t));
2154     ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable,
2155                       reftable_size * sizeof(uint64_t));
2156     if (ret < 0) {
2157         fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret));
2158         goto fail;
2159     }
2160
2161     /* Enter new reftable into the image header */
2162     cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset,
2163                  reftable_offset);
2164     cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters,
2165                  size_to_clusters(s, reftable_size * sizeof(uint64_t)));
2166     ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader,
2167                                                   refcount_table_offset),
2168                            &reftable_offset_and_clusters,
2169                            sizeof(reftable_offset_and_clusters));
2170     if (ret < 0) {
2171         fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret));
2172         goto fail;
2173     }
2174
2175     for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) {
2176         be64_to_cpus(&on_disk_reftable[refblock_index]);
2177     }
2178     s->refcount_table = on_disk_reftable;
2179     s->refcount_table_offset = reftable_offset;
2180     s->refcount_table_size = reftable_size;
2181
2182     return 0;
2183
2184 fail:
2185     g_free(on_disk_reftable);
2186     return ret;
2187 }
2188
2189 /*
2190  * Checks an image for refcount consistency.
2191  *
2192  * Returns 0 if the check could be completed (any errors found are recorded in
2193  * *res), and -errno when an internal error occurred.
2194  */
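/*
 * A hedged sketch of a typical invocation (as done from the driver's
 * bdrv_check callback); the exact fix flags are up to the caller:
 *
 *     BdrvCheckResult result = {0};
 *     ret = qcow2_check_refcounts(bs, &result, BDRV_FIX_LEAKS);
 *     if (ret >= 0) {
 *         // result.corruptions / result.leaks describe remaining problems,
 *         // result.corruptions_fixed / result.leaks_fixed what was repaired
 *     }
 */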
2195 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
2196                           BdrvCheckMode fix)
2197 {
2198     BDRVQcow2State *s = bs->opaque;
2199     BdrvCheckResult pre_compare_res;
2200     int64_t size, highest_cluster, nb_clusters;
2201     void *refcount_table = NULL;
2202     bool rebuild = false;
2203     int ret;
2204
2205     size = bdrv_getlength(bs->file->bs);
2206     if (size < 0) {
2207         res->check_errors++;
2208         return size;
2209     }
2210
2211     nb_clusters = size_to_clusters(s, size);
2212     if (nb_clusters > INT_MAX) {
2213         res->check_errors++;
2214         return -EFBIG;
2215     }
2216
2217     res->bfi.total_clusters =
2218         size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE);
2219
2220     ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table,
2221                               &nb_clusters);
2222     if (ret < 0) {
2223         goto fail;
2224     }
2225
2226     /* In case we don't need to rebuild the refcount structure (but want to fix
2227      * something), compare_refcounts() is called again further below, in which
2228      * case the result of this first pass should be ignored */
2229     pre_compare_res = *res;
2230     compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table,
2231                       nb_clusters);
2232
2233     if (rebuild && (fix & BDRV_FIX_ERRORS)) {
2234         BdrvCheckResult old_res = *res;
2235         int fresh_leaks = 0;
2236
2237         fprintf(stderr, "Rebuilding refcount structure\n");
2238         ret = rebuild_refcount_structure(bs, res, &refcount_table,
2239                                          &nb_clusters);
2240         if (ret < 0) {
2241             goto fail;
2242         }
2243
2244         res->corruptions = 0;
2245         res->leaks = 0;
2246
2247         /* Because the old reftable has been exchanged for a new one the
2248          * references have to be recalculated */
2249         rebuild = false;
2250         memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters));
2251         ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table,
2252                                   &nb_clusters);
2253         if (ret < 0) {
2254             goto fail;
2255         }
2256
2257         if (fix & BDRV_FIX_LEAKS) {
2258             /* The old refcount structures are now leaked, fix it; the result
2259              * can be ignored, aside from leaks which were introduced by
2260              * rebuild_refcount_structure() that could not be fixed */
2261             BdrvCheckResult saved_res = *res;
2262             *res = (BdrvCheckResult){ 0 };
2263
2264             compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild,
2265                               &highest_cluster, refcount_table, nb_clusters);
2266             if (rebuild) {
2267                 fprintf(stderr, "ERROR rebuilt refcount structure is still "
2268                         "broken\n");
2269             }
2270
2271             /* Any leaks accounted for here were introduced by
2272              * rebuild_refcount_structure() because that function has created a
2273              * new refcount structure from scratch */
2274             fresh_leaks = res->leaks;
2275             *res = saved_res;
2276         }
2277
2278         if (res->corruptions < old_res.corruptions) {
2279             res->corruptions_fixed += old_res.corruptions - res->corruptions;
2280         }
2281         if (res->leaks < old_res.leaks) {
2282             res->leaks_fixed += old_res.leaks - res->leaks;
2283         }
2284         res->leaks += fresh_leaks;
2285     } else if (fix) {
2286         if (rebuild) {
2287             fprintf(stderr, "ERROR need to rebuild refcount structures\n");
2288             res->check_errors++;
2289             ret = -EIO;
2290             goto fail;
2291         }
2292
2293         if (res->leaks || res->corruptions) {
2294             *res = pre_compare_res;
2295             compare_refcounts(bs, res, fix, &rebuild, &highest_cluster,
2296                               refcount_table, nb_clusters);
2297         }
2298     }
2299
2300     /* check OFLAG_COPIED */
2301     ret = check_oflag_copied(bs, res, fix);
2302     if (ret < 0) {
2303         goto fail;
2304     }
2305
2306     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
2307     ret = 0;
2308
2309 fail:
2310     g_free(refcount_table);
2311
2312     return ret;
2313 }
2314
2315 #define overlaps_with(ofs, sz) \
2316     ranges_overlap(offset, size, ofs, sz)
2317
2318 /*
2319  * Checks if the given offset into the image file is actually free to use by
2320  * looking for overlaps with important metadata sections (L1/L2 tables etc.),
2321  * i.e. a sanity check without relying on the refcount tables.
2322  *
2323  * The ign parameter specifies which checks not to perform (a bitmask of
2324  * QCow2MetadataOverlap values), i.e., what sections to ignore.
2325  *
2326  * Returns:
2327  * - 0 if writing to this offset will not affect the mentioned metadata
2328  * - a positive QCow2MetadataOverlap value indicating one overlapping section
2329  * - a negative value (-errno) indicating an error while performing a check,
2330  *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
2331  */
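/*
 * Example of decoding a positive return value (this is the pattern used by
 * qcow2_pre_write_overlap_check() below): the result has a single bit set,
 * so ctz32() recovers the QCow2MetadataOverlap bit number:
 *
 *     int ret = qcow2_check_metadata_overlap(bs, 0, offset, size);
 *     if (ret > 0) {
 *         int bitnr = ctz32(ret);
 *         // e.g. bitnr == QCOW2_OL_ACTIVE_L1_BITNR => the range overlaps
 *         // the active L1 table
 *     }
 */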
2332 int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset,
2333                                  int64_t size)
2334 {
2335     BDRVQcow2State *s = bs->opaque;
2336     int chk = s->overlap_check & ~ign;
2337     int i, j;
2338
2339     if (!size) {
2340         return 0;
2341     }
2342
2343     if (chk & QCOW2_OL_MAIN_HEADER) {
2344         if (offset < s->cluster_size) {
2345             return QCOW2_OL_MAIN_HEADER;
2346         }
2347     }
2348
2349     /* align range to test to cluster boundaries */
2350     size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
2351     offset = start_of_cluster(s, offset);
2352
2353     if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
2354         if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
2355             return QCOW2_OL_ACTIVE_L1;
2356         }
2357     }
2358
2359     if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
2360         if (overlaps_with(s->refcount_table_offset,
2361             s->refcount_table_size * sizeof(uint64_t))) {
2362             return QCOW2_OL_REFCOUNT_TABLE;
2363         }
2364     }
2365
2366     if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
2367         if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
2368             return QCOW2_OL_SNAPSHOT_TABLE;
2369         }
2370     }
2371
2372     if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
2373         for (i = 0; i < s->nb_snapshots; i++) {
2374             if (s->snapshots[i].l1_size &&
2375                 overlaps_with(s->snapshots[i].l1_table_offset,
2376                 s->snapshots[i].l1_size * sizeof(uint64_t))) {
2377                 return QCOW2_OL_INACTIVE_L1;
2378             }
2379         }
2380     }
2381
2382     if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
2383         for (i = 0; i < s->l1_size; i++) {
2384             if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
2385                 overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
2386                 s->cluster_size)) {
2387                 return QCOW2_OL_ACTIVE_L2;
2388             }
2389         }
2390     }
2391
2392     if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
2393         for (i = 0; i < s->refcount_table_size; i++) {
2394             if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
2395                 overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
2396                 s->cluster_size)) {
2397                 return QCOW2_OL_REFCOUNT_BLOCK;
2398             }
2399         }
2400     }
2401
2402     if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
2403         for (i = 0; i < s->nb_snapshots; i++) {
2404             uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
2405             uint32_t l1_sz  = s->snapshots[i].l1_size;
2406             uint64_t l1_sz2 = l1_sz * sizeof(uint64_t);
2407             uint64_t *l1 = g_try_malloc(l1_sz2);
2408             int ret;
2409
2410             if (l1_sz2 && l1 == NULL) {
2411                 return -ENOMEM;
2412             }
2413
2414             ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2);
2415             if (ret < 0) {
2416                 g_free(l1);
2417                 return ret;
2418             }
2419
2420             for (j = 0; j < l1_sz; j++) {
2421                 uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK;
2422                 if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) {
2423                     g_free(l1);
2424                     return QCOW2_OL_INACTIVE_L2;
2425                 }
2426             }
2427
2428             g_free(l1);
2429         }
2430     }
2431
2432     return 0;
2433 }
2434
2435 static const char *metadata_ol_names[] = {
2436     [QCOW2_OL_MAIN_HEADER_BITNR]    = "qcow2_header",
2437     [QCOW2_OL_ACTIVE_L1_BITNR]      = "active L1 table",
2438     [QCOW2_OL_ACTIVE_L2_BITNR]      = "active L2 table",
2439     [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table",
2440     [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block",
2441     [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table",
2442     [QCOW2_OL_INACTIVE_L1_BITNR]    = "inactive L1 table",
2443     [QCOW2_OL_INACTIVE_L2_BITNR]    = "inactive L2 table",
2444 };
2445
2446 /*
2447  * First performs a check for metadata overlaps (through
2448  * qcow2_check_metadata_overlap); if that fails with a negative value (error
2449  * while performing a check), that value is returned. If an impending overlap
2450  * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
2451  * and -EIO returned.
2452  *
2453  * Returns 0 if there were neither overlaps nor errors while checking for
2454  * overlaps; or a negative value (-errno) on error.
2455  */
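/*
 * Typical usage pattern (as in flush_refblock() further below): run the check
 * immediately before a metadata write and abort the write on failure:
 *
 *     ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
 *     if (ret < 0) {
 *         return ret;   // on -EIO the image has already been marked corrupt
 *     }
 *     ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size);
 */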
2456 int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset,
2457                                   int64_t size)
2458 {
2459     int ret = qcow2_check_metadata_overlap(bs, ign, offset, size);
2460
2461     if (ret < 0) {
2462         return ret;
2463     } else if (ret > 0) {
2464         int metadata_ol_bitnr = ctz32(ret);
2465         assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR);
2466
2467         qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid "
2468                                 "write on metadata (overlaps with %s)",
2469                                 metadata_ol_names[metadata_ol_bitnr]);
2470         return -EIO;
2471     }
2472
2473     return 0;
2474 }
2475
2476 /* A pointer to a function of this type is given to walk_over_reftable(). That
2477  * function will create refblocks and pass them to a RefblockFinishOp once they
2478  * are completed (@refblock). @refblock_empty is set if the refblock is
2479  * completely empty.
2480  *
2481  * Along with the refblock, a corresponding reftable entry is passed, in the
2482  * reftable @reftable (which may be reallocated) at @reftable_index.
2483  *
2484  * @allocated should be set to true if a new cluster has been allocated.
2485  */
2486 typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable,
2487                                uint64_t reftable_index, uint64_t *reftable_size,
2488                                void *refblock, bool refblock_empty,
2489                                bool *allocated, Error **errp);
2490
2491 /**
2492  * This "operation" for walk_over_reftable() allocates the refblock on disk (if
2493  * it is not empty) and inserts its offset into the new reftable. The size of
2494  * this new reftable is increased as required.
2495  */
2496 static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable,
2497                           uint64_t reftable_index, uint64_t *reftable_size,
2498                           void *refblock, bool refblock_empty, bool *allocated,
2499                           Error **errp)
2500 {
2501     BDRVQcow2State *s = bs->opaque;
2502     int64_t offset;
2503
2504     if (!refblock_empty && reftable_index >= *reftable_size) {
2505         uint64_t *new_reftable;
2506         uint64_t new_reftable_size;
2507
2508         new_reftable_size = ROUND_UP(reftable_index + 1,
2509                                      s->cluster_size / sizeof(uint64_t));
2510         if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) {
2511             error_setg(errp,
2512                        "This operation would make the refcount table grow "
2513                        "beyond the maximum size supported by QEMU, aborting");
2514             return -ENOTSUP;
2515         }
2516
2517         new_reftable = g_try_realloc(*reftable, new_reftable_size *
2518                                                 sizeof(uint64_t));
2519         if (!new_reftable) {
2520             error_setg(errp, "Failed to increase reftable buffer size");
2521             return -ENOMEM;
2522         }
2523
2524         memset(new_reftable + *reftable_size, 0,
2525                (new_reftable_size - *reftable_size) * sizeof(uint64_t));
2526
2527         *reftable      = new_reftable;
2528         *reftable_size = new_reftable_size;
2529     }
2530
2531     if (!refblock_empty && !(*reftable)[reftable_index]) {
2532         offset = qcow2_alloc_clusters(bs, s->cluster_size);
2533         if (offset < 0) {
2534             error_setg_errno(errp, -offset, "Failed to allocate refblock");
2535             return offset;
2536         }
2537         (*reftable)[reftable_index] = offset;
2538         *allocated = true;
2539     }
2540
2541     return 0;
2542 }
2543
2544 /**
2545  * This "operation" for walk_over_reftable() writes the refblock to disk at the
2546  * offset specified by the new reftable's entry. It does not modify the new
2547  * reftable or change any refcounts.
2548  */
2549 static int flush_refblock(BlockDriverState *bs, uint64_t **reftable,
2550                           uint64_t reftable_index, uint64_t *reftable_size,
2551                           void *refblock, bool refblock_empty, bool *allocated,
2552                           Error **errp)
2553 {
2554     BDRVQcow2State *s = bs->opaque;
2555     int64_t offset;
2556     int ret;
2557
2558     if (reftable_index < *reftable_size && (*reftable)[reftable_index]) {
2559         offset = (*reftable)[reftable_index];
2560
2561         ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size);
2562         if (ret < 0) {
2563             error_setg_errno(errp, -ret, "Overlap check failed");
2564             return ret;
2565         }
2566
2567         ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size);
2568         if (ret < 0) {
2569             error_setg_errno(errp, -ret, "Failed to write refblock");
2570             return ret;
2571         }
2572     } else {
2573         assert(refblock_empty);
2574     }
2575
2576     return 0;
2577 }
2578
2579 /**
2580  * This function walks over the existing reftable and every referenced refblock;
2581  * if @new_set_refcount is non-NULL, it is called for every refcount entry to
2582  * create an equal new entry in the passed @new_refblock. Once that
2583  * @new_refblock is completely filled, @operation will be called.
2584  *
2585  * @status_cb and @cb_opaque are used for the amend operation's status callback.
2586  * @index is the index of this walk_over_reftable() call and @total is the total
2587  * number of walk_over_reftable() calls per amend operation. Both are used for
2588  * calculating the parameters for the status callback.
2589  *
2590  * @allocated is set to true if a new cluster has been allocated.
2591  */
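/*
 * Illustrative two-pass usage (the pattern followed by
 * qcow2_change_refcount_order() below): the first pass, with alloc_refblock
 * and no refblock buffer, only reserves clusters and is repeated until no
 * new allocation occurs; the final pass, with flush_refblock and a
 * Qcow2SetRefcountFunc, writes the converted refblocks:
 *
 *     ret = walk_over_reftable(bs, &new_reftable, &index, &size,
 *                              NULL, new_refblock_size, new_refcount_bits,
 *                              &alloc_refblock, &allocated, NULL,
 *                              status_cb, cb_opaque, 0, 3, errp);
 *     ...
 *     ret = walk_over_reftable(bs, &new_reftable, &index, &size,
 *                              new_refblock, new_refblock_size,
 *                              new_refcount_bits, &flush_refblock, &allocated,
 *                              new_set_refcount, status_cb, cb_opaque,
 *                              2, 3, errp);
 */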
2592 static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable,
2593                               uint64_t *new_reftable_index,
2594                               uint64_t *new_reftable_size,
2595                               void *new_refblock, int new_refblock_size,
2596                               int new_refcount_bits,
2597                               RefblockFinishOp *operation, bool *allocated,
2598                               Qcow2SetRefcountFunc *new_set_refcount,
2599                               BlockDriverAmendStatusCB *status_cb,
2600                               void *cb_opaque, int index, int total,
2601                               Error **errp)
2602 {
2603     BDRVQcow2State *s = bs->opaque;
2604     uint64_t reftable_index;
2605     bool new_refblock_empty = true;
2606     int refblock_index;
2607     int new_refblock_index = 0;
2608     int ret;
2609
2610     for (reftable_index = 0; reftable_index < s->refcount_table_size;
2611          reftable_index++)
2612     {
2613         uint64_t refblock_offset = s->refcount_table[reftable_index]
2614                                  & REFT_OFFSET_MASK;
2615
2616         status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index,
2617                   (uint64_t)total * s->refcount_table_size, cb_opaque);
2618
2619         if (refblock_offset) {
2620             void *refblock;
2621
2622             if (offset_into_cluster(s, refblock_offset)) {
2623                 qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#"
2624                                         PRIx64 " unaligned (reftable index: %#"
2625                                         PRIx64 ")", refblock_offset,
2626                                         reftable_index);
2627                 error_setg(errp,
2628                            "Image is corrupt (unaligned refblock offset)");
2629                 return -EIO;
2630             }
2631
2632             ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset,
2633                                   &refblock);
2634             if (ret < 0) {
2635                 error_setg_errno(errp, -ret, "Failed to retrieve refblock");
2636                 return ret;
2637             }
2638
2639             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2640                  refblock_index++)
2641             {
2642                 uint64_t refcount;
2643
2644                 if (new_refblock_index >= new_refblock_size) {
2645                     /* new_refblock is now complete */
2646                     ret = operation(bs, new_reftable, *new_reftable_index,
2647                                     new_reftable_size, new_refblock,
2648                                     new_refblock_empty, allocated, errp);
2649                     if (ret < 0) {
2650                         qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2651                         return ret;
2652                     }
2653
2654                     (*new_reftable_index)++;
2655                     new_refblock_index = 0;
2656                     new_refblock_empty = true;
2657                 }
2658
2659                 refcount = s->get_refcount(refblock, refblock_index);
2660                 if (new_refcount_bits < 64 && refcount >> new_refcount_bits) {
2661                     uint64_t offset;
2662
2663                     qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2664
2665                     offset = ((reftable_index << s->refcount_block_bits)
2666                               + refblock_index) << s->cluster_bits;
2667
2668                     error_setg(errp, "Cannot decrease refcount entry width to "
2669                                "%i bits: Cluster at offset %#" PRIx64 " has a "
2670                                "refcount of %" PRIu64, new_refcount_bits,
2671                                offset, refcount);
2672                     return -EINVAL;
2673                 }
2674
2675                 if (new_set_refcount) {
2676                     new_set_refcount(new_refblock, new_refblock_index++,
2677                                      refcount);
2678                 } else {
2679                     new_refblock_index++;
2680                 }
2681                 new_refblock_empty = new_refblock_empty && refcount == 0;
2682             }
2683
2684             qcow2_cache_put(bs, s->refcount_block_cache, &refblock);
2685         } else {
2686             /* No refblock means every refcount is 0 */
2687             for (refblock_index = 0; refblock_index < s->refcount_block_size;
2688                  refblock_index++)
2689             {
2690                 if (new_refblock_index >= new_refblock_size) {
2691                     /* new_refblock is now complete */
2692                     ret = operation(bs, new_reftable, *new_reftable_index,
2693                                     new_reftable_size, new_refblock,
2694                                     new_refblock_empty, allocated, errp);
2695                     if (ret < 0) {
2696                         return ret;
2697                     }
2698
2699                     (*new_reftable_index)++;
2700                     new_refblock_index = 0;
2701                     new_refblock_empty = true;
2702                 }
2703
2704                 if (new_set_refcount) {
2705                     new_set_refcount(new_refblock, new_refblock_index++, 0);
2706                 } else {
2707                     new_refblock_index++;
2708                 }
2709             }
2710         }
2711     }
2712
2713     if (new_refblock_index > 0) {
2714         /* Complete the potentially existing partially filled final refblock */
2715         if (new_set_refcount) {
2716             for (; new_refblock_index < new_refblock_size;
2717                  new_refblock_index++)
2718             {
2719                 new_set_refcount(new_refblock, new_refblock_index, 0);
2720             }
2721         }
2722
2723         ret = operation(bs, new_reftable, *new_reftable_index,
2724                         new_reftable_size, new_refblock, new_refblock_empty,
2725                         allocated, errp);
2726         if (ret < 0) {
2727             return ret;
2728         }
2729
2730         (*new_reftable_index)++;
2731     }
2732
2733     status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size,
2734               (uint64_t)total * s->refcount_table_size, cb_opaque);
2735
2736     return 0;
2737 }
2738
2739 int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order,
2740                                 BlockDriverAmendStatusCB *status_cb,
2741                                 void *cb_opaque, Error **errp)
2742 {
2743     BDRVQcow2State *s = bs->opaque;
2744     Qcow2GetRefcountFunc *new_get_refcount;
2745     Qcow2SetRefcountFunc *new_set_refcount;
2746     void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size);
2747     uint64_t *new_reftable = NULL, new_reftable_size = 0;
2748     uint64_t *old_reftable, old_reftable_size, old_reftable_offset;
2749     uint64_t new_reftable_index = 0;
2750     uint64_t i;
2751     int64_t new_reftable_offset = 0, allocated_reftable_size = 0;
2752     int new_refblock_size, new_refcount_bits = 1 << refcount_order;
2753     int old_refcount_order;
2754     int walk_index = 0;
2755     int ret;
2756     bool new_allocation;
2757
2758     assert(s->qcow_version >= 3);
2759     assert(refcount_order >= 0 && refcount_order <= 6);
2760
2761     /* see qcow2_open() */
2762     new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3));
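    /* Worked example with hypothetical values: for 64 kB clusters
     * (cluster_bits = 16) and a target refcount_order of 4 (16-bit entries),
     * new_refblock_size = 1 << (16 - (4 - 3)) = 32768 refcount entries per
     * refblock. */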
2763
2764     new_get_refcount = get_refcount_funcs[refcount_order];
2765     new_set_refcount = set_refcount_funcs[refcount_order];
2766
2767
2768     do {
2769         int total_walks;
2770
2771         new_allocation = false;
2772
2773         /* We have to do this walk plus the final one which writes the
2774          * refblocks; in addition, this loop normally has to run at least
2775          * twice: once to do the allocations, and a second time to verify
2776          * that everything has been allocated correctly. That makes three
2777          * walks in total */
2778         total_walks = MAX(walk_index + 2, 3);
2779
2780         /* First, allocate the structures so they are present in the refcount
2781          * structures */
2782         ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2783                                  &new_reftable_size, NULL, new_refblock_size,
2784                                  new_refcount_bits, &alloc_refblock,
2785                                  &new_allocation, NULL, status_cb, cb_opaque,
2786                                  walk_index++, total_walks, errp);
2787         if (ret < 0) {
2788             goto done;
2789         }
2790
2791         new_reftable_index = 0;
2792
2793         if (new_allocation) {
2794             if (new_reftable_offset) {
2795                 qcow2_free_clusters(bs, new_reftable_offset,
2796                                     allocated_reftable_size * sizeof(uint64_t),
2797                                     QCOW2_DISCARD_NEVER);
2798             }
2799
2800             new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size *
2801                                                            sizeof(uint64_t));
2802             if (new_reftable_offset < 0) {
2803                 error_setg_errno(errp, -new_reftable_offset,
2804                                  "Failed to allocate the new reftable");
2805                 ret = new_reftable_offset;
2806                 goto done;
2807             }
2808             allocated_reftable_size = new_reftable_size;
2809         }
2810     } while (new_allocation);
2811
2812     /* Second, write the new refblocks */
2813     ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index,
2814                              &new_reftable_size, new_refblock,
2815                              new_refblock_size, new_refcount_bits,
2816                              &flush_refblock, &new_allocation, new_set_refcount,
2817                              status_cb, cb_opaque, walk_index, walk_index + 1,
2818                              errp);
2819     if (ret < 0) {
2820         goto done;
2821     }
2822     assert(!new_allocation);
2823
2824
2825     /* Write the new reftable */
2826     ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset,
2827                                         new_reftable_size * sizeof(uint64_t));
2828     if (ret < 0) {
2829         error_setg_errno(errp, -ret, "Overlap check failed");
2830         goto done;
2831     }
2832
2833     for (i = 0; i < new_reftable_size; i++) {
2834         cpu_to_be64s(&new_reftable[i]);
2835     }
2836
2837     ret = bdrv_pwrite(bs->file->bs, new_reftable_offset, new_reftable,
2838                       new_reftable_size * sizeof(uint64_t));
2839
2840     for (i = 0; i < new_reftable_size; i++) {
2841         be64_to_cpus(&new_reftable[i]);
2842     }
2843
2844     if (ret < 0) {
2845         error_setg_errno(errp, -ret, "Failed to write the new reftable");
2846         goto done;
2847     }
2848
2849
2850     /* Empty the refcount cache */
2851     ret = qcow2_cache_flush(bs, s->refcount_block_cache);
2852     if (ret < 0) {
2853         error_setg_errno(errp, -ret, "Failed to flush the refblock cache");
2854         goto done;
2855     }
2856
2857     /* Update the image header to point to the new reftable; this only updates
2858      * the fields which are relevant to qcow2_update_header(); other fields
2859      * such as s->refcount_table or s->refcount_bits stay stale for now
2860      * (because we have to restore everything if qcow2_update_header() fails) */
2861     old_refcount_order  = s->refcount_order;
2862     old_reftable_size   = s->refcount_table_size;
2863     old_reftable_offset = s->refcount_table_offset;
2864
2865     s->refcount_order        = refcount_order;
2866     s->refcount_table_size   = new_reftable_size;
2867     s->refcount_table_offset = new_reftable_offset;
2868
2869     ret = qcow2_update_header(bs);
2870     if (ret < 0) {
2871         s->refcount_order        = old_refcount_order;
2872         s->refcount_table_size   = old_reftable_size;
2873         s->refcount_table_offset = old_reftable_offset;
2874         error_setg_errno(errp, -ret, "Failed to update the qcow2 header");
2875         goto done;
2876     }
2877
2878     /* Now update the rest of the in-memory information */
2879     old_reftable = s->refcount_table;
2880     s->refcount_table = new_reftable;
2881
2882     s->refcount_bits = 1 << refcount_order;
2883     s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
2884     s->refcount_max += s->refcount_max - 1;
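    /* The two-step computation above yields 2^refcount_bits - 1 while never
     * shifting by the full type width, which would be undefined behavior for
     * refcount_bits == 64. E.g. for 16-bit refcounts:
     *
     *     refcount_max  = 1 << 15;        // 32768
     *     refcount_max += 32768 - 1;      // 65535 == UINT16_MAX
     */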
2885
2886     s->refcount_block_bits = s->cluster_bits - (refcount_order - 3);
2887     s->refcount_block_size = 1 << s->refcount_block_bits;
2888
2889     s->get_refcount = new_get_refcount;
2890     s->set_refcount = new_set_refcount;
2891
2892     /* For cleaning up all old refblocks and the old reftable below the "done"
2893      * label */
2894     new_reftable        = old_reftable;
2895     new_reftable_size   = old_reftable_size;
2896     new_reftable_offset = old_reftable_offset;
2897
2898 done:
2899     if (new_reftable) {
2900         /* On success, new_reftable actually points to the old reftable (and
2901          * new_reftable_size is the old reftable's size); but that is just
2902          * fine */
2903         for (i = 0; i < new_reftable_size; i++) {
2904             uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK;
2905             if (offset) {
2906                 qcow2_free_clusters(bs, offset, s->cluster_size,
2907                                     QCOW2_DISCARD_OTHER);
2908             }
2909         }
2910         g_free(new_reftable);
2911
2912         if (new_reftable_offset > 0) {
2913             qcow2_free_clusters(bs, new_reftable_offset,
2914                                 new_reftable_size * sizeof(uint64_t),
2915                                 QCOW2_DISCARD_OTHER);
2916         }
2917     }
2918
2919     qemu_vfree(new_refblock);
2920     return ret;
2921 }