/*
 * QEMU backup
 *
 * Copyright (C) 2013 Proxmox Server Solutions
 *
 * Authors:
 *  Dietmar Maurer (dietmar@proxmox.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include <stdio.h>
#include <errno.h>
#include <unistd.h>

#include "trace.h"
#include "block/block.h"
#include "block/block_int.h"
#include "block/blockjob.h"
#include "qapi/qmp/qerror.h"
#include "qemu/ratelimit.h"

#define BACKUP_CLUSTER_BITS 16
#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
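/* A backup cluster is 1 << 16 = 64 KiB, i.e. 128 sectors assuming the
 * usual 512-byte BDRV_SECTOR_SIZE. */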

#define SLICE_TIME 100000000ULL /* ns */

typedef struct CowRequest {
    int64_t start;
    int64_t end;
    QLIST_ENTRY(CowRequest) list;
    CoQueue wait_queue; /* coroutines blocked on this request */
} CowRequest;

typedef struct BackupBlockJob {
    BlockJob common;
    BlockDriverState *target;
    /* bitmap for sync=incremental */
    BdrvDirtyBitmap *sync_bitmap;
    MirrorSyncMode sync_mode;
    RateLimit limit;
    BlockdevOnError on_source_error;
    BlockdevOnError on_target_error;
    CoRwlock flush_rwlock;
    uint64_t sectors_read;
    HBitmap *bitmap;
    QLIST_HEAD(, CowRequest) inflight_reqs;
} BackupBlockJob;

/* See if in-flight requests overlap and wait for them to complete */
static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
                                                       int64_t start,
                                                       int64_t end)
{
    CowRequest *req;
    bool retry;

    do {
        retry = false;
        QLIST_FOREACH(req, &job->inflight_reqs, list) {
            if (end > req->start && start < req->end) {
                qemu_co_queue_wait(&req->wait_queue);
                retry = true;
                break;
            }
        }
    } while (retry);
}

/* Keep track of an in-flight request */
static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
                                     int64_t start, int64_t end)
{
    req->start = start;
    req->end = end;
    qemu_co_queue_init(&req->wait_queue);
    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
}

/* Forget about a completed request */
static void cow_request_end(CowRequest *req)
{
    QLIST_REMOVE(req, list);
    qemu_co_queue_restart_all(&req->wait_queue);
}

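/*
 * Copy-before-write handler: copy the clusters covering [sector_num,
 * sector_num + nb_sectors) from the source to the target through a bounce
 * buffer, unless job->bitmap shows they were copied already.  Clusters that
 * read back as all zeroes are written with bdrv_co_write_zeroes() instead.
 * Returns 0 on success or a negative errno; on failure *error_is_read tells
 * whether the read or the write side failed.
 */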
static int coroutine_fn backup_do_cow(BlockDriverState *bs,
                                      int64_t sector_num, int nb_sectors,
                                      bool *error_is_read)
{
    BackupBlockJob *job = (BackupBlockJob *)bs->job;
    CowRequest cow_request;
    struct iovec iov;
    QEMUIOVector bounce_qiov;
    void *bounce_buffer = NULL;
    int ret = 0;
    int64_t start, end;
    int n;

    qemu_co_rwlock_rdlock(&job->flush_rwlock);

    start = sector_num / BACKUP_SECTORS_PER_CLUSTER;
    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER);

    trace_backup_do_cow_enter(job, start, sector_num, nb_sectors);

    wait_for_overlapping_requests(job, start, end);
    cow_request_begin(&cow_request, job, start, end);

    for (; start < end; start++) {
        if (hbitmap_get(job->bitmap, start)) {
            trace_backup_do_cow_skip(job, start);
            continue; /* already copied */
        }

        trace_backup_do_cow_process(job, start);

        n = MIN(BACKUP_SECTORS_PER_CLUSTER,
                job->common.len / BDRV_SECTOR_SIZE -
                start * BACKUP_SECTORS_PER_CLUSTER);

        if (!bounce_buffer) {
            bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE);
        }
        iov.iov_base = bounce_buffer;
        iov.iov_len = n * BDRV_SECTOR_SIZE;
        qemu_iovec_init_external(&bounce_qiov, &iov, 1);

        ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n,
                            &bounce_qiov);
        if (ret < 0) {
            trace_backup_do_cow_read_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = true;
            }
            goto out;
        }

        if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
            ret = bdrv_co_write_zeroes(job->target,
                                       start * BACKUP_SECTORS_PER_CLUSTER,
                                       n, BDRV_REQ_MAY_UNMAP);
        } else {
            ret = bdrv_co_writev(job->target,
                                 start * BACKUP_SECTORS_PER_CLUSTER, n,
                                 &bounce_qiov);
        }
        if (ret < 0) {
            trace_backup_do_cow_write_fail(job, start, ret);
            if (error_is_read) {
                *error_is_read = false;
            }
            goto out;
        }

        hbitmap_set(job->bitmap, start, 1);

        /* Publish progress, guest I/O counts as progress too.  Note that the
         * offset field is an opaque progress value, it is not a disk offset.
         */
        job->sectors_read += n;
        job->common.offset += n * BDRV_SECTOR_SIZE;
    }

out:
    if (bounce_buffer) {
        qemu_vfree(bounce_buffer);
    }

    cow_request_end(&cow_request);

    trace_backup_do_cow_return(job, sector_num, nb_sectors, ret);

    qemu_co_rwlock_unlock(&job->flush_rwlock);

    return ret;
}

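/*
 * before_write notifier: invoked for every write request to the source
 * device; the affected clusters are copied to the target before the new
 * data lands, so the backup reflects the point in time at which the job
 * was started.
 */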
static int coroutine_fn backup_before_write_notify(
        NotifierWithReturn *notifier,
        void *opaque)
{
    BdrvTrackedRequest *req = opaque;
    int64_t sector_num = req->offset >> BDRV_SECTOR_BITS;
    int nb_sectors = req->bytes >> BDRV_SECTOR_BITS;

    assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0);
    assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

    return backup_do_cow(req->bs, sector_num, nb_sectors, NULL);
}

static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    if (speed < 0) {
        error_setg(errp, QERR_INVALID_PARAMETER, "speed");
        return;
    }
    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
}

static void backup_iostatus_reset(BlockJob *job)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);

    bdrv_iostatus_reset(s->target);
}

static const BlockJobDriver backup_job_driver = {
    .instance_size  = sizeof(BackupBlockJob),
    .job_type       = BLOCK_JOB_TYPE_BACKUP,
    .set_speed      = backup_set_speed,
    .iostatus_reset = backup_iostatus_reset,
};

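/*
 * Translate a failed request into the action configured by the user:
 * on-source-error for read failures, on-target-error for write failures.
 */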
static BlockErrorAction backup_error_action(BackupBlockJob *job,
                                            bool read, int error)
{
    if (read) {
        return block_job_error_action(&job->common, job->common.bs,
                                      job->on_source_error, true, error);
    } else {
        return block_job_error_action(&job->common, job->target,
                                      job->on_target_error, false, error);
    }
}

typedef struct {
    int ret;
} BackupCompleteData;

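/*
 * Completion callback, deferred to the main loop by backup_run(): drop the
 * job's reference to the target and report the final return value.
 */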
static void backup_complete(BlockJob *job, void *opaque)
{
    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
    BackupCompleteData *data = opaque;

    bdrv_unref(s->target);

    block_job_completed(job, data->ret);
    g_free(data);
}

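/*
 * Sleep according to the configured rate limit (or just yield once) and
 * report whether the job has been cancelled in the meantime.
 */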
static bool coroutine_fn yield_and_check(BackupBlockJob *job)
{
    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    /* We need to yield so that bdrv_drain_all() returns; without the yield
     * the VM cannot reboot while the job is running.
     */
    if (job->common.speed) {
        uint64_t delay_ns = ratelimit_calculate_delay(&job->limit,
                                                      job->sectors_read);
        job->sectors_read = 0;
        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns);
    } else {
        block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0);
    }

    if (block_job_is_cancelled(&job->common)) {
        return true;
    }

    return false;
}

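/*
 * sync=incremental: iterate over the user-supplied dirty bitmap and copy
 * only the clusters it marks as dirty.  Progress is advanced artificially
 * for the clean regions that are skipped, so the meter still covers the
 * whole device.
 */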
static int coroutine_fn backup_run_incremental(BackupBlockJob *job)
{
    bool error_is_read;
    int ret = 0;
    int clusters_per_iter;
    uint32_t granularity;
    int64_t sector;
    int64_t cluster;
    int64_t end;
    int64_t last_cluster = -1;
    BlockDriverState *bs = job->common.bs;
    HBitmapIter hbi;

    granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap);
    clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1);
    bdrv_dirty_iter_init(job->sync_bitmap, &hbi);

    /* Find the next dirty sector(s) */
    while ((sector = hbitmap_iter_next(&hbi)) != -1) {
        cluster = sector / BACKUP_SECTORS_PER_CLUSTER;

        /* Fake progress updates for any clusters we skipped */
        if (cluster != last_cluster + 1) {
            job->common.offset += ((cluster - last_cluster - 1) *
                                   BACKUP_CLUSTER_SIZE);
        }

        for (end = cluster + clusters_per_iter; cluster < end; cluster++) {
            do {
                if (yield_and_check(job)) {
                    return ret;
                }
                ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER,
                                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
                if ((ret < 0) &&
                    backup_error_action(job, error_is_read, -ret) ==
                    BLOCK_ERROR_ACTION_REPORT) {
                    return ret;
                }
            } while (ret < 0);
        }

        /* If the bitmap granularity is smaller than the backup granularity,
         * we need to advance the iterator pointer to the next cluster. */
        if (granularity < BACKUP_CLUSTER_SIZE) {
            bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER);
        }

        last_cluster = cluster - 1;
    }

    /* Play some final catchup with the progress meter */
    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);
    if (last_cluster + 1 < end) {
        job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE);
    }

    return ret;
}

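/*
 * Main coroutine of the backup job.  It installs the before-write notifier
 * on the source, then either waits for cancellation (sync=none), walks the
 * dirty bitmap (sync=incremental), or scans every cluster (sync=top/full),
 * and finally waits for all in-flight copy-before-write requests by taking
 * flush_rwlock for writing before handing completion to the main loop.
 */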
static void coroutine_fn backup_run(void *opaque)
{
    BackupBlockJob *job = opaque;
    BackupCompleteData *data;
    BlockDriverState *bs = job->common.bs;
    BlockDriverState *target = job->target;
    BlockdevOnError on_target_error = job->on_target_error;
    NotifierWithReturn before_write = {
        .notify = backup_before_write_notify,
    };
    int64_t start, end;
    int ret = 0;

    QLIST_INIT(&job->inflight_reqs);
    qemu_co_rwlock_init(&job->flush_rwlock);

    start = 0;
    end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE);

    job->bitmap = hbitmap_alloc(end, 0);

    bdrv_set_enable_write_cache(target, true);
    bdrv_set_on_error(target, on_target_error, on_target_error);
    bdrv_iostatus_enable(target);

    bdrv_add_before_write_notifier(bs, &before_write);

    if (job->sync_mode == MIRROR_SYNC_MODE_NONE) {
        while (!block_job_is_cancelled(&job->common)) {
            /* Yield until the job is cancelled.  We just let our before_write
             * notify callback service CoW requests. */
            job->common.busy = false;
            qemu_coroutine_yield();
            job->common.busy = true;
        }
    } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        ret = backup_run_incremental(job);
    } else {
        /* Both FULL and TOP sync modes require copying. */
        for (; start < end; start++) {
            bool error_is_read;
            if (yield_and_check(job)) {
                break;
            }

            if (job->sync_mode == MIRROR_SYNC_MODE_TOP) {
                int i, n;
                int alloced = 0;

                /* Check to see if these blocks are already in the
                 * backing file. */

                for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) {
                    /* bdrv_is_allocated() only returns true/false based
                     * on the first set of sectors it comes across that
                     * are all in the same state.
                     * For that reason we must verify each sector in the
                     * backup cluster length.  We end up copying more than
                     * needed but at some point that is always the case. */
                    alloced =
                        bdrv_is_allocated(bs,
                                start * BACKUP_SECTORS_PER_CLUSTER + i,
                                BACKUP_SECTORS_PER_CLUSTER - i, &n);
                    i += n;

                    if (alloced == 1 || n == 0) {
                        break;
                    }
                }

                /* If the above loop never found any sectors that are in
                 * the topmost image, skip this backup. */
                if (alloced == 0) {
                    continue;
                }
            }
            /* In FULL sync mode we copy the whole drive. */
            ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER,
                    BACKUP_SECTORS_PER_CLUSTER, &error_is_read);
            if (ret < 0) {
                /* Depending on error action, fail now or retry cluster */
                BlockErrorAction action =
                    backup_error_action(job, error_is_read, -ret);
                if (action == BLOCK_ERROR_ACTION_REPORT) {
                    break;
                } else {
                    start--;
                    continue;
                }
            }
        }
    }

    notifier_with_return_remove(&before_write);

    /* wait until pending backup_do_cow() calls have completed */
    qemu_co_rwlock_wrlock(&job->flush_rwlock);
    qemu_co_rwlock_unlock(&job->flush_rwlock);

    if (job->sync_bitmap) {
        BdrvDirtyBitmap *bm;
        if (ret < 0 || block_job_is_cancelled(&job->common)) {
            /* Merge the successor back into the parent, delete nothing. */
            bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
            assert(bm);
        } else {
            /* Everything is fine, delete this bitmap and install the backup. */
            bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
            assert(bm);
        }
    }
    hbitmap_free(job->bitmap);

    bdrv_iostatus_disable(target);
    bdrv_op_unblock_all(target, job->common.blocker);

    data = g_malloc(sizeof(*data));
    data->ret = ret;
    block_job_defer_to_main_loop(&job->common, backup_complete, data);
}

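/*
 * Public entry point for the backup block job: validate the arguments,
 * create the job and start backup_run() in a coroutine.  Errors are
 * reported through *errp; on the error path a frozen incremental bitmap
 * is reclaimed again.
 */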
void backup_start(BlockDriverState *bs, BlockDriverState *target,
                  int64_t speed, MirrorSyncMode sync_mode,
                  BdrvDirtyBitmap *sync_bitmap,
                  BlockdevOnError on_source_error,
                  BlockdevOnError on_target_error,
                  BlockCompletionFunc *cb, void *opaque,
                  Error **errp)
{
    int64_t len;

    assert(bs);
    assert(target);
    assert(cb);

    if (bs == target) {
        error_setg(errp, "Source and target cannot be the same");
        return;
    }

    if ((on_source_error == BLOCKDEV_ON_ERROR_STOP ||
         on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) &&
        !bdrv_iostatus_is_enabled(bs)) {
        error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error");
        return;
    }

    if (!bdrv_is_inserted(bs)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(bs));
        return;
    }

    if (!bdrv_is_inserted(target)) {
        error_setg(errp, "Device is not inserted: %s",
                   bdrv_get_device_name(target));
        return;
    }

    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) {
        return;
    }

    if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) {
        return;
    }

    if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) {
        if (!sync_bitmap) {
            error_setg(errp, "must provide a valid bitmap name for "
                             "\"incremental\" sync mode");
            return;
        }

        /* Create a new bitmap, and freeze/disable this one. */
        if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) {
            return;
        }
    } else if (sync_bitmap) {
        error_setg(errp,
                   "a sync_bitmap was provided to backup_run, "
                   "but received an incompatible sync_mode (%s)",
                   MirrorSyncMode_lookup[sync_mode]);
        return;
    }

    len = bdrv_getlength(bs);
    if (len < 0) {
        error_setg_errno(errp, -len, "unable to get length for '%s'",
                         bdrv_get_device_name(bs));
        goto error;
    }

    BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed,
                                           cb, opaque, errp);
    if (!job) {
        goto error;
    }

    bdrv_op_block_all(target, job->common.blocker);

    job->on_source_error = on_source_error;
    job->on_target_error = on_target_error;
    job->target = target;
    job->sync_mode = sync_mode;
    job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
                       sync_bitmap : NULL;
    job->common.len = len;
    job->common.co = qemu_coroutine_create(backup_run);
    qemu_coroutine_enter(job->common.co, job);
    return;

 error:
    if (sync_bitmap) {
        bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL);
    }
}