X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=qemu%2Fblock%2Fio.c;fp=qemu%2Fblock%2Fio.c;h=a7dbf85b19a3ec804608839cc60f42598d4004cc;hb=437fd90c0250dee670290f9b714253671a990160;hp=d4bc83b33b0b7083584ba553efd3ac5501fafd43;hpb=5bbd6fe9b8bab2a93e548c5a53b032d1939eec05;p=kvmfornfv.git diff --git a/qemu/block/io.c b/qemu/block/io.c index d4bc83b33..a7dbf85b1 100644 --- a/qemu/block/io.c +++ b/qemu/block/io.c @@ -22,10 +22,14 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" #include "trace.h" +#include "sysemu/block-backend.h" #include "block/blockjob.h" #include "block/block_int.h" #include "block/throttle-groups.h" +#include "qemu/cutils.h" +#include "qapi/error.h" #include "qemu/error-report.h" #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ @@ -42,12 +46,6 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov); -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, @@ -156,38 +154,45 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) /* Take some limits from the children as a default */ if (bs->file) { - bdrv_refresh_limits(bs->file, &local_err); + bdrv_refresh_limits(bs->file->bs, &local_err); if (local_err) { error_propagate(errp, local_err); return; } - bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; - bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; - bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment; - bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length; + bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length; + bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment; + bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment; + bs->bl.max_iov = bs->file->bs->bl.max_iov; } else { bs->bl.min_mem_alignment = 512; bs->bl.opt_mem_alignment = getpagesize(); + + /* Safe default since most protocols use readv()/writev()/etc */ + bs->bl.max_iov = IOV_MAX; } - if (bs->backing_hd) { - bdrv_refresh_limits(bs->backing_hd, &local_err); + if (bs->backing) { + bdrv_refresh_limits(bs->backing->bs, &local_err); if (local_err) { error_propagate(errp, local_err); return; } bs->bl.opt_transfer_length = MAX(bs->bl.opt_transfer_length, - bs->backing_hd->bl.opt_transfer_length); + bs->backing->bs->bl.opt_transfer_length); bs->bl.max_transfer_length = MIN_NON_ZERO(bs->bl.max_transfer_length, - bs->backing_hd->bl.max_transfer_length); + bs->backing->bs->bl.max_transfer_length); bs->bl.opt_mem_alignment = MAX(bs->bl.opt_mem_alignment, - bs->backing_hd->bl.opt_mem_alignment); + bs->backing->bs->bl.opt_mem_alignment); bs->bl.min_mem_alignment = MAX(bs->bl.min_mem_alignment, - bs->backing_hd->bl.min_mem_alignment); + bs->backing->bs->bl.min_mem_alignment); + bs->bl.max_iov = + MIN(bs->bl.max_iov, + bs->backing->bs->bl.max_iov); } /* Then let the driver override it */ @@ -213,8 +218,10 @@ void bdrv_disable_copy_on_read(BlockDriverState *bs) } /* Check if any requests are in-flight (including throttled requests) */ -static bool bdrv_requests_pending(BlockDriverState *bs) +bool bdrv_requests_pending(BlockDriverState *bs) { + BdrvChild *child; + if (!QLIST_EMPTY(&bs->tracked_requests)) { return true; } @@ -224,17 +231,72 @@ static bool bdrv_requests_pending(BlockDriverState *bs) if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { return true; } - if (bs->file && bdrv_requests_pending(bs->file)) { - return true; - } - if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { - return true; + + QLIST_FOREACH(child, &bs->children, next) { + if (bdrv_requests_pending(child->bs)) { + return true; + } } + return false; } +static void bdrv_drain_recurse(BlockDriverState *bs) +{ + BdrvChild *child; + + if (bs->drv && bs->drv->bdrv_drain) { + bs->drv->bdrv_drain(bs); + } + QLIST_FOREACH(child, &bs->children, next) { + bdrv_drain_recurse(child->bs); + } +} + +typedef struct { + Coroutine *co; + BlockDriverState *bs; + QEMUBH *bh; + bool done; +} BdrvCoDrainData; + +static void bdrv_co_drain_bh_cb(void *opaque) +{ + BdrvCoDrainData *data = opaque; + Coroutine *co = data->co; + + qemu_bh_delete(data->bh); + bdrv_drain(data->bs); + data->done = true; + qemu_coroutine_enter(co, NULL); +} + +void coroutine_fn bdrv_co_drain(BlockDriverState *bs) +{ + BdrvCoDrainData data; + + /* Calling bdrv_drain() from a BH ensures the current coroutine yields and + * other coroutines run if they were queued from + * qemu_co_queue_run_restart(). */ + + assert(qemu_in_coroutine()); + data = (BdrvCoDrainData) { + .co = qemu_coroutine_self(), + .bs = bs, + .done = false, + .bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data), + }; + qemu_bh_schedule(data.bh); + + qemu_coroutine_yield(); + /* If we are resumed from some other event (such as an aio completion or a + * timer callback), it is a bug in the caller that should be fixed. */ + assert(data.done); +} + /* - * Wait for pending requests to complete on a single BlockDriverState subtree + * Wait for pending requests to complete on a single BlockDriverState subtree, + * and suspend block driver's internal I/O until next request arrives. * * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState * AioContext. @@ -247,6 +309,11 @@ void bdrv_drain(BlockDriverState *bs) { bool busy = true; + bdrv_drain_recurse(bs); + if (qemu_in_coroutine()) { + bdrv_co_drain(bs); + return; + } while (busy) { /* Keep iterating */ bdrv_flush_io_queue(bs); @@ -275,6 +342,7 @@ void bdrv_drain_all(void) if (bs->job) { block_job_pause(bs->job); } + bdrv_drain_recurse(bs); aio_context_release(aio_context); if (!g_slist_find(aio_ctxs, aio_context)) { @@ -344,13 +412,14 @@ static void tracked_request_end(BdrvTrackedRequest *req) static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, int64_t offset, - unsigned int bytes, bool is_write) + unsigned int bytes, + enum BdrvTrackedRequestType type) { *req = (BdrvTrackedRequest){ .bs = bs, .offset = offset, .bytes = bytes, - .is_write = is_write, + .type = type, .co = qemu_coroutine_self(), .serialising = false, .overlap_offset = offset, @@ -593,20 +662,6 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num, return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); } -/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ -int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - bool enabled; - int ret; - - enabled = bs->io_limits_enabled; - bs->io_limits_enabled = false; - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - bs->io_limits_enabled = enabled; - return ret; -} - /* Return < 0 if error. Important errors are: -EIO generic I/O error (may happen for all errors) -ENOMEDIUM No media inserted. @@ -637,6 +692,7 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) { int64_t target_sectors, ret, nb_sectors, sector_num = 0; + BlockDriverState *file; int n; target_sectors = bdrv_nb_sectors(bs); @@ -649,7 +705,7 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) if (nb_sectors <= 0) { return 0; } - ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file); if (ret < 0) { error_report("error getting block status at sector %" PRId64 ": %s", sector_num, strerror(-ret)); @@ -736,9 +792,9 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return ret; } - /* No flush needed for cache modes that already do it */ - if (bs->enable_write_cache) { - bdrv_flush(bs); + ret = bdrv_flush(bs); + if (ret < 0) { + return ret; } return 0; @@ -833,6 +889,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); assert(!qiov || bytes == qiov->size); + assert((bs->open_flags & BDRV_O_NO_IO) == 0); /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { @@ -844,7 +901,9 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, mark_request_serialising(req, bdrv_get_cluster_size(bs)); } - wait_serialising_requests(req); + if (!(flags & BDRV_REQ_NO_SERIALISING)) { + wait_serialising_requests(req); + } if (flags & BDRV_REQ_COPY_ON_READ) { int pnum; @@ -908,7 +967,7 @@ out: /* * Handle a read request in coroutine context */ -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, +int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -932,7 +991,8 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, return ret; } - if (bs->copy_on_read) { + /* Don't do copy-on-read if we read data before write operation */ + if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { flags |= BDRV_REQ_COPY_ON_READ; } @@ -966,7 +1026,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, bytes = ROUND_UP(bytes, align); } - tracked_request_begin(&req, bs, offset, bytes, false); + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, use_local_qiov ? &local_qiov : qiov, flags); @@ -1001,6 +1061,15 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); } +int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_NO_SERIALISING); +} + int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { @@ -1107,6 +1176,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); assert(!qiov || bytes == qiov->size); + assert((bs->open_flags & BDRV_O_NO_IO) == 0); waited = wait_serialising_requests(req); assert(!waited || !req->serialising); @@ -1127,21 +1197,30 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, if (ret < 0) { /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else if (drv->bdrv_co_writev_flags) { + bdrv_debug_event(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, + flags); } else { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV); + assert(drv->supported_write_flags == 0); + bdrv_debug_event(bs, BLKDBG_PWRITEV); ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - if (ret == 0 && !bs->enable_write_cache) { + if (ret == 0 && (flags & BDRV_REQ_FUA) && + !(drv->supported_write_flags & BDRV_REQ_FUA)) + { ret = bdrv_co_flush(bs); } bdrv_set_dirty(bs, sector_num, nb_sectors); - block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); + if (bs->wr_highest_offset < offset + bytes) { + bs->wr_highest_offset = offset + bytes; + } if (ret >= 0) { bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); @@ -1182,13 +1261,13 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, /* RMW the unaligned part before head. */ mark_request_serialising(req, align); wait_serialising_requests(req); - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, align, &local_qiov, 0); if (ret < 0) { goto fail; } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); memset(buf + head_padding_bytes, 0, zero_bytes); ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, @@ -1220,13 +1299,13 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, /* RMW the unaligned part after tail. */ mark_request_serialising(req, align); wait_serialising_requests(req); - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); ret = bdrv_aligned_preadv(bs, req, offset, align, align, &local_qiov, 0); if (ret < 0) { goto fail; } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); memset(buf, 0, bytes); ret = bdrv_aligned_pwritev(bs, req, offset, align, @@ -1241,7 +1320,7 @@ fail: /* * Handle a write request in coroutine context */ -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, +int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1260,6 +1339,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, if (bs->read_only) { return -EPERM; } + assert(!(bs->open_flags & BDRV_O_INACTIVE)); ret = bdrv_check_byte_request(bs, offset, bytes); if (ret < 0) { @@ -1276,7 +1356,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, * Pad qiov with the read parts and be sure to have a tracked request not * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. */ - tracked_request_begin(&req, bs, offset, bytes, true); + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); if (!qiov) { ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); @@ -1297,13 +1377,13 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, }; qemu_iovec_init_external(&head_qiov, &head_iov, 1); - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, align, &head_qiov, 0); if (ret < 0) { goto fail; } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); qemu_iovec_init(&local_qiov, qiov->niov + 2); qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); @@ -1331,13 +1411,13 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, }; qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, align, &tail_qiov, 0); if (ret < 0) { goto fail; } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); if (!use_local_qiov) { qemu_iovec_init(&local_qiov, qiov->niov + 1); @@ -1401,29 +1481,10 @@ int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, BDRV_REQ_ZERO_WRITE | flags); } -int bdrv_flush_all(void) -{ - BlockDriverState *bs = NULL; - int result = 0; - - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - int ret; - - aio_context_acquire(aio_context); - ret = bdrv_flush(bs); - if (ret < 0 && !result) { - result = ret; - } - aio_context_release(aio_context); - } - - return result; -} - typedef struct BdrvCoGetBlockStatusData { BlockDriverState *bs; BlockDriverState *base; + BlockDriverState **file; int64_t sector_num; int nb_sectors; int *pnum; @@ -1445,10 +1506,14 @@ typedef struct BdrvCoGetBlockStatusData { * * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes * beyond the end of the disk image it will be clamped. + * + * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file' + * points to the BDS which the sector range is allocated in. */ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) + int nb_sectors, int *pnum, + BlockDriverState **file) { int64_t total_sectors; int64_t n; @@ -1478,7 +1543,9 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, return ret; } - ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + *file = NULL; + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum, + file); if (ret < 0) { *pnum = 0; return ret; @@ -1486,8 +1553,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, if (ret & BDRV_BLOCK_RAW) { assert(ret & BDRV_BLOCK_OFFSET_VALID); - return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, pnum); + return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, + *pnum, pnum, file); } if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { @@ -1495,8 +1562,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, } else { if (bdrv_unallocated_blocks_are_zero(bs)) { ret |= BDRV_BLOCK_ZERO; - } else if (bs->backing_hd) { - BlockDriverState *bs2 = bs->backing_hd; + } else if (bs->backing) { + BlockDriverState *bs2 = bs->backing->bs; int64_t nb_sectors2 = bdrv_nb_sectors(bs2); if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { ret |= BDRV_BLOCK_ZERO; @@ -1504,13 +1571,14 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, } } - if (bs->file && + if (*file && *file != bs && (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && (ret & BDRV_BLOCK_OFFSET_VALID)) { + BlockDriverState *file2; int file_pnum; - ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, &file_pnum); + ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS, + *pnum, &file_pnum, &file2); if (ret2 >= 0) { /* Ignore errors. This is just providing extra information, it * is useful but not necessary. @@ -1535,14 +1603,15 @@ static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, BlockDriverState *base, int64_t sector_num, int nb_sectors, - int *pnum) + int *pnum, + BlockDriverState **file) { BlockDriverState *p; int64_t ret = 0; assert(bs != base); - for (p = bs; p != base; p = p->backing_hd) { - ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); + for (p = bs; p != base; p = backing_bs(p)) { + ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file); if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { break; } @@ -1561,7 +1630,8 @@ static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) data->ret = bdrv_co_get_block_status_above(data->bs, data->base, data->sector_num, data->nb_sectors, - data->pnum); + data->pnum, + data->file); data->done = true; } @@ -1573,12 +1643,14 @@ static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) int64_t bdrv_get_block_status_above(BlockDriverState *bs, BlockDriverState *base, int64_t sector_num, - int nb_sectors, int *pnum) + int nb_sectors, int *pnum, + BlockDriverState **file) { Coroutine *co; BdrvCoGetBlockStatusData data = { .bs = bs, .base = base, + .file = file, .sector_num = sector_num, .nb_sectors = nb_sectors, .pnum = pnum, @@ -1602,16 +1674,19 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs, int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) + int nb_sectors, int *pnum, + BlockDriverState **file) { - return bdrv_get_block_status_above(bs, bs->backing_hd, - sector_num, nb_sectors, pnum); + return bdrv_get_block_status_above(bs, backing_bs(bs), + sector_num, nb_sectors, pnum, file); } int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { - int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + BlockDriverState *file; + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum, + &file); if (ret < 0) { return ret; } @@ -1662,7 +1737,7 @@ int bdrv_is_allocated_above(BlockDriverState *top, n = pnum_inter; } - intermediate = intermediate->backing_hd; + intermediate = backing_bs(intermediate); } *pnum = n; @@ -1713,7 +1788,7 @@ int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) } else if (drv->bdrv_save_vmstate) { return drv->bdrv_save_vmstate(bs, qiov, pos); } else if (bs->file) { - return bdrv_writev_vmstate(bs->file, qiov, pos); + return bdrv_writev_vmstate(bs->file->bs, qiov, pos); } return -ENOTSUP; @@ -1728,7 +1803,7 @@ int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, if (drv->bdrv_load_vmstate) return drv->bdrv_load_vmstate(bs, buf, pos, size); if (bs->file) - return bdrv_load_vmstate(bs->file, buf, pos, size); + return bdrv_load_vmstate(bs->file->bs, buf, pos, size); return -ENOTSUP; } @@ -1849,7 +1924,8 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, merge = 1; } - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > + bs->bl.max_iov) { merge = 0; } @@ -1893,7 +1969,10 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, } } - block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); + if (bs->blk) { + block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE, + num_reqs - outidx - 1); + } return outidx + 1; } @@ -2208,7 +2287,7 @@ void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, { BlockAIOCB *acb; - acb = g_slice_alloc(aiocb_info->aiocb_size); + acb = g_malloc(aiocb_info->aiocb_size); acb->aiocb_info = aiocb_info; acb->bs = bs; acb->cb = cb; @@ -2228,7 +2307,7 @@ void qemu_aio_unref(void *p) BlockAIOCB *acb = p; assert(acb->refcnt > 0); if (--acb->refcnt == 0) { - g_slice_free1(acb->aiocb_info->aiocb_size, acb); + g_free(acb); } } @@ -2298,18 +2377,27 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque) int coroutine_fn bdrv_co_flush(BlockDriverState *bs) { int ret; + BdrvTrackedRequest req; if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || bdrv_is_sg(bs)) { return 0; } + tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); + + /* Write back all layers by calling one driver function */ + if (bs->drv->bdrv_co_flush) { + ret = bs->drv->bdrv_co_flush(bs); + goto out; + } + /* Write back cached data to the OS even with cache=unsafe */ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); if (bs->drv->bdrv_co_flush_to_os) { ret = bs->drv->bdrv_co_flush_to_os(bs); if (ret < 0) { - return ret; + goto out; } } @@ -2349,14 +2437,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) ret = 0; } if (ret < 0) { - return ret; + goto out; } /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH * in the case of cache=unsafe, so there are no useless flushes. */ flush_parent: - return bdrv_co_flush(bs->file); + ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; +out: + tracked_request_end(&req); + return ret; } int bdrv_flush(BlockDriverState *bs) @@ -2399,6 +2490,7 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque) int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { + BdrvTrackedRequest req; int max_discard, ret; if (!bs->drv) { @@ -2411,6 +2503,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, } else if (bs->read_only) { return -EPERM; } + assert(!(bs->open_flags & BDRV_O_INACTIVE)); /* Do nothing if disabled. */ if (!(bs->open_flags & BDRV_O_UNMAP)) { @@ -2421,6 +2514,8 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return 0; } + tracked_request_begin(&req, bs, sector_num, nb_sectors, + BDRV_TRACKED_DISCARD); bdrv_set_dirty(bs, sector_num, nb_sectors); max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); @@ -2454,20 +2549,24 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, bdrv_co_io_em_complete, &co); if (acb == NULL) { - return -EIO; + ret = -EIO; + goto out; } else { qemu_coroutine_yield(); ret = co.ret; } } if (ret && ret != -ENOTSUP) { - return ret; + goto out; } sector_num += num; nb_sectors -= num; } - return 0; + ret = 0; +out: + tracked_request_end(&req); + return ret; } int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) @@ -2496,26 +2595,110 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) return rwco.ret; } -/* needed for generic scsi interface */ +typedef struct { + CoroutineIOCompletion *co; + QEMUBH *bh; +} BdrvIoctlCompletionData; -int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +static void bdrv_ioctl_bh_cb(void *opaque) +{ + BdrvIoctlCompletionData *data = opaque; + + bdrv_co_io_em_complete(data->co, -ENOTSUP); + qemu_bh_delete(data->bh); +} + +static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest tracked_req; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; - if (drv && drv->bdrv_ioctl) - return drv->bdrv_ioctl(bs, req, buf); - return -ENOTSUP; + tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL); + if (!drv || !drv->bdrv_aio_ioctl) { + co.ret = -ENOTSUP; + goto out; + } + + acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); + if (!acb) { + BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); + data->bh = aio_bh_new(bdrv_get_aio_context(bs), + bdrv_ioctl_bh_cb, data); + data->co = &co; + qemu_bh_schedule(data->bh); + } + qemu_coroutine_yield(); +out: + tracked_request_end(&tracked_req); + return co.ret; +} + +typedef struct { + BlockDriverState *bs; + int req; + void *buf; + int ret; +} BdrvIoctlCoData; + +static void coroutine_fn bdrv_co_ioctl_entry(void *opaque) +{ + BdrvIoctlCoData *data = opaque; + data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf); +} + +/* needed for generic scsi interface */ +int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BdrvIoctlCoData data = { + .bs = bs, + .req = req, + .buf = buf, + .ret = -EINPROGRESS, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_co_ioctl_entry(&data); + } else { + Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); + + qemu_coroutine_enter(co, &data); + while (data.ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(bs), true); + } + } + return data.ret; +} + +static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + acb->req.error = bdrv_co_do_ioctl(acb->common.bs, + acb->req.req, acb->req.buf); + bdrv_co_complete(acb); } BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque) { - BlockDriver *drv = bs->drv; + BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info, + bs, cb, opaque); + Coroutine *co; + + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.req = req; + acb->req.buf = buf; + co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); + qemu_coroutine_enter(co, acb); - if (drv && drv->bdrv_aio_ioctl) - return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); - return NULL; + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; } void *qemu_blockalign(BlockDriverState *bs, size_t size) @@ -2584,7 +2767,7 @@ void bdrv_io_plug(BlockDriverState *bs) if (drv && drv->bdrv_io_plug) { drv->bdrv_io_plug(bs); } else if (bs->file) { - bdrv_io_plug(bs->file); + bdrv_io_plug(bs->file->bs); } } @@ -2594,7 +2777,7 @@ void bdrv_io_unplug(BlockDriverState *bs) if (drv && drv->bdrv_io_unplug) { drv->bdrv_io_unplug(bs); } else if (bs->file) { - bdrv_io_unplug(bs->file); + bdrv_io_unplug(bs->file->bs); } } @@ -2604,7 +2787,24 @@ void bdrv_flush_io_queue(BlockDriverState *bs) if (drv && drv->bdrv_flush_io_queue) { drv->bdrv_flush_io_queue(bs); } else if (bs->file) { - bdrv_flush_io_queue(bs->file); + bdrv_flush_io_queue(bs->file->bs); } bdrv_start_throttled_reqs(bs); } + +void bdrv_drained_begin(BlockDriverState *bs) +{ + if (!bs->quiesce_counter++) { + aio_disable_external(bdrv_get_aio_context(bs)); + } + bdrv_drain(bs); +} + +void bdrv_drained_end(BlockDriverState *bs) +{ + assert(bs->quiesce_counter > 0); + if (--bs->quiesce_counter > 0) { + return; + } + aio_enable_external(bdrv_get_aio_context(bs)); +}