X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?p=kvmfornfv.git;a=blobdiff_plain;f=qemu%2Fhw%2Fvirtio%2Fvirtio.c;h=30ede3d1cc7bebc8e9eeea006b2af9395498a425;hp=788b556a74c1d4cefba4bff50e6430e88b922e01;hb=437fd90c0250dee670290f9b714253671a990160;hpb=5bbd6fe9b8bab2a93e548c5a53b032d1939eec05 diff --git a/qemu/hw/virtio/virtio.c b/qemu/hw/virtio/virtio.c index 788b556a7..30ede3d1c 100644 --- a/qemu/hw/virtio/virtio.c +++ b/qemu/hw/virtio/virtio.c @@ -11,8 +11,10 @@ * */ -#include - +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu-common.h" +#include "cpu.h" #include "trace.h" #include "exec/address-spaces.h" #include "qemu/error-report.h" @@ -60,6 +62,7 @@ typedef struct VRingUsed typedef struct VRing { unsigned int num; + unsigned int num_default; unsigned int align; hwaddr desc; hwaddr avail; @@ -69,7 +72,15 @@ typedef struct VRing struct VirtQueue { VRing vring; + + /* Next head to pop */ uint16_t last_avail_idx; + + /* Last avail_idx read from VQ. */ + uint16_t shadow_avail_idx; + + uint16_t used_idx; + /* Last used index value we have signalled on */ uint16_t signalled_used; @@ -85,6 +96,7 @@ struct VirtQueue uint16_t vector; void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq); + void (*handle_aio_output)(VirtIODevice *vdev, VirtQueue *vq); VirtIODevice *vdev; EventNotifier guest_notifier; EventNotifier host_notifier; @@ -106,35 +118,15 @@ void virtio_queue_update_rings(VirtIODevice *vdev, int n) vring->align); } -static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa, - int i) -{ - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr); - return virtio_ldq_phys(vdev, pa); -} - -static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa, int i) +static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc, + hwaddr desc_pa, int i) { - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len); - return virtio_ldl_phys(vdev, pa); -} - -static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa, - int i) -{ - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags); - return virtio_lduw_phys(vdev, pa); -} - -static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa, - int i) -{ - hwaddr pa; - pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next); - return virtio_lduw_phys(vdev, pa); + address_space_read(&address_space_memory, desc_pa + i * sizeof(VRingDesc), + MEMTXATTRS_UNSPECIFIED, (void *)desc, sizeof(VRingDesc)); + virtio_tswap64s(vdev, &desc->addr); + virtio_tswap32s(vdev, &desc->len); + virtio_tswap16s(vdev, &desc->flags); + virtio_tswap16s(vdev, &desc->next); } static inline uint16_t vring_avail_flags(VirtQueue *vq) @@ -148,7 +140,8 @@ static inline uint16_t vring_avail_idx(VirtQueue *vq) { hwaddr pa; pa = vq->vring.avail + offsetof(VRingAvail, idx); - return virtio_lduw_phys(vq->vdev, pa); + vq->shadow_avail_idx = virtio_lduw_phys(vq->vdev, pa); + return vq->shadow_avail_idx; } static inline uint16_t vring_avail_ring(VirtQueue *vq, int i) @@ -163,18 +156,15 @@ static inline uint16_t vring_get_used_event(VirtQueue *vq) return vring_avail_ring(vq, vq->vring.num); } -static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val) -{ - hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, ring[i].id); - virtio_stl_phys(vq->vdev, pa, val); -} - -static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val) +static inline void vring_used_write(VirtQueue *vq, VRingUsedElem *uelem, + int i) { hwaddr pa; - pa = vq->vring.used + offsetof(VRingUsed, ring[i].len); - virtio_stl_phys(vq->vdev, pa, val); + virtio_tswap32s(vq->vdev, &uelem->id); + virtio_tswap32s(vq->vdev, &uelem->len); + pa = vq->vring.used + offsetof(VRingUsed, ring[i]); + address_space_write(&address_space_memory, pa, MEMTXATTRS_UNSPECIFIED, + (void *)uelem, sizeof(VRingUsedElem)); } static uint16_t vring_used_idx(VirtQueue *vq) @@ -189,6 +179,7 @@ static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val) hwaddr pa; pa = vq->vring.used + offsetof(VRingUsed, idx); virtio_stw_phys(vq->vdev, pa, val); + vq->used_idx = val; } static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask) @@ -220,7 +211,7 @@ static inline void vring_set_avail_event(VirtQueue *vq, uint16_t val) void virtio_queue_set_notification(VirtQueue *vq, int enable) { vq->notification = enable; - if (virtio_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) { + if (virtio_vdev_has_feature(vq->vdev, VIRTIO_RING_F_EVENT_IDX)) { vring_set_avail_event(vq, vring_avail_idx(vq)); } else if (enable) { vring_used_flags_unset_bit(vq, VRING_USED_F_NO_NOTIFY); @@ -238,19 +229,23 @@ int virtio_queue_ready(VirtQueue *vq) return vq->vring.avail != 0; } +/* Fetch avail_idx from VQ memory only when we really need to know if + * guest has added some buffers. */ int virtio_queue_empty(VirtQueue *vq) { + if (vq->shadow_avail_idx != vq->last_avail_idx) { + return 0; + } + return vring_avail_idx(vq) == vq->last_avail_idx; } -void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, - unsigned int len, unsigned int idx) +static void virtqueue_unmap_sg(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) { unsigned int offset; int i; - trace_virtqueue_fill(vq, elem, len, idx); - offset = 0; for (i = 0; i < elem->in_num; i++) { size_t size = MIN(len - offset, elem->in_sg[i].iov_len); @@ -266,12 +261,29 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, cpu_physical_memory_unmap(elem->out_sg[i].iov_base, elem->out_sg[i].iov_len, 0, elem->out_sg[i].iov_len); +} - idx = (idx + vring_used_idx(vq)) % vq->vring.num; +void virtqueue_discard(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len) +{ + vq->last_avail_idx--; + virtqueue_unmap_sg(vq, elem, len); +} - /* Get a pointer to the next entry in the used ring. */ - vring_used_ring_id(vq, idx, elem->index); - vring_used_ring_len(vq, idx, len); +void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, + unsigned int len, unsigned int idx) +{ + VRingUsedElem uelem; + + trace_virtqueue_fill(vq, elem, len, idx); + + virtqueue_unmap_sg(vq, elem, len); + + idx = (idx + vq->used_idx) % vq->vring.num; + + uelem.id = elem->index; + uelem.len = len; + vring_used_write(vq, &uelem, idx); } void virtqueue_flush(VirtQueue *vq, unsigned int count) @@ -280,7 +292,7 @@ void virtqueue_flush(VirtQueue *vq, unsigned int count) /* Make sure buffer is written before we update index. */ smp_wmb(); trace_virtqueue_flush(vq, count); - old = vring_used_idx(vq); + old = vq->used_idx; new = old + count; vring_used_idx_set(vq, new); vq->inuse -= count; @@ -302,7 +314,7 @@ static int virtqueue_num_heads(VirtQueue *vq, unsigned int idx) /* Check it isn't doing very strange things with descriptor numbers. */ if (num_heads > vq->vring.num) { error_report("Guest moved used index from %u to %u", - idx, vring_avail_idx(vq)); + idx, vq->shadow_avail_idx); exit(1); } /* On success, callers read a descriptor at vq->last_avail_idx. @@ -331,18 +343,18 @@ static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx) return head; } -static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa, - unsigned int i, unsigned int max) +static unsigned virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc, + hwaddr desc_pa, unsigned int max) { unsigned int next; /* If this descriptor says it doesn't chain, we're done. */ - if (!(vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_NEXT)) { + if (!(desc->flags & VRING_DESC_F_NEXT)) { return max; } /* Check they're not leading us off end of descriptors. */ - next = vring_desc_next(vdev, desc_pa, i); + next = desc->next; /* Make sure compiler knows to grab that: we don't want it changing! */ smp_wmb(); @@ -351,6 +363,7 @@ static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa, exit(1); } + vring_desc_read(vdev, desc, desc_pa, next); return next; } @@ -367,6 +380,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, while (virtqueue_num_heads(vq, idx)) { VirtIODevice *vdev = vq->vdev; unsigned int max, num_bufs, indirect = 0; + VRingDesc desc; hwaddr desc_pa; int i; @@ -374,9 +388,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, num_bufs = total_bufs; i = virtqueue_get_head(vq, idx++); desc_pa = vq->vring.desc; + vring_desc_read(vdev, &desc, desc_pa, i); - if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) { - if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) { + if (desc.flags & VRING_DESC_F_INDIRECT) { + if (desc.len % sizeof(VRingDesc)) { error_report("Invalid size for indirect buffer table"); exit(1); } @@ -389,9 +404,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, /* loop over the indirect descriptor table */ indirect = 1; - max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc); - desc_pa = vring_desc_addr(vdev, desc_pa, i); + max = desc.len / sizeof(VRingDesc); + desc_pa = desc.addr; num_bufs = i = 0; + vring_desc_read(vdev, &desc, desc_pa, i); } do { @@ -401,15 +417,15 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes, exit(1); } - if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) { - in_total += vring_desc_len(vdev, desc_pa, i); + if (desc.flags & VRING_DESC_F_WRITE) { + in_total += desc.len; } else { - out_total += vring_desc_len(vdev, desc_pa, i); + out_total += desc.len; } if (in_total >= max_in_bytes && out_total >= max_out_bytes) { goto done; } - } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max); + } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max); if (!indirect) total_bufs = num_bufs; @@ -434,98 +450,256 @@ int virtqueue_avail_bytes(VirtQueue *vq, unsigned int in_bytes, return in_bytes <= in_total && out_bytes <= out_total; } -void virtqueue_map_sg(struct iovec *sg, hwaddr *addr, - size_t num_sg, int is_write) +static void virtqueue_map_desc(unsigned int *p_num_sg, hwaddr *addr, struct iovec *iov, + unsigned int max_num_sg, bool is_write, + hwaddr pa, size_t sz) +{ + unsigned num_sg = *p_num_sg; + assert(num_sg <= max_num_sg); + + while (sz) { + hwaddr len = sz; + + if (num_sg == max_num_sg) { + error_report("virtio: too many write descriptors in indirect table"); + exit(1); + } + + iov[num_sg].iov_base = cpu_physical_memory_map(pa, &len, is_write); + iov[num_sg].iov_len = len; + addr[num_sg] = pa; + + sz -= len; + pa += len; + num_sg++; + } + *p_num_sg = num_sg; +} + +static void virtqueue_map_iovec(struct iovec *sg, hwaddr *addr, + unsigned int *num_sg, unsigned int max_size, + int is_write) { unsigned int i; hwaddr len; - if (num_sg > VIRTQUEUE_MAX_SIZE) { - error_report("virtio: map attempt out of bounds: %zd > %d", - num_sg, VIRTQUEUE_MAX_SIZE); - exit(1); - } + /* Note: this function MUST validate input, some callers + * are passing in num_sg values received over the network. + */ + /* TODO: teach all callers that this can fail, and return failure instead + * of asserting here. + * When we do, we might be able to re-enable NDEBUG below. + */ +#ifdef NDEBUG +#error building with NDEBUG is not supported +#endif + assert(*num_sg <= max_size); - for (i = 0; i < num_sg; i++) { + for (i = 0; i < *num_sg; i++) { len = sg[i].iov_len; sg[i].iov_base = cpu_physical_memory_map(addr[i], &len, is_write); - if (sg[i].iov_base == NULL || len != sg[i].iov_len) { + if (!sg[i].iov_base) { error_report("virtio: error trying to map MMIO memory"); exit(1); } + if (len != sg[i].iov_len) { + error_report("virtio: unexpected memory split"); + exit(1); + } } } -int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem) +void virtqueue_map(VirtQueueElement *elem) +{ + virtqueue_map_iovec(elem->in_sg, elem->in_addr, &elem->in_num, + VIRTQUEUE_MAX_SIZE, 1); + virtqueue_map_iovec(elem->out_sg, elem->out_addr, &elem->out_num, + VIRTQUEUE_MAX_SIZE, 0); +} + +void *virtqueue_alloc_element(size_t sz, unsigned out_num, unsigned in_num) +{ + VirtQueueElement *elem; + size_t in_addr_ofs = QEMU_ALIGN_UP(sz, __alignof__(elem->in_addr[0])); + size_t out_addr_ofs = in_addr_ofs + in_num * sizeof(elem->in_addr[0]); + size_t out_addr_end = out_addr_ofs + out_num * sizeof(elem->out_addr[0]); + size_t in_sg_ofs = QEMU_ALIGN_UP(out_addr_end, __alignof__(elem->in_sg[0])); + size_t out_sg_ofs = in_sg_ofs + in_num * sizeof(elem->in_sg[0]); + size_t out_sg_end = out_sg_ofs + out_num * sizeof(elem->out_sg[0]); + + assert(sz >= sizeof(VirtQueueElement)); + elem = g_malloc(out_sg_end); + elem->out_num = out_num; + elem->in_num = in_num; + elem->in_addr = (void *)elem + in_addr_ofs; + elem->out_addr = (void *)elem + out_addr_ofs; + elem->in_sg = (void *)elem + in_sg_ofs; + elem->out_sg = (void *)elem + out_sg_ofs; + return elem; +} + +void *virtqueue_pop(VirtQueue *vq, size_t sz) { unsigned int i, head, max; hwaddr desc_pa = vq->vring.desc; VirtIODevice *vdev = vq->vdev; + VirtQueueElement *elem; + unsigned out_num, in_num; + hwaddr addr[VIRTQUEUE_MAX_SIZE]; + struct iovec iov[VIRTQUEUE_MAX_SIZE]; + VRingDesc desc; - if (!virtqueue_num_heads(vq, vq->last_avail_idx)) - return 0; + if (virtio_queue_empty(vq)) { + return NULL; + } + /* Needed after virtio_queue_empty(), see comment in + * virtqueue_num_heads(). */ + smp_rmb(); /* When we start there are none of either input nor output. */ - elem->out_num = elem->in_num = 0; + out_num = in_num = 0; max = vq->vring.num; i = head = virtqueue_get_head(vq, vq->last_avail_idx++); - if (virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { + if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { vring_set_avail_event(vq, vq->last_avail_idx); } - if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) { - if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) { + vring_desc_read(vdev, &desc, desc_pa, i); + if (desc.flags & VRING_DESC_F_INDIRECT) { + if (desc.len % sizeof(VRingDesc)) { error_report("Invalid size for indirect buffer table"); exit(1); } /* loop over the indirect descriptor table */ - max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc); - desc_pa = vring_desc_addr(vdev, desc_pa, i); + max = desc.len / sizeof(VRingDesc); + desc_pa = desc.addr; i = 0; + vring_desc_read(vdev, &desc, desc_pa, i); } /* Collect all the descriptors */ do { - struct iovec *sg; - - if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) { - if (elem->in_num >= ARRAY_SIZE(elem->in_sg)) { - error_report("Too many write descriptors in indirect table"); - exit(1); - } - elem->in_addr[elem->in_num] = vring_desc_addr(vdev, desc_pa, i); - sg = &elem->in_sg[elem->in_num++]; + if (desc.flags & VRING_DESC_F_WRITE) { + virtqueue_map_desc(&in_num, addr + out_num, iov + out_num, + VIRTQUEUE_MAX_SIZE - out_num, true, desc.addr, desc.len); } else { - if (elem->out_num >= ARRAY_SIZE(elem->out_sg)) { - error_report("Too many read descriptors in indirect table"); + if (in_num) { + error_report("Incorrect order for descriptors"); exit(1); } - elem->out_addr[elem->out_num] = vring_desc_addr(vdev, desc_pa, i); - sg = &elem->out_sg[elem->out_num++]; + virtqueue_map_desc(&out_num, addr, iov, + VIRTQUEUE_MAX_SIZE, false, desc.addr, desc.len); } - sg->iov_len = vring_desc_len(vdev, desc_pa, i); - /* If we've got too many, that implies a descriptor loop. */ - if ((elem->in_num + elem->out_num) > max) { + if ((in_num + out_num) > max) { error_report("Looped descriptor"); exit(1); } - } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max); - - /* Now map what we have collected */ - virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1); - virtqueue_map_sg(elem->out_sg, elem->out_addr, elem->out_num, 0); + } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max); + /* Now copy what we have collected and mapped */ + elem = virtqueue_alloc_element(sz, out_num, in_num); elem->index = head; + for (i = 0; i < out_num; i++) { + elem->out_addr[i] = addr[i]; + elem->out_sg[i] = iov[i]; + } + for (i = 0; i < in_num; i++) { + elem->in_addr[i] = addr[out_num + i]; + elem->in_sg[i] = iov[out_num + i]; + } vq->inuse++; trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num); - return elem->in_num + elem->out_num; + return elem; +} + +/* Reading and writing a structure directly to QEMUFile is *awful*, but + * it is what QEMU has always done by mistake. We can change it sooner + * or later by bumping the version number of the affected vm states. + * In the meanwhile, since the in-memory layout of VirtQueueElement + * has changed, we need to marshal to and from the layout that was + * used before the change. + */ +typedef struct VirtQueueElementOld { + unsigned int index; + unsigned int out_num; + unsigned int in_num; + hwaddr in_addr[VIRTQUEUE_MAX_SIZE]; + hwaddr out_addr[VIRTQUEUE_MAX_SIZE]; + struct iovec in_sg[VIRTQUEUE_MAX_SIZE]; + struct iovec out_sg[VIRTQUEUE_MAX_SIZE]; +} VirtQueueElementOld; + +void *qemu_get_virtqueue_element(QEMUFile *f, size_t sz) +{ + VirtQueueElement *elem; + VirtQueueElementOld data; + int i; + + qemu_get_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld)); + + elem = virtqueue_alloc_element(sz, data.out_num, data.in_num); + elem->index = data.index; + + for (i = 0; i < elem->in_num; i++) { + elem->in_addr[i] = data.in_addr[i]; + } + + for (i = 0; i < elem->out_num; i++) { + elem->out_addr[i] = data.out_addr[i]; + } + + for (i = 0; i < elem->in_num; i++) { + /* Base is overwritten by virtqueue_map. */ + elem->in_sg[i].iov_base = 0; + elem->in_sg[i].iov_len = data.in_sg[i].iov_len; + } + + for (i = 0; i < elem->out_num; i++) { + /* Base is overwritten by virtqueue_map. */ + elem->out_sg[i].iov_base = 0; + elem->out_sg[i].iov_len = data.out_sg[i].iov_len; + } + + virtqueue_map(elem); + return elem; +} + +void qemu_put_virtqueue_element(QEMUFile *f, VirtQueueElement *elem) +{ + VirtQueueElementOld data; + int i; + + memset(&data, 0, sizeof(data)); + data.index = elem->index; + data.in_num = elem->in_num; + data.out_num = elem->out_num; + + for (i = 0; i < elem->in_num; i++) { + data.in_addr[i] = elem->in_addr[i]; + } + + for (i = 0; i < elem->out_num; i++) { + data.out_addr[i] = elem->out_addr[i]; + } + + for (i = 0; i < elem->in_num; i++) { + /* Base is overwritten by virtqueue_map when loading. Do not + * save it, as it would leak the QEMU address space layout. */ + data.in_sg[i].iov_len = elem->in_sg[i].iov_len; + } + + for (i = 0; i < elem->out_num; i++) { + /* Do not save iov_base as above. */ + data.out_sg[i].iov_len = elem->out_sg[i].iov_len; + } + qemu_put_buffer(f, (uint8_t *)&data, sizeof(VirtQueueElementOld)); } /* virtio device */ @@ -560,7 +734,7 @@ int virtio_set_status(VirtIODevice *vdev, uint8_t val) VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev); trace_virtio_set_status(vdev, val); - if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) { + if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { if (!(vdev->status & VIRTIO_CONFIG_S_FEATURES_OK) && val & VIRTIO_CONFIG_S_FEATURES_OK) { int ret = virtio_validate_features(vdev); @@ -629,10 +803,13 @@ void virtio_reset(void *opaque) vdev->vq[i].vring.avail = 0; vdev->vq[i].vring.used = 0; vdev->vq[i].last_avail_idx = 0; + vdev->vq[i].shadow_avail_idx = 0; + vdev->vq[i].used_idx = 0; virtio_queue_set_vector(vdev, i, VIRTIO_NO_VECTOR); vdev->vq[i].signalled_used = 0; vdev->vq[i].signalled_used_valid = false; vdev->vq[i].notification = true; + vdev->vq[i].vring.num = vdev->vq[i].vring.num_default; } } @@ -898,7 +1075,7 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align) VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); /* virtio-1 compliant devices cannot change the alignment */ - if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) { + if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { error_report("tried to modify queue alignment for virtio-1 device"); return; } @@ -912,7 +1089,17 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align) virtio_queue_update_rings(vdev, n); } -void virtio_queue_notify_vq(VirtQueue *vq) +static void virtio_queue_notify_aio_vq(VirtQueue *vq) +{ + if (vq->vring.desc && vq->handle_aio_output) { + VirtIODevice *vdev = vq->vdev; + + trace_virtio_queue_notify(vdev, vq - vdev->vq, vq); + vq->handle_aio_output(vdev, vq); + } +} + +static void virtio_queue_notify_vq(VirtQueue *vq) { if (vq->vring.desc && vq->handle_output) { VirtIODevice *vdev = vq->vdev; @@ -964,8 +1151,10 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int queue_size, abort(); vdev->vq[i].vring.num = queue_size; + vdev->vq[i].vring.num_default = queue_size; vdev->vq[i].vring.align = VIRTIO_PCI_VRING_ALIGN; vdev->vq[i].handle_output = handle_output; + vdev->vq[i].handle_aio_output = NULL; return &vdev->vq[i]; } @@ -977,6 +1166,7 @@ void virtio_del_queue(VirtIODevice *vdev, int n) } vdev->vq[n].vring.num = 0; + vdev->vq[n].vring.num_default = 0; } void virtio_irq(VirtQueue *vq) @@ -986,32 +1176,32 @@ void virtio_irq(VirtQueue *vq) virtio_notify_vector(vq->vdev, vq->vector); } -static bool vring_notify(VirtIODevice *vdev, VirtQueue *vq) +bool virtio_should_notify(VirtIODevice *vdev, VirtQueue *vq) { uint16_t old, new; bool v; /* We need to expose used array entries before checking used event. */ smp_mb(); /* Always notify when queue is empty (when feature acknowledge) */ - if (virtio_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) && - !vq->inuse && vring_avail_idx(vq) == vq->last_avail_idx) { + if (virtio_vdev_has_feature(vdev, VIRTIO_F_NOTIFY_ON_EMPTY) && + !vq->inuse && virtio_queue_empty(vq)) { return true; } - if (!virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { + if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) { return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); } v = vq->signalled_used_valid; vq->signalled_used_valid = true; old = vq->signalled_used; - new = vq->signalled_used = vring_used_idx(vq); + new = vq->signalled_used = vq->used_idx; return !v || vring_need_event(vring_get_used_event(vq), new, old); } void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) { - if (!vring_notify(vdev, vq)) { + if (!virtio_should_notify(vdev, vq)) { return; } @@ -1035,7 +1225,7 @@ static bool virtio_device_endian_needed(void *opaque) VirtIODevice *vdev = opaque; assert(vdev->device_endian != VIRTIO_DEVICE_ENDIAN_UNKNOWN); - if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) { + if (!virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) { return vdev->device_endian != virtio_default_endian(); } /* Devices conforming to VIRTIO 1.0 or later are always LE. */ @@ -1056,33 +1246,38 @@ static bool virtio_virtqueue_needed(void *opaque) return virtio_host_has_feature(vdev, VIRTIO_F_VERSION_1); } -static void put_virtqueue_state(QEMUFile *f, void *pv, size_t size) +static bool virtio_ringsize_needed(void *opaque) { - VirtIODevice *vdev = pv; + VirtIODevice *vdev = opaque; int i; for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { - qemu_put_be64(f, vdev->vq[i].vring.avail); - qemu_put_be64(f, vdev->vq[i].vring.used); + if (vdev->vq[i].vring.num != vdev->vq[i].vring.num_default) { + return true; + } } + return false; } -static int get_virtqueue_state(QEMUFile *f, void *pv, size_t size) +static bool virtio_extra_state_needed(void *opaque) { - VirtIODevice *vdev = pv; - int i; + VirtIODevice *vdev = opaque; + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); - for (i = 0; i < VIRTIO_QUEUE_MAX; i++) { - vdev->vq[i].vring.avail = qemu_get_be64(f); - vdev->vq[i].vring.used = qemu_get_be64(f); - } - return 0; + return k->has_extra_state && + k->has_extra_state(qbus->parent); } -static VMStateInfo vmstate_info_virtqueue = { +static const VMStateDescription vmstate_virtqueue = { .name = "virtqueue_state", - .get = get_virtqueue_state, - .put = put_virtqueue_state, + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT64(vring.avail, struct VirtQueue), + VMSTATE_UINT64(vring.used, struct VirtQueue), + VMSTATE_END_OF_LIST() + } }; static const VMStateDescription vmstate_virtio_virtqueues = { @@ -1090,13 +1285,75 @@ static const VMStateDescription vmstate_virtio_virtqueues = { .version_id = 1, .minimum_version_id = 1, .needed = &virtio_virtqueue_needed, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice, + VIRTIO_QUEUE_MAX, 0, vmstate_virtqueue, VirtQueue), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_ringsize = { + .name = "ringsize_state", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_UINT32(vring.num_default, struct VirtQueue), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_virtio_ringsize = { + .name = "virtio/ringsize", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_ringsize_needed, + .fields = (VMStateField[]) { + VMSTATE_STRUCT_VARRAY_POINTER_KNOWN(vq, struct VirtIODevice, + VIRTIO_QUEUE_MAX, 0, vmstate_ringsize, VirtQueue), + VMSTATE_END_OF_LIST() + } +}; + +static int get_extra_state(QEMUFile *f, void *pv, size_t size) +{ + VirtIODevice *vdev = pv; + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + if (!k->load_extra_state) { + return -1; + } else { + return k->load_extra_state(qbus->parent, f); + } +} + +static void put_extra_state(QEMUFile *f, void *pv, size_t size) +{ + VirtIODevice *vdev = pv; + BusState *qbus = qdev_get_parent_bus(DEVICE(vdev)); + VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus); + + k->save_extra_state(qbus->parent, f); +} + +static const VMStateInfo vmstate_info_extra_state = { + .name = "virtqueue_extra_state", + .get = get_extra_state, + .put = put_extra_state, +}; + +static const VMStateDescription vmstate_virtio_extra_state = { + .name = "virtio/extra_state", + .version_id = 1, + .minimum_version_id = 1, + .needed = &virtio_extra_state_needed, .fields = (VMStateField[]) { { - .name = "virtqueues", + .name = "extra_state", .version_id = 0, .field_exists = NULL, .size = 0, - .info = &vmstate_info_virtqueue, + .info = &vmstate_info_extra_state, .flags = VMS_SINGLE, .offset = 0, }, @@ -1138,6 +1395,8 @@ static const VMStateDescription vmstate_virtio = { &vmstate_virtio_device_endian, &vmstate_virtio_64bit_features, &vmstate_virtio_virtqueues, + &vmstate_virtio_ringsize, + &vmstate_virtio_extra_state, NULL } }; @@ -1264,7 +1523,7 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) num = qemu_get_be32(f); if (num > VIRTIO_QUEUE_MAX) { - error_report("Invalid number of PCI queues: 0x%x", num); + error_report("Invalid number of virtqueues: 0x%x", num); return -1; } @@ -1348,6 +1607,8 @@ int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id) vdev->vq[i].last_avail_idx, nheads); return -1; } + vdev->vq[i].used_idx = vring_used_idx(&vdev->vq[i]); + vdev->vq[i].shadow_avail_idx = vring_avail_idx(&vdev->vq[i]); } } @@ -1430,6 +1691,7 @@ void virtio_init(VirtIODevice *vdev, const char *name, vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change, vdev); vdev->device_endian = virtio_default_endian(); + vdev->use_guest_notifier_mask = true; } hwaddr virtio_queue_get_desc_addr(VirtIODevice *vdev, int n) @@ -1460,7 +1722,7 @@ hwaddr virtio_queue_get_desc_size(VirtIODevice *vdev, int n) hwaddr virtio_queue_get_avail_size(VirtIODevice *vdev, int n) { return offsetof(VRingAvail, ring) + - sizeof(uint64_t) * vdev->vq[n].vring.num; + sizeof(uint16_t) * vdev->vq[n].vring.num; } hwaddr virtio_queue_get_used_size(VirtIODevice *vdev, int n) @@ -1483,6 +1745,7 @@ uint16_t virtio_queue_get_last_avail_idx(VirtIODevice *vdev, int n) void virtio_queue_set_last_avail_idx(VirtIODevice *vdev, int n, uint16_t idx) { vdev->vq[n].last_avail_idx = idx; + vdev->vq[n].shadow_avail_idx = idx; } void virtio_queue_invalidate_signalled_used(VirtIODevice *vdev, int n) @@ -1512,10 +1775,10 @@ void virtio_queue_set_guest_notifier_fd_handler(VirtQueue *vq, bool assign, bool with_irqfd) { if (assign && !with_irqfd) { - event_notifier_set_handler(&vq->guest_notifier, + event_notifier_set_handler(&vq->guest_notifier, false, virtio_queue_guest_notifier_read); } else { - event_notifier_set_handler(&vq->guest_notifier, NULL); + event_notifier_set_handler(&vq->guest_notifier, false, NULL); } if (!assign) { /* Test and clear notifier before closing it, @@ -1529,6 +1792,31 @@ EventNotifier *virtio_queue_get_guest_notifier(VirtQueue *vq) return &vq->guest_notifier; } +static void virtio_queue_host_notifier_aio_read(EventNotifier *n) +{ + VirtQueue *vq = container_of(n, VirtQueue, host_notifier); + if (event_notifier_test_and_clear(n)) { + virtio_queue_notify_aio_vq(vq); + } +} + +void virtio_queue_aio_set_host_notifier_handler(VirtQueue *vq, AioContext *ctx, + void (*handle_output)(VirtIODevice *, + VirtQueue *)) +{ + if (handle_output) { + vq->handle_aio_output = handle_output; + aio_set_event_notifier(ctx, &vq->host_notifier, true, + virtio_queue_host_notifier_aio_read); + } else { + aio_set_event_notifier(ctx, &vq->host_notifier, true, NULL); + /* Test and clear notifier before after disabling event, + * in case poll callback didn't have time to run. */ + virtio_queue_host_notifier_aio_read(&vq->host_notifier); + vq->handle_aio_output = NULL; + } +} + static void virtio_queue_host_notifier_read(EventNotifier *n) { VirtQueue *vq = container_of(n, VirtQueue, host_notifier); @@ -1541,10 +1829,10 @@ void virtio_queue_set_host_notifier_fd_handler(VirtQueue *vq, bool assign, bool set_handler) { if (assign && set_handler) { - event_notifier_set_handler(&vq->host_notifier, + event_notifier_set_handler(&vq->host_notifier, true, virtio_queue_host_notifier_read); } else { - event_notifier_set_handler(&vq->host_notifier, NULL); + event_notifier_set_handler(&vq->host_notifier, true, NULL); } if (!assign) { /* Test and clear notifier before after disabling event,