These changes are the raw update to qemu-2.6.
[kvmfornfv.git] / qemu / hw / virtio / vhost.c
index 2712c6f..4400718 100644 (file)
  * GNU GPL, version 2 or (at your option) any later version.
  */
 
+#include "qemu/osdep.h"
+#include "qapi/error.h"
 #include "hw/virtio/vhost.h"
 #include "hw/hw.h"
 #include "qemu/atomic.h"
 #include "qemu/range.h"
 #include "qemu/error-report.h"
+#include "qemu/memfd.h"
 #include <linux/vhost.h>
 #include "exec/address-spaces.h"
 #include "hw/virtio/virtio-bus.h"
 #include "migration/migration.h"
 
 static struct vhost_log *vhost_log;
+static struct vhost_log *vhost_log_shm;
+
+static unsigned int used_memslots;
+static QLIST_HEAD(, vhost_dev) vhost_devices =
+    QLIST_HEAD_INITIALIZER(vhost_devices);
+
+bool vhost_has_free_slot(void)
+{
+    unsigned int slots_limit = ~0U;
+    struct vhost_dev *hdev;
+
+    QLIST_FOREACH(hdev, &vhost_devices, entry) {
+        unsigned int r = hdev->vhost_ops->vhost_backend_memslots_limit(hdev);
+        slots_limit = MIN(slots_limit, r);
+    }
+    return slots_limit > used_memslots;
+}
 
 static void vhost_dev_sync_region(struct vhost_dev *dev,
                                   MemoryRegionSection *section,
@@ -241,6 +261,13 @@ static void vhost_dev_assign_memory(struct vhost_dev *dev,
             continue;
         }
 
+        if (dev->vhost_ops->vhost_backend_can_merge &&
+            !dev->vhost_ops->vhost_backend_can_merge(dev, uaddr, size,
+                                                     reg->userspace_addr,
+                                                     reg->memory_size)) {
+            continue;
+        }
+
         if (merged) {
             --to;
             assert(to >= 0);
@@ -286,25 +313,46 @@ static uint64_t vhost_get_log_size(struct vhost_dev *dev)
     }
     return log_size;
 }
-static struct vhost_log *vhost_log_alloc(uint64_t size)
+
+static struct vhost_log *vhost_log_alloc(uint64_t size, bool share)
 {
-    struct vhost_log *log = g_malloc0(sizeof *log + size * sizeof(*(log->log)));
+    struct vhost_log *log;
+    uint64_t logsize = size * sizeof(*(log->log));
+    int fd = -1;
+
+    log = g_new0(struct vhost_log, 1);
+    if (share) {
+        log->log = qemu_memfd_alloc("vhost-log", logsize,
+                                    F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL,
+                                    &fd);
+        memset(log->log, 0, logsize);
+    } else {
+        log->log = g_malloc0(logsize);
+    }
 
     log->size = size;
     log->refcnt = 1;
+    log->fd = fd;
 
     return log;
 }
 
-static struct vhost_log *vhost_log_get(uint64_t size)
+static struct vhost_log *vhost_log_get(uint64_t size, bool share)
 {
-    if (!vhost_log || vhost_log->size != size) {
-        vhost_log = vhost_log_alloc(size);
+    struct vhost_log *log = share ? vhost_log_shm : vhost_log;
+
+    if (!log || log->size != size) {
+        log = vhost_log_alloc(size, share);
+        if (share) {
+            vhost_log_shm = log;
+        } else {
+            vhost_log = log;
+        }
     } else {
-        ++vhost_log->refcnt;
+        ++log->refcnt;
     }
 
-    return vhost_log;
+    return log;
 }
 
 static void vhost_log_put(struct vhost_dev *dev, bool sync)
@@ -321,20 +369,35 @@ static void vhost_log_put(struct vhost_dev *dev, bool sync)
         if (dev->log_size && sync) {
             vhost_log_sync_range(dev, 0, dev->log_size * VHOST_LOG_CHUNK - 1);
         }
+
         if (vhost_log == log) {
+            g_free(log->log);
             vhost_log = NULL;
+        } else if (vhost_log_shm == log) {
+            qemu_memfd_free(log->log, log->size * sizeof(*(log->log)),
+                            log->fd);
+            vhost_log_shm = NULL;
         }
+
         g_free(log);
     }
 }
 
-static inline void vhost_dev_log_resize(struct vhost_dev* dev, uint64_t size)
+static bool vhost_dev_log_is_shared(struct vhost_dev *dev)
 {
-    struct vhost_log *log = vhost_log_get(size);
+    return dev->vhost_ops->vhost_requires_shm_log &&
+           dev->vhost_ops->vhost_requires_shm_log(dev);
+}
+
+static inline void vhost_dev_log_resize(struct vhost_dev *dev, uint64_t size)
+{
+    struct vhost_log *log = vhost_log_get(size, vhost_dev_log_is_shared(dev));
     uint64_t log_base = (uintptr_t)log->log;
     int r;
 
-    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_LOG_BASE, &log_base);
+    /* inform backend of log switching, this must be done before
+       releasing the current log, to ensure no logging is lost */
+    r = dev->vhost_ops->vhost_set_log_base(dev, log_base, log);
     assert(r >= 0);
     vhost_log_put(dev, true);
     dev->log = log;
@@ -457,6 +520,7 @@ static void vhost_set_memory(MemoryListener *listener,
     dev->mem_changed_start_addr = MIN(dev->mem_changed_start_addr, start_addr);
     dev->mem_changed_end_addr = MAX(dev->mem_changed_end_addr, start_addr + size - 1);
     dev->memory_changed = true;
+    used_memslots = dev->mem->nregions;
 }
 
 static bool vhost_section(MemoryRegionSection *section)
@@ -500,7 +564,7 @@ static void vhost_commit(MemoryListener *listener)
     }
 
     if (!dev->log_enabled) {
-        r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
+        r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
         assert(r >= 0);
         dev->memory_changed = false;
         return;
@@ -513,7 +577,7 @@ static void vhost_commit(MemoryListener *listener)
     if (dev->log_size < log_size) {
         vhost_dev_log_resize(dev, log_size + VHOST_LOG_BUFFER);
     }
-    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_MEM_TABLE, dev->mem);
+    r = dev->vhost_ops->vhost_set_mem_table(dev, dev->mem);
     assert(r >= 0);
     /* To log less, can only decrease log size after table update. */
     if (dev->log_size > log_size + VHOST_LOG_BUFFER) {
@@ -581,7 +645,7 @@ static int vhost_virtqueue_set_addr(struct vhost_dev *dev,
         .log_guest_addr = vq->used_phys,
         .flags = enable_log ? (1 << VHOST_VRING_F_LOG) : 0,
     };
-    int r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ADDR, &addr);
+    int r = dev->vhost_ops->vhost_set_vring_addr(dev, &addr);
     if (r < 0) {
         return -errno;
     }
@@ -595,19 +659,20 @@ static int vhost_dev_set_features(struct vhost_dev *dev, bool enable_log)
     if (enable_log) {
         features |= 0x1ULL << VHOST_F_LOG_ALL;
     }
-    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_FEATURES, &features);
+    r = dev->vhost_ops->vhost_set_features(dev, features);
     return r < 0 ? -errno : 0;
 }
 
 static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
 {
-    int r, t, i;
+    int r, t, i, idx;
     r = vhost_dev_set_features(dev, enable_log);
     if (r < 0) {
         goto err_features;
     }
     for (i = 0; i < dev->nvqs; ++i) {
-        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
+        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
+        r = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                      enable_log);
         if (r < 0) {
             goto err_vq;
@@ -616,7 +681,8 @@ static int vhost_dev_set_log(struct vhost_dev *dev, bool enable_log)
     return 0;
 err_vq:
     for (; i >= 0; --i) {
-        t = vhost_virtqueue_set_addr(dev, dev->vqs + i, i,
+        idx = dev->vhost_ops->vhost_get_vq_index(dev, dev->vq_index + i);
+        t = vhost_virtqueue_set_addr(dev, dev->vqs + i, idx,
                                      dev->log_enabled);
         assert(t >= 0);
     }
@@ -691,6 +757,27 @@ static void vhost_log_stop(MemoryListener *listener,
     /* FIXME: implement */
 }
 
+/* The vhost driver natively knows how to handle the vrings of non
+ * cross-endian legacy devices and modern devices. Only legacy devices
+ * exposed to a bi-endian guest may require the vhost driver to use a
+ * specific endianness.
+ */
+static inline bool vhost_needs_vring_endian(VirtIODevice *vdev)
+{
+    if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
+        return false;
+    }
+#ifdef TARGET_IS_BIENDIAN
+#ifdef HOST_WORDS_BIGENDIAN
+    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_LITTLE;
+#else
+    return vdev->device_endian == VIRTIO_DEVICE_ENDIAN_BIG;
+#endif
+#else
+    return false;
+#endif
+}
+
 static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
                                                    bool is_big_endian,
                                                    int vhost_vq_index)
@@ -700,7 +787,7 @@ static int vhost_virtqueue_set_vring_endian_legacy(struct vhost_dev *dev,
         .num = is_big_endian
     };
 
-    if (!dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_ENDIAN, &s)) {
+    if (!dev->vhost_ops->vhost_set_vring_endian(dev, &s)) {
         return 0;
     }
 
@@ -719,7 +806,7 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
 {
     hwaddr s, l, a;
     int r;
-    int vhost_vq_index = idx - dev->vq_index;
+    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
     struct vhost_vring_file file = {
         .index = vhost_vq_index
     };
@@ -728,22 +815,20 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
     };
     struct VirtQueue *vvq = virtio_get_queue(vdev, idx);
 
-    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
 
     vq->num = state.num = virtio_queue_get_num(vdev, idx);
-    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_NUM, &state);
+    r = dev->vhost_ops->vhost_set_vring_num(dev, &state);
     if (r) {
         return -errno;
     }
 
     state.num = virtio_queue_get_last_avail_idx(vdev, idx);
-    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_BASE, &state);
+    r = dev->vhost_ops->vhost_set_vring_base(dev, &state);
     if (r) {
         return -errno;
     }
 
-    if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) &&
-        virtio_legacy_is_cross_endian(vdev)) {
+    if (vhost_needs_vring_endian(vdev)) {
         r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                     virtio_is_big_endian(vdev),
                                                     vhost_vq_index);
@@ -789,7 +874,7 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
     }
 
     file.fd = event_notifier_get_fd(virtio_queue_get_host_notifier(vvq));
-    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_KICK, &file);
+    r = dev->vhost_ops->vhost_set_vring_kick(dev, &file);
     if (r) {
         r = -errno;
         goto fail_kick;
@@ -798,6 +883,14 @@ static int vhost_virtqueue_start(struct vhost_dev *dev,
     /* Clear and discard previous events if any. */
     event_notifier_test_and_clear(&vq->masked_notifier);
 
+    /* Init vring in unmasked state, unless guest_notifier_mask
+     * will do it later.
+     */
+    if (!vdev->use_guest_notifier_mask) {
+        /* TODO: check and handle errors. */
+        vhost_virtqueue_mask(dev, vdev, idx, false);
+    }
+
     return 0;
 
 fail_kick:
@@ -822,13 +915,13 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev,
                                     struct vhost_virtqueue *vq,
                                     unsigned idx)
 {
-    int vhost_vq_index = idx - dev->vq_index;
+    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, idx);
     struct vhost_vring_state state = {
         .index = vhost_vq_index,
     };
     int r;
-    assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
-    r = dev->vhost_ops->vhost_call(dev, VHOST_GET_VRING_BASE, &state);
+
+    r = dev->vhost_ops->vhost_get_vring_base(dev, &state);
     if (r < 0) {
         fprintf(stderr, "vhost VQ %d ring restore failed: %d\n", idx, r);
         fflush(stderr);
@@ -839,8 +932,7 @@ static void vhost_virtqueue_stop(struct vhost_dev *dev,
     /* In the cross-endian case, we need to reset the vring endianness to
      * native as legacy devices expect so by default.
      */
-    if (!virtio_has_feature(vdev, VIRTIO_F_VERSION_1) &&
-        virtio_legacy_is_cross_endian(vdev)) {
+    if (vhost_needs_vring_endian(vdev)) {
         r = vhost_virtqueue_set_vring_endian_legacy(dev,
                                                     !virtio_is_big_endian(vdev),
                                                     vhost_vq_index);
@@ -875,8 +967,9 @@ static void vhost_eventfd_del(MemoryListener *listener,
 static int vhost_virtqueue_init(struct vhost_dev *dev,
                                 struct vhost_virtqueue *vq, int n)
 {
+    int vhost_vq_index = dev->vhost_ops->vhost_get_vq_index(dev, n);
     struct vhost_vring_file file = {
-        .index = n,
+        .index = vhost_vq_index,
     };
     int r = event_notifier_init(&vq->masked_notifier, 0);
     if (r < 0) {
@@ -884,7 +977,7 @@ static int vhost_virtqueue_init(struct vhost_dev *dev,
     }
 
     file.fd = event_notifier_get_fd(&vq->masked_notifier);
-    r = dev->vhost_ops->vhost_call(dev, VHOST_SET_VRING_CALL, &file);
+    r = dev->vhost_ops->vhost_set_vring_call(dev, &file);
     if (r) {
         r = -errno;
         goto fail_call;
@@ -906,6 +999,8 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
     uint64_t features;
     int i, r;
 
+    hdev->migration_blocker = NULL;
+
     if (vhost_set_backend_type(hdev, backend_type) < 0) {
         close((uintptr_t)opaque);
         return -1;
@@ -916,18 +1011,26 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
         return -errno;
     }
 
-    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_OWNER, NULL);
+    if (used_memslots > hdev->vhost_ops->vhost_backend_memslots_limit(hdev)) {
+        fprintf(stderr, "vhost backend memory slots limit is less"
+                " than current number of present memory slots\n");
+        close((uintptr_t)opaque);
+        return -1;
+    }
+    QLIST_INSERT_HEAD(&vhost_devices, hdev, entry);
+
+    r = hdev->vhost_ops->vhost_set_owner(hdev);
     if (r < 0) {
         goto fail;
     }
 
-    r = hdev->vhost_ops->vhost_call(hdev, VHOST_GET_FEATURES, &features);
+    r = hdev->vhost_ops->vhost_get_features(hdev, &features);
     if (r < 0) {
         goto fail;
     }
 
     for (i = 0; i < hdev->nvqs; ++i) {
-        r = vhost_virtqueue_init(hdev, hdev->vqs + i, i);
+        r = vhost_virtqueue_init(hdev, hdev->vqs + i, hdev->vq_index + i);
         if (r < 0) {
             goto fail_vq;
         }
@@ -949,12 +1052,21 @@ int vhost_dev_init(struct vhost_dev *hdev, void *opaque,
         .eventfd_del = vhost_eventfd_del,
         .priority = 10
     };
-    hdev->migration_blocker = NULL;
-    if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
-        error_setg(&hdev->migration_blocker,
-                   "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
+
+    if (hdev->migration_blocker == NULL) {
+        if (!(hdev->features & (0x1ULL << VHOST_F_LOG_ALL))) {
+            error_setg(&hdev->migration_blocker,
+                       "Migration disabled: vhost lacks VHOST_F_LOG_ALL feature.");
+        } else if (!qemu_memfd_check()) {
+            error_setg(&hdev->migration_blocker,
+                       "Migration disabled: failed to allocate shared memory");
+        }
+    }
+
+    if (hdev->migration_blocker != NULL) {
         migrate_add_blocker(hdev->migration_blocker);
     }
+
     hdev->mem = g_malloc0(offsetof(struct vhost_memory, regions));
     hdev->n_mem_sections = 0;
     hdev->mem_sections = NULL;
@@ -972,6 +1084,7 @@ fail_vq:
 fail:
     r = -errno;
     hdev->vhost_ops->vhost_backend_cleanup(hdev);
+    QLIST_REMOVE(hdev, entry);
     return r;
 }
 
@@ -989,6 +1102,7 @@ void vhost_dev_cleanup(struct vhost_dev *hdev)
     g_free(hdev->mem);
     g_free(hdev->mem_sections);
     hdev->vhost_ops->vhost_backend_cleanup(hdev);
+    QLIST_REMOVE(hdev, entry);
 }
 
 /* Stop processing guest IO notifications in qemu.
@@ -1066,18 +1180,17 @@ void vhost_virtqueue_mask(struct vhost_dev *hdev, VirtIODevice *vdev, int n,
 {
     struct VirtQueue *vvq = virtio_get_queue(vdev, n);
     int r, index = n - hdev->vq_index;
+    struct vhost_vring_file file;
 
-    assert(n >= hdev->vq_index && n < hdev->vq_index + hdev->nvqs);
-
-    struct vhost_vring_file file = {
-        .index = index
-    };
     if (mask) {
+        assert(vdev->use_guest_notifier_mask);
         file.fd = event_notifier_get_fd(&hdev->vqs[index].masked_notifier);
     } else {
         file.fd = event_notifier_get_fd(virtio_queue_get_guest_notifier(vvq));
     }
-    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_VRING_CALL, &file);
+
+    file.index = hdev->vhost_ops->vhost_get_vq_index(hdev, n);
+    r = hdev->vhost_ops->vhost_set_vring_call(hdev, &file);
     assert(r >= 0);
 }
 
@@ -1119,7 +1232,7 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
     if (r < 0) {
         goto fail_features;
     }
-    r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_MEM_TABLE, hdev->mem);
+    r = hdev->vhost_ops->vhost_set_mem_table(hdev, hdev->mem);
     if (r < 0) {
         r = -errno;
         goto fail_mem;
@@ -1138,10 +1251,12 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
         uint64_t log_base;
 
         hdev->log_size = vhost_get_log_size(hdev);
-        hdev->log = vhost_log_get(hdev->log_size);
+        hdev->log = vhost_log_get(hdev->log_size,
+                                  vhost_dev_log_is_shared(hdev));
         log_base = (uintptr_t)hdev->log->log;
-        r = hdev->vhost_ops->vhost_call(hdev, VHOST_SET_LOG_BASE,
-                                        hdev->log_size ? &log_base : NULL);
+        r = hdev->vhost_ops->vhost_set_log_base(hdev,
+                                                hdev->log_size ? log_base : 0,
+                                                hdev->log);
         if (r < 0) {
             r = -errno;
             goto fail_log;