These changes are the raw update to qemu-2.6.
[kvmfornfv.git] / qemu / hw / misc / ivshmem.c
1 /*
2  * Inter-VM Shared Memory PCI device.
3  *
4  * Author:
5  *      Cam Macdonell <cam@cs.ualberta.ca>
6  *
7  * Based On: cirrus_vga.c
8  *          Copyright (c) 2004 Fabrice Bellard
9  *          Copyright (c) 2004 Makoto Suzuki (suzu)
10  *
11  *      and rtl8139.c
12  *          Copyright (c) 2006 Igor Kovalenko
13  *
14  * This code is licensed under the GNU GPL v2.
15  *
16  * Contributions after 2012-01-13 are licensed under the terms of the
17  * GNU GPL, version 2 or (at your option) any later version.
18  */
19 #include "qemu/osdep.h"
20 #include "qapi/error.h"
21 #include "qemu/cutils.h"
22 #include "hw/hw.h"
23 #include "hw/i386/pc.h"
24 #include "hw/pci/pci.h"
25 #include "hw/pci/msi.h"
26 #include "hw/pci/msix.h"
27 #include "sysemu/kvm.h"
28 #include "migration/migration.h"
29 #include "qemu/error-report.h"
30 #include "qemu/event_notifier.h"
31 #include "qom/object_interfaces.h"
32 #include "sysemu/char.h"
33 #include "sysemu/hostmem.h"
34 #include "sysemu/qtest.h"
35 #include "qapi/visitor.h"
36 #include "exec/ram_addr.h"
37
38 #include "hw/misc/ivshmem.h"
39
40 #include <sys/mman.h>
41
/* PCI IDs: ivshmem uses Red Hat's vendor ID with device ID 0x1110 */
#define PCI_VENDOR_ID_IVSHMEM   PCI_VENDOR_ID_REDHAT_QUMRANET
#define PCI_DEVICE_ID_IVSHMEM   0x1110

/* Peer IDs are 16-bit (they travel in the doorbell register's high half) */
#define IVSHMEM_MAX_PEERS UINT16_MAX
/* Feature bit numbers stored in IVShmemState::features */
#define IVSHMEM_IOEVENTFD   0
#define IVSHMEM_MSI     1

/* Size of BAR 0, the register window */
#define IVSHMEM_REG_BAR_SIZE 0x100

/* Compile-time debug switch; the printf is compiled out when 0 */
#define IVSHMEM_DEBUG 0
#define IVSHMEM_DPRINTF(fmt, ...)                       \
    do {                                                \
        if (IVSHMEM_DEBUG) {                            \
            printf("IVSHMEM: " fmt, ## __VA_ARGS__);    \
        }                                               \
    } while (0)

/* Abstract base type shared by all ivshmem device variants */
#define TYPE_IVSHMEM_COMMON "ivshmem-common"
#define IVSHMEM_COMMON(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_COMMON)

/* Variant backed by a host memory backend, without interrupts */
#define TYPE_IVSHMEM_PLAIN "ivshmem-plain"
#define IVSHMEM_PLAIN(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_PLAIN)

/* Variant connected to an ivshmem server, with doorbell interrupts */
#define TYPE_IVSHMEM_DOORBELL "ivshmem-doorbell"
#define IVSHMEM_DOORBELL(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM_DOORBELL)

/* Legacy device type kept for backward compatibility */
#define TYPE_IVSHMEM "ivshmem"
#define IVSHMEM(obj) \
    OBJECT_CHECK(IVShmemState, (obj), TYPE_IVSHMEM)
74
/* Per-peer state: one eventfd per interrupt vector */
typedef struct Peer {
    int nb_eventfds;            /* vectors connected so far */
    EventNotifier *eventfds;    /* s->vectors entries, see resize_peers() */
} Peer;

/* Per-vector dispatch state for MSI/eventfd delivery */
typedef struct MSIVector {
    PCIDevice *pdev;            /* set while the vector is hooked up */
    int virq;                   /* KVM irq route, valid when irqfd in use */
} MSIVector;

typedef struct IVShmemState {
    /*< private >*/
    PCIDevice parent_obj;
    /*< public >*/

    uint32_t features;          /* IVSHMEM_IOEVENTFD / IVSHMEM_MSI bits */

    /* exactly one of these two may be set */
    HostMemoryBackend *hostmem; /* with interrupts */
    CharDriverState *server_chr; /* without interrupts */

    /* registers */
    uint32_t intrmask;
    uint32_t intrstatus;
    int vm_id;                  /* our own peer ID, assigned by the server */

    /* BARs */
    MemoryRegion ivshmem_mmio;  /* BAR 0 (registers) */
    MemoryRegion *ivshmem_bar2; /* BAR 2 (shared memory) */
    MemoryRegion server_bar2;   /* used with server_chr */

    /* interrupt support */
    Peer *peers;
    int nb_peers;               /* space in @peers[] */
    uint32_t vectors;
    MSIVector *msi_vectors;
    uint64_t msg_buf;           /* buffer for receiving server messages */
    int msg_buffered_bytes;     /* #bytes in @msg_buf */

    /* migration stuff */
    OnOffAuto master;
    Error *migration_blocker;

    /* legacy cruft */
    char *role;
    char *shmobj;
    char *sizearg;
    size_t legacy_size;
    uint32_t not_legacy_32bit;
} IVShmemState;

/* registers for the Inter-VM shared memory device */
enum ivshmem_registers {
    INTRMASK = 0,               /* interrupt mask */
    INTRSTATUS = 4,             /* interrupt status, cleared on read */
    IVPOSITION = 8,             /* our peer ID, read-only */
    DOORBELL = 12,              /* write (peer_id << 16) | vector to notify */
};
133
134 static inline uint32_t ivshmem_has_feature(IVShmemState *ivs,
135                                                     unsigned int feature) {
136     return (ivs->features & (1 << feature));
137 }
138
139 static inline bool ivshmem_is_master(IVShmemState *s)
140 {
141     assert(s->master != ON_OFF_AUTO_AUTO);
142     return s->master == ON_OFF_AUTO_ON;
143 }
144
/* Re-evaluate the INTx line from intrstatus & intrmask.  No-op for
 * the MSI-X variants, per the table below. */
static void ivshmem_update_irq(IVShmemState *s)
{
    PCIDevice *d = PCI_DEVICE(s);
    uint32_t isr = s->intrstatus & s->intrmask;

    /*
     * Do nothing unless the device actually uses INTx.  Here's how
     * the device variants signal interrupts, what they put in PCI
     * config space:
     * Device variant    Interrupt  Interrupt Pin  MSI-X cap.
     * ivshmem-plain         none            0         no
     * ivshmem-doorbell     MSI-X            1        yes(1)
     * ivshmem,msi=off       INTx            1         no
     * ivshmem,msi=on       MSI-X            1(2)     yes(1)
     * (1) if guest enabled MSI-X
     * (2) the device lies
     * Leads to the condition for doing nothing:
     */
    if (ivshmem_has_feature(s, IVSHMEM_MSI)
        || !d->config[PCI_INTERRUPT_PIN]) {
        return;
    }

    /* don't print ISR resets */
    if (isr) {
        IVSHMEM_DPRINTF("Set IRQ to %d (%04x %04x)\n",
                        isr ? 1 : 0, s->intrstatus, s->intrmask);
    }

    pci_set_irq(d, isr != 0);
}
176
177 static void ivshmem_IntrMask_write(IVShmemState *s, uint32_t val)
178 {
179     IVSHMEM_DPRINTF("IntrMask write(w) val = 0x%04x\n", val);
180
181     s->intrmask = val;
182     ivshmem_update_irq(s);
183 }
184
185 static uint32_t ivshmem_IntrMask_read(IVShmemState *s)
186 {
187     uint32_t ret = s->intrmask;
188
189     IVSHMEM_DPRINTF("intrmask read(w) val = 0x%04x\n", ret);
190     return ret;
191 }
192
193 static void ivshmem_IntrStatus_write(IVShmemState *s, uint32_t val)
194 {
195     IVSHMEM_DPRINTF("IntrStatus write(w) val = 0x%04x\n", val);
196
197     s->intrstatus = val;
198     ivshmem_update_irq(s);
199 }
200
201 static uint32_t ivshmem_IntrStatus_read(IVShmemState *s)
202 {
203     uint32_t ret = s->intrstatus;
204
205     /* reading ISR clears all interrupts */
206     s->intrstatus = 0;
207     ivshmem_update_irq(s);
208     return ret;
209 }
210
211 static void ivshmem_io_write(void *opaque, hwaddr addr,
212                              uint64_t val, unsigned size)
213 {
214     IVShmemState *s = opaque;
215
216     uint16_t dest = val >> 16;
217     uint16_t vector = val & 0xff;
218
219     addr &= 0xfc;
220
221     IVSHMEM_DPRINTF("writing to addr " TARGET_FMT_plx "\n", addr);
222     switch (addr)
223     {
224         case INTRMASK:
225             ivshmem_IntrMask_write(s, val);
226             break;
227
228         case INTRSTATUS:
229             ivshmem_IntrStatus_write(s, val);
230             break;
231
232         case DOORBELL:
233             /* check that dest VM ID is reasonable */
234             if (dest >= s->nb_peers) {
235                 IVSHMEM_DPRINTF("Invalid destination VM ID (%d)\n", dest);
236                 break;
237             }
238
239             /* check doorbell range */
240             if (vector < s->peers[dest].nb_eventfds) {
241                 IVSHMEM_DPRINTF("Notifying VM %d on vector %d\n", dest, vector);
242                 event_notifier_set(&s->peers[dest].eventfds[vector]);
243             } else {
244                 IVSHMEM_DPRINTF("Invalid destination vector %d on VM %d\n",
245                                 vector, dest);
246             }
247             break;
248         default:
249             IVSHMEM_DPRINTF("Unhandled write " TARGET_FMT_plx "\n", addr);
250     }
251 }
252
253 static uint64_t ivshmem_io_read(void *opaque, hwaddr addr,
254                                 unsigned size)
255 {
256
257     IVShmemState *s = opaque;
258     uint32_t ret;
259
260     switch (addr)
261     {
262         case INTRMASK:
263             ret = ivshmem_IntrMask_read(s);
264             break;
265
266         case INTRSTATUS:
267             ret = ivshmem_IntrStatus_read(s);
268             break;
269
270         case IVPOSITION:
271             ret = s->vm_id;
272             break;
273
274         default:
275             IVSHMEM_DPRINTF("why are we reading " TARGET_FMT_plx "\n", addr);
276             ret = 0;
277     }
278
279     return ret;
280 }
281
/* BAR 0 register window: accesses performed 4 bytes at a time */
static const MemoryRegionOps ivshmem_mmio_ops = {
    .read = ivshmem_io_read,
    .write = ivshmem_io_write,
    .endianness = DEVICE_NATIVE_ENDIAN,
    .impl = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
};
291
/*
 * Main-loop handler for one of our own peer eventfds: forward the
 * event to the guest, either as an MSI-X vector or via INTRSTATUS.
 */
static void ivshmem_vector_notify(void *opaque)
{
    MSIVector *entry = opaque;
    PCIDevice *pdev = entry->pdev;
    IVShmemState *s = IVSHMEM_COMMON(pdev);
    int vector = entry - s->msi_vectors;    /* index within the table */
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];

    /* nothing actually pending on the eventfd: spurious wakeup */
    if (!event_notifier_test_and_clear(n)) {
        return;
    }

    IVSHMEM_DPRINTF("interrupt on vector %p %d\n", pdev, vector);
    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        /* dropped silently if the guest hasn't enabled MSI-X yet */
        if (msix_enabled(pdev)) {
            msix_notify(pdev, vector);
        }
    } else {
        /* INTx variant: set status bit 0, which raises the line */
        ivshmem_IntrStatus_write(s, 1);
    }
}
313
/*
 * MSI-X unmask callback: refresh the KVM route for @vector with the
 * current message, then re-attach the peer eventfd as an irqfd.
 * Returns 0 on success, negative on failure.
 */
static int ivshmem_vector_unmask(PCIDevice *dev, unsigned vector,
                                 MSIMessage msg)
{
    IVShmemState *s = IVSHMEM_COMMON(dev);
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
    MSIVector *v = &s->msi_vectors[vector];
    int ret;

    IVSHMEM_DPRINTF("vector unmask %p %d\n", dev, vector);

    ret = kvm_irqchip_update_msi_route(kvm_state, v->virq, msg, dev);
    if (ret < 0) {
        return ret;
    }

    return kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL, v->virq);
}
331
/*
 * MSI-X mask callback: detach the peer eventfd from the KVM irqfd so
 * events queue in the eventfd (drained by ivshmem_vector_poll).
 */
static void ivshmem_vector_mask(PCIDevice *dev, unsigned vector)
{
    IVShmemState *s = IVSHMEM_COMMON(dev);
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
    int ret;

    IVSHMEM_DPRINTF("vector mask %p %d\n", dev, vector);

    ret = kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, n,
                                                s->msi_vectors[vector].virq);
    if (ret != 0) {
        /* nothing else we can do; leave a trace for debugging */
        error_report("remove_irqfd_notifier_gsi failed");
    }
}
346
347 static void ivshmem_vector_poll(PCIDevice *dev,
348                                 unsigned int vector_start,
349                                 unsigned int vector_end)
350 {
351     IVShmemState *s = IVSHMEM_COMMON(dev);
352     unsigned int vector;
353
354     IVSHMEM_DPRINTF("vector poll %p %d-%d\n", dev, vector_start, vector_end);
355
356     vector_end = MIN(vector_end, s->vectors);
357
358     for (vector = vector_start; vector < vector_end; vector++) {
359         EventNotifier *notifier = &s->peers[s->vm_id].eventfds[vector];
360
361         if (!msix_is_masked(dev, vector)) {
362             continue;
363         }
364
365         if (event_notifier_test_and_clear(notifier)) {
366             msix_set_pending(dev, vector);
367         }
368     }
369 }
370
/*
 * Poll @n's eventfd from the main loop and dispatch it to
 * ivshmem_vector_notify(); used when irqfd is not available.
 */
static void watch_vector_notifier(IVShmemState *s, EventNotifier *n,
                                 int vector)
{
    int eventfd = event_notifier_get_fd(n);

    /* a vector may be hooked up only once */
    assert(!s->msi_vectors[vector].pdev);
    s->msi_vectors[vector].pdev = PCI_DEVICE(s);

    qemu_set_fd_handler(eventfd, ivshmem_vector_notify,
                        NULL, &s->msi_vectors[vector]);
}
382
/*
 * Bind peer @posn's vector-@i eventfd to a DOORBELL write of
 * (posn << 16) | i, so the write bypasses ivshmem_io_write entirely.
 */
static void ivshmem_add_eventfd(IVShmemState *s, int posn, int i)
{
    memory_region_add_eventfd(&s->ivshmem_mmio,
                              DOORBELL,
                              4,
                              true,
                              (posn << 16) | i,
                              &s->peers[posn].eventfds[i]);
}
392
/* Undo ivshmem_add_eventfd() for peer @posn, vector @i. */
static void ivshmem_del_eventfd(IVShmemState *s, int posn, int i)
{
    memory_region_del_eventfd(&s->ivshmem_mmio,
                              DOORBELL,
                              4,
                              true,
                              (posn << 16) | i,
                              &s->peers[posn].eventfds[i]);
}
402
403 static void close_peer_eventfds(IVShmemState *s, int posn)
404 {
405     int i, n;
406
407     assert(posn >= 0 && posn < s->nb_peers);
408     n = s->peers[posn].nb_eventfds;
409
410     if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
411         memory_region_transaction_begin();
412         for (i = 0; i < n; i++) {
413             ivshmem_del_eventfd(s, posn, i);
414         }
415         memory_region_transaction_commit();
416     }
417
418     for (i = 0; i < n; i++) {
419         event_notifier_cleanup(&s->peers[posn].eventfds[i]);
420     }
421
422     g_free(s->peers[posn].eventfds);
423     s->peers[posn].nb_eventfds = 0;
424 }
425
426 static void resize_peers(IVShmemState *s, int nb_peers)
427 {
428     int old_nb_peers = s->nb_peers;
429     int i;
430
431     assert(nb_peers > old_nb_peers);
432     IVSHMEM_DPRINTF("bumping storage to %d peers\n", nb_peers);
433
434     s->peers = g_realloc(s->peers, nb_peers * sizeof(Peer));
435     s->nb_peers = nb_peers;
436
437     for (i = old_nb_peers; i < nb_peers; i++) {
438         s->peers[i].eventfds = g_new0(EventNotifier, s->vectors);
439         s->peers[i].nb_eventfds = 0;
440     }
441 }
442
/*
 * Allocate a KVM MSI route for @vector and record it in
 * s->msi_vectors[].  Sets @errp on failure.
 */
static void ivshmem_add_kvm_msi_virq(IVShmemState *s, int vector,
                                     Error **errp)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    MSIMessage msg = msix_get_message(pdev, vector);
    int ret;

    IVSHMEM_DPRINTF("ivshmem_add_kvm_msi_virq vector:%d\n", vector);
    /* must not already have a route */
    assert(!s->msi_vectors[vector].pdev);

    ret = kvm_irqchip_add_msi_route(kvm_state, msg, pdev);
    if (ret < 0) {
        error_setg(errp, "kvm_irqchip_add_msi_route failed");
        return;
    }

    s->msi_vectors[vector].virq = ret;
    s->msi_vectors[vector].pdev = pdev;
}
462
/*
 * Hook up the eventfd for our own @vector: either poll it from the
 * main loop, or hand it to KVM as an irqfd when MSI-X is enabled and
 * irqfds are available.  Sets @errp on failure.
 */
static void setup_interrupt(IVShmemState *s, int vector, Error **errp)
{
    EventNotifier *n = &s->peers[s->vm_id].eventfds[vector];
    bool with_irqfd = kvm_msi_via_irqfd_enabled() &&
        ivshmem_has_feature(s, IVSHMEM_MSI);
    PCIDevice *pdev = PCI_DEVICE(s);
    Error *err = NULL;

    IVSHMEM_DPRINTF("setting up interrupt for vector: %d\n", vector);

    if (!with_irqfd) {
        IVSHMEM_DPRINTF("with eventfd\n");
        watch_vector_notifier(s, n, vector);
    } else if (msix_enabled(pdev)) {
        IVSHMEM_DPRINTF("with irqfd\n");
        ivshmem_add_kvm_msi_virq(s, vector, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }

        /* masked vectors get attached later, in ivshmem_vector_unmask */
        if (!msix_is_masked(pdev, vector)) {
            kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, n, NULL,
                                               s->msi_vectors[vector].virq);
            /* TODO handle error */
        }
    } else {
        /* it will be delayed until msix is enabled, in write_config */
        IVSHMEM_DPRINTF("with irqfd, delayed until msix enabled\n");
    }
}
494
495 static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
496 {
497     struct stat buf;
498     size_t size;
499     void *ptr;
500
501     if (s->ivshmem_bar2) {
502         error_setg(errp, "server sent unexpected shared memory message");
503         close(fd);
504         return;
505     }
506
507     if (fstat(fd, &buf) < 0) {
508         error_setg_errno(errp, errno,
509             "can't determine size of shared memory sent by server");
510         close(fd);
511         return;
512     }
513
514     size = buf.st_size;
515
516     /* Legacy cruft */
517     if (s->legacy_size != SIZE_MAX) {
518         if (size < s->legacy_size) {
519             error_setg(errp, "server sent only %zd bytes of shared memory",
520                        (size_t)buf.st_size);
521             close(fd);
522             return;
523         }
524         size = s->legacy_size;
525     }
526
527     /* mmap the region and map into the BAR2 */
528     ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
529     if (ptr == MAP_FAILED) {
530         error_setg_errno(errp, errno, "Failed to mmap shared memory");
531         close(fd);
532         return;
533     }
534     memory_region_init_ram_ptr(&s->server_bar2, OBJECT(s),
535                                "ivshmem.bar2", size, ptr);
536     qemu_set_ram_fd(memory_region_get_ram_addr(&s->server_bar2), fd);
537     s->ivshmem_bar2 = &s->server_bar2;
538 }
539
540 static void process_msg_disconnect(IVShmemState *s, uint16_t posn,
541                                    Error **errp)
542 {
543     IVSHMEM_DPRINTF("posn %d has gone away\n", posn);
544     if (posn >= s->nb_peers || posn == s->vm_id) {
545         error_setg(errp, "invalid peer %d", posn);
546         return;
547     }
548     close_peer_eventfds(s, posn);
549 }
550
/*
 * Handle a connect message: @fd is an eventfd for peer @posn.  Sets
 * @errp and closes @fd if the peer already has all its vectors.
 */
static void process_msg_connect(IVShmemState *s, uint16_t posn, int fd,
                                Error **errp)
{
    Peer *peer = &s->peers[posn];
    int vector;

    /*
     * The N-th connect message for this peer comes with the file
     * descriptor for vector N-1.  Count messages to find the vector.
     */
    if (peer->nb_eventfds >= s->vectors) {
        error_setg(errp, "Too many eventfd received, device has %d vectors",
                   s->vectors);
        close(fd);
        return;
    }
    vector = peer->nb_eventfds++;

    IVSHMEM_DPRINTF("eventfds[%d][%d] = %d\n", posn, vector, fd);
    event_notifier_init_fd(&peer->eventfds[vector], fd);
    fcntl_setfl(fd, O_NONBLOCK); /* msix/irqfd poll non block */

    /* an eventfd for ourselves: hook up interrupt delivery */
    if (posn == s->vm_id) {
        setup_interrupt(s, vector, errp);
        /* TODO do we need to handle the error? */
    }

    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD)) {
        ivshmem_add_eventfd(s, posn, vector);
    }
}
582
583 static void process_msg(IVShmemState *s, int64_t msg, int fd, Error **errp)
584 {
585     IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n", msg, fd);
586
587     if (msg < -1 || msg > IVSHMEM_MAX_PEERS) {
588         error_setg(errp, "server sent invalid message %" PRId64, msg);
589         close(fd);
590         return;
591     }
592
593     if (msg == -1) {
594         process_msg_shmem(s, fd, errp);
595         return;
596     }
597
598     if (msg >= s->nb_peers) {
599         resize_peers(s, msg + 1);
600     }
601
602     if (fd >= 0) {
603         process_msg_connect(s, msg, fd, errp);
604     } else {
605         process_msg_disconnect(s, msg, errp);
606     }
607 }
608
/*
 * Chardev callback: how many more bytes of the current 8-byte server
 * message we can accept.
 */
static int ivshmem_can_receive(void *opaque)
{
    IVShmemState *s = opaque;

    /* a complete message is consumed immediately in ivshmem_read() */
    assert(s->msg_buffered_bytes < sizeof(s->msg_buf));
    return sizeof(s->msg_buf) - s->msg_buffered_bytes;
}
616
/*
 * Chardev callback: accumulate @size bytes; once a full 8-byte
 * little-endian message is buffered, process it together with any
 * file descriptor passed over the socket.
 */
static void ivshmem_read(void *opaque, const uint8_t *buf, int size)
{
    IVShmemState *s = opaque;
    Error *err = NULL;
    int fd;
    int64_t msg;

    assert(size >= 0 && s->msg_buffered_bytes + size <= sizeof(s->msg_buf));
    memcpy((unsigned char *)&s->msg_buf + s->msg_buffered_bytes, buf, size);
    s->msg_buffered_bytes += size;
    if (s->msg_buffered_bytes < sizeof(s->msg_buf)) {
        return;                 /* message still incomplete */
    }
    msg = le64_to_cpu(s->msg_buf);
    s->msg_buffered_bytes = 0;

    fd = qemu_chr_fe_get_msgfd(s->server_chr);
    IVSHMEM_DPRINTF("posn is %" PRId64 ", fd is %d\n", msg, fd);

    process_msg(s, msg, fd, &err);
    if (err) {
        /* protocol errors are reported but not fatal at runtime */
        error_report_err(err);
    }
}
641
/*
 * Synchronously read one 8-byte message from the server.  On I/O
 * error, set @errp and return INT64_MIN.  On success, store any file
 * descriptor received along with it in *@pfd (-1 if none) and return
 * the message.
 */
static int64_t ivshmem_recv_msg(IVShmemState *s, int *pfd, Error **errp)
{
    int64_t msg;
    int n, ret;

    n = 0;
    do {
        ret = qemu_chr_fe_read_all(s->server_chr, (uint8_t *)&msg + n,
                                 sizeof(msg) - n);
        if (ret < 0 && ret != -EINTR) {
            error_setg_errno(errp, -ret, "read from server failed");
            return INT64_MIN;
        }
        /* NOTE(review): ret == 0 (EOF mid-message) would spin here
         * forever — presumably the server never closes mid-message;
         * confirm against the chardev backend's contract */
        n += ret;
    } while (n < sizeof(msg));

    *pfd = qemu_chr_fe_get_msgfd(s->server_chr);
    return msg;
}
661
/*
 * Receive and process the server's initial messages synchronously:
 * protocol version, then our own peer ID, then everything up to and
 * including the shared memory fd.  Sets @errp on any protocol
 * violation or I/O error; on success s->ivshmem_bar2 is mapped.
 */
static void ivshmem_recv_setup(IVShmemState *s, Error **errp)
{
    Error *err = NULL;
    int64_t msg;
    int fd;

    /* first message: the protocol version, with no fd attached */
    msg = ivshmem_recv_msg(s, &fd, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    if (msg != IVSHMEM_PROTOCOL_VERSION) {
        error_setg(errp, "server sent version %" PRId64 ", expecting %d",
                   msg, IVSHMEM_PROTOCOL_VERSION);
        return;
    }
    if (fd != -1) {
        error_setg(errp, "server sent invalid version message");
        return;
    }

    /*
     * ivshmem-server sends the remaining initial messages in a fixed
     * order, but the device has always accepted them in any order.
     * Stay as compatible as practical, just in case people use
     * servers that behave differently.
     */

    /*
     * ivshmem_device_spec.txt has always required the ID message
     * right here, and ivshmem-server has always complied.  However,
     * older versions of the device accepted it out of order, but
     * broke when an interrupt setup message arrived before it.
     */
    msg = ivshmem_recv_msg(s, &fd, &err);
    if (err) {
        error_propagate(errp, err);
        return;
    }
    if (fd != -1 || msg < 0 || msg > IVSHMEM_MAX_PEERS) {
        error_setg(errp, "server sent invalid ID message");
        return;
    }
    s->vm_id = msg;

    /*
     * Receive more messages until we got shared memory.
     */
    do {
        msg = ivshmem_recv_msg(s, &fd, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }
        process_msg(s, msg, fd, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }
    } while (msg != -1);

    /*
     * This function must either map the shared memory or fail.  The
     * loop above ensures that: it terminates normally only after it
     * successfully processed the server's shared memory message.
     * Assert that actually mapped the shared memory:
     */
    assert(s->ivshmem_bar2);
}
731
732 /* Select the MSI-X vectors used by device.
733  * ivshmem maps events to vectors statically, so
734  * we just enable all vectors on init and after reset. */
735 static void ivshmem_msix_vector_use(IVShmemState *s)
736 {
737     PCIDevice *d = PCI_DEVICE(s);
738     int i;
739
740     for (i = 0; i < s->vectors; i++) {
741         msix_vector_use(d, i);
742     }
743 }
744
745 static void ivshmem_reset(DeviceState *d)
746 {
747     IVShmemState *s = IVSHMEM_COMMON(d);
748
749     s->intrstatus = 0;
750     s->intrmask = 0;
751     if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
752         ivshmem_msix_vector_use(s);
753     }
754 }
755
756 static int ivshmem_setup_interrupts(IVShmemState *s)
757 {
758     /* allocate QEMU callback data for receiving interrupts */
759     s->msi_vectors = g_malloc0(s->vectors * sizeof(MSIVector));
760
761     if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
762         if (msix_init_exclusive_bar(PCI_DEVICE(s), s->vectors, 1)) {
763             return -1;
764         }
765
766         IVSHMEM_DPRINTF("msix initialized (%d vectors)\n", s->vectors);
767         ivshmem_msix_vector_use(s);
768     }
769
770     return 0;
771 }
772
/*
 * Guest enabled MSI-X: give every connected vector a KVM MSI route
 * and install mask/unmask/poll notifiers, so peer eventfds are
 * delivered as irqfds.
 */
static void ivshmem_enable_irqfd(IVShmemState *s)
{
    PCIDevice *pdev = PCI_DEVICE(s);
    int i;

    for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) {
        Error *err = NULL;

        ivshmem_add_kvm_msi_virq(s, i, &err);
        if (err) {
            error_report_err(err);
            /* TODO do we need to handle the error? */
        }
    }

    if (msix_set_vector_notifiers(pdev,
                                  ivshmem_vector_unmask,
                                  ivshmem_vector_mask,
                                  ivshmem_vector_poll)) {
        error_report("ivshmem: msix_set_vector_notifiers failed");
    }
}
795
/*
 * Release the KVM route of @vector, if one was set up.  The irqfd
 * itself was already detached when the frontend masked the vector.
 */
static void ivshmem_remove_kvm_msi_virq(IVShmemState *s, int vector)
{
    IVSHMEM_DPRINTF("ivshmem_remove_kvm_msi_virq vector:%d\n", vector);

    /* no route was ever allocated for this vector */
    if (s->msi_vectors[vector].pdev == NULL) {
        return;
    }

    /* it was cleaned when masked in the frontend. */
    kvm_irqchip_release_virq(kvm_state, s->msi_vectors[vector].virq);

    s->msi_vectors[vector].pdev = NULL;
}
809
810 static void ivshmem_disable_irqfd(IVShmemState *s)
811 {
812     PCIDevice *pdev = PCI_DEVICE(s);
813     int i;
814
815     for (i = 0; i < s->peers[s->vm_id].nb_eventfds; i++) {
816         ivshmem_remove_kvm_msi_virq(s, i);
817     }
818
819     msix_unset_vector_notifiers(pdev);
820 }
821
/*
 * PCI config-space write: track MSI-X enable/disable transitions so
 * irqfds are attached and detached at the right moment.
 */
static void ivshmem_write_config(PCIDevice *pdev, uint32_t address,
                                 uint32_t val, int len)
{
    IVShmemState *s = IVSHMEM_COMMON(pdev);
    int is_enabled, was_enabled = msix_enabled(pdev);

    pci_default_write_config(pdev, address, val, len);
    is_enabled = msix_enabled(pdev);

    if (kvm_msi_via_irqfd_enabled()) {
        if (!was_enabled && is_enabled) {
            ivshmem_enable_irqfd(s);
        } else if (was_enabled && !is_enabled) {
            ivshmem_disable_irqfd(s);
        }
    }
}
839
/*
 * Realize code common to all ivshmem variants: sanity-check features,
 * set up BAR 0 (registers) and BAR 2 (shared memory, either from a
 * host memory backend or obtained from the ivshmem server), and
 * settle the master/peer migration role.
 */
static void ivshmem_common_realize(PCIDevice *dev, Error **errp)
{
    IVShmemState *s = IVSHMEM_COMMON(dev);
    Error *err = NULL;
    uint8_t *pci_conf;
    uint8_t attr = PCI_BASE_ADDRESS_SPACE_MEMORY |
        PCI_BASE_ADDRESS_MEM_PREFETCH;

    /* IRQFD requires MSI */
    if (ivshmem_has_feature(s, IVSHMEM_IOEVENTFD) &&
        !ivshmem_has_feature(s, IVSHMEM_MSI)) {
        error_setg(errp, "ioeventfd/irqfd requires MSI");
        return;
    }

    pci_conf = dev->config;
    pci_conf[PCI_COMMAND] = PCI_COMMAND_IO | PCI_COMMAND_MEMORY;

    memory_region_init_io(&s->ivshmem_mmio, OBJECT(s), &ivshmem_mmio_ops, s,
                          "ivshmem-mmio", IVSHMEM_REG_BAR_SIZE);

    /* region for registers*/
    pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &s->ivshmem_mmio);

    /* NOTE(review): 64-bit BAR attr is applied when not_legacy_32bit
     * is CLEAR, which reads inverted relative to the field name —
     * confirm intended polarity against the device variants */
    if (!s->not_legacy_32bit) {
        attr |= PCI_BASE_ADDRESS_MEM_TYPE_64;
    }

    if (s->hostmem != NULL) {
        IVSHMEM_DPRINTF("using hostmem\n");

        s->ivshmem_bar2 = host_memory_backend_get_memory(s->hostmem,
                                                         &error_abort);
    } else {
        assert(s->server_chr);

        IVSHMEM_DPRINTF("using shared memory server (socket = %s)\n",
                        s->server_chr->filename);

        /* we allocate enough space for 16 peers and grow as needed */
        resize_peers(s, 16);

        /*
         * Receive setup messages from server synchronously.
         * Older versions did it asynchronously, but that creates a
         * number of entertaining race conditions.
         */
        ivshmem_recv_setup(s, &err);
        if (err) {
            error_propagate(errp, err);
            return;
        }

        if (s->master == ON_OFF_AUTO_ON && s->vm_id != 0) {
            error_setg(errp,
                       "master must connect to the server before any peers");
            return;
        }

        /* from here on, server messages are handled asynchronously */
        qemu_chr_add_handlers(s->server_chr, ivshmem_can_receive,
                              ivshmem_read, NULL, s);

        if (ivshmem_setup_interrupts(s) < 0) {
            error_setg(errp, "failed to initialize interrupts");
            return;
        }
    }

    vmstate_register_ram(s->ivshmem_bar2, DEVICE(s));
    pci_register_bar(PCI_DEVICE(s), 2, attr, s->ivshmem_bar2);

    /* auto role: the first peer (ID 0) becomes the master */
    if (s->master == ON_OFF_AUTO_AUTO) {
        s->master = s->vm_id == 0 ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
    }

    if (!ivshmem_is_master(s)) {
        error_setg(&s->migration_blocker,
                   "Migration is disabled when using feature 'peer mode' in device 'ivshmem'");
        migrate_add_blocker(s->migration_blocker);
    }
}
922
923 static void ivshmem_exit(PCIDevice *dev)
924 {
925     IVShmemState *s = IVSHMEM_COMMON(dev);
926     int i;
927
928     if (s->migration_blocker) {
929         migrate_del_blocker(s->migration_blocker);
930         error_free(s->migration_blocker);
931     }
932
933     if (memory_region_is_mapped(s->ivshmem_bar2)) {
934         if (!s->hostmem) {
935             void *addr = memory_region_get_ram_ptr(s->ivshmem_bar2);
936             int fd;
937
938             if (munmap(addr, memory_region_size(s->ivshmem_bar2) == -1)) {
939                 error_report("Failed to munmap shared memory %s",
940                              strerror(errno));
941             }
942
943             fd = qemu_get_ram_fd(memory_region_get_ram_addr(s->ivshmem_bar2));
944             close(fd);
945         }
946
947         vmstate_unregister_ram(s->ivshmem_bar2, DEVICE(dev));
948     }
949
950     if (s->peers) {
951         for (i = 0; i < s->nb_peers; i++) {
952             close_peer_eventfds(s, i);
953         }
954         g_free(s->peers);
955     }
956
957     if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
958         msix_uninit_exclusive_bar(dev);
959     }
960
961     g_free(s->msi_vectors);
962 }
963
964 static int ivshmem_pre_load(void *opaque)
965 {
966     IVShmemState *s = opaque;
967
968     if (!ivshmem_is_master(s)) {
969         error_report("'peer' devices are not migratable");
970         return -EINVAL;
971     }
972
973     return 0;
974 }
975
/* After migration: re-mark all MSI-X vectors as in use. */
static int ivshmem_post_load(void *opaque, int version_id)
{
    IVShmemState *s = opaque;

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        ivshmem_msix_vector_use(s);
    }
    return 0;
}
985
/* Class init shared by all variants: wire up PCI callbacks and IDs. */
static void ivshmem_common_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);

    k->realize = ivshmem_common_realize;
    k->exit = ivshmem_exit;
    k->config_write = ivshmem_write_config;
    k->vendor_id = PCI_VENDOR_ID_IVSHMEM;
    k->device_id = PCI_DEVICE_ID_IVSHMEM;
    k->class_id = PCI_CLASS_MEMORY_RAM;
    k->revision = 1;
    dc->reset = ivshmem_reset;
    set_bit(DEVICE_CATEGORY_MISC, dc->categories);
    dc->desc = "Inter-VM shared memory";
}
1002
/* Abstract base type; the concrete variants derive from it */
static const TypeInfo ivshmem_common_info = {
    .name          = TYPE_IVSHMEM_COMMON,
    .parent        = TYPE_PCI_DEVICE,
    .instance_size = sizeof(IVShmemState),
    .abstract      = true,
    .class_init    = ivshmem_common_class_init,
};
1010
/*
 * Link-property check for "memdev": reject a backend whose memory
 * region is already mapped elsewhere, otherwise fall back to the
 * standard before-realize check.
 */
static void ivshmem_check_memdev_is_busy(Object *obj, const char *name,
                                         Object *val, Error **errp)
{
    MemoryRegion *mr;

    mr = host_memory_backend_get_memory(MEMORY_BACKEND(val), &error_abort);
    if (memory_region_is_mapped(mr)) {
        char *path = object_get_canonical_path_component(val);
        error_setg(errp, "can't use already busy memdev: %s", path);
        g_free(path);
    } else {
        qdev_prop_allow_set_link_before_realize(obj, name, val, errp);
    }
}
1025
/*
 * Migration state for ivshmem-plain: PCI config space plus the interrupt
 * status/mask registers.  pre_load rejects migration of 'peer' devices.
 */
static const VMStateDescription ivshmem_plain_vmsd = {
    .name = TYPE_IVSHMEM_PLAIN,
    .version_id = 0,
    .minimum_version_id = 0,
    .pre_load = ivshmem_pre_load,
    .post_load = ivshmem_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),
        VMSTATE_UINT32(intrstatus, IVShmemState),
        VMSTATE_UINT32(intrmask, IVShmemState),
        VMSTATE_END_OF_LIST()
    },
};
1039
/* qdev properties of ivshmem-plain; "master" controls migratability. */
static Property ivshmem_plain_properties[] = {
    DEFINE_PROP_ON_OFF_AUTO("master", IVShmemState, master, ON_OFF_AUTO_OFF),
    DEFINE_PROP_END_OF_LIST(),
};
1044
/*
 * Instance init for ivshmem-plain: expose a "memdev" link property that
 * accepts a host memory backend, guarded against backends whose memory
 * region is already mapped (see ivshmem_check_memdev_is_busy).
 */
static void ivshmem_plain_init(Object *obj)
{
    IVShmemState *s = IVSHMEM_PLAIN(obj);

    object_property_add_link(obj, "memdev", TYPE_MEMORY_BACKEND,
                             (Object **)&s->hostmem,
                             ivshmem_check_memdev_is_busy,
                             OBJ_PROP_LINK_UNREF_ON_RELEASE,
                             &error_abort);
}
1055
1056 static void ivshmem_plain_realize(PCIDevice *dev, Error **errp)
1057 {
1058     IVShmemState *s = IVSHMEM_COMMON(dev);
1059
1060     if (!s->hostmem) {
1061         error_setg(errp, "You must specify a 'memdev'");
1062         return;
1063     }
1064
1065     ivshmem_common_realize(dev, errp);
1066 }
1067
1068 static void ivshmem_plain_class_init(ObjectClass *klass, void *data)
1069 {
1070     DeviceClass *dc = DEVICE_CLASS(klass);
1071     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1072
1073     k->realize = ivshmem_plain_realize;
1074     dc->props = ivshmem_plain_properties;
1075     dc->vmsd = &ivshmem_plain_vmsd;
1076 }
1077
/* Concrete type: ivshmem backed by a user-supplied memory backend. */
static const TypeInfo ivshmem_plain_info = {
    .name          = TYPE_IVSHMEM_PLAIN,
    .parent        = TYPE_IVSHMEM_COMMON,
    .instance_size = sizeof(IVShmemState),
    .instance_init = ivshmem_plain_init,
    .class_init    = ivshmem_plain_class_init,
};
1085
/*
 * Migration state for ivshmem-doorbell: like the plain variant but also
 * carries the MSI-X state.  pre_load rejects migration of 'peer' devices.
 */
static const VMStateDescription ivshmem_doorbell_vmsd = {
    .name = TYPE_IVSHMEM_DOORBELL,
    .version_id = 0,
    .minimum_version_id = 0,
    .pre_load = ivshmem_pre_load,
    .post_load = ivshmem_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),
        VMSTATE_MSIX(parent_obj, IVShmemState),
        VMSTATE_UINT32(intrstatus, IVShmemState),
        VMSTATE_UINT32(intrmask, IVShmemState),
        VMSTATE_END_OF_LIST()
    },
};
1100
/*
 * qdev properties of ivshmem-doorbell: the server chardev is mandatory
 * (checked at realize); ioeventfd defaults to on.
 */
static Property ivshmem_doorbell_properties[] = {
    DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
    DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
    DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD,
                    true),
    DEFINE_PROP_ON_OFF_AUTO("master", IVShmemState, master, ON_OFF_AUTO_OFF),
    DEFINE_PROP_END_OF_LIST(),
};
1109
/*
 * Instance init for ivshmem-doorbell: MSI is always on for this variant,
 * and any shared-memory size announced by the server is accepted.
 */
static void ivshmem_doorbell_init(Object *obj)
{
    IVShmemState *s = IVSHMEM_DOORBELL(obj);

    s->features |= (1 << IVSHMEM_MSI);
    s->legacy_size = SIZE_MAX;  /* whatever the server sends */
}
1117
1118 static void ivshmem_doorbell_realize(PCIDevice *dev, Error **errp)
1119 {
1120     IVShmemState *s = IVSHMEM_COMMON(dev);
1121
1122     if (!s->server_chr) {
1123         error_setg(errp, "You must specify a 'chardev'");
1124         return;
1125     }
1126
1127     ivshmem_common_realize(dev, errp);
1128 }
1129
1130 static void ivshmem_doorbell_class_init(ObjectClass *klass, void *data)
1131 {
1132     DeviceClass *dc = DEVICE_CLASS(klass);
1133     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1134
1135     k->realize = ivshmem_doorbell_realize;
1136     dc->props = ivshmem_doorbell_properties;
1137     dc->vmsd = &ivshmem_doorbell_vmsd;
1138 }
1139
/* Concrete type: ivshmem connected to an ivshmem server via chardev. */
static const TypeInfo ivshmem_doorbell_info = {
    .name          = TYPE_IVSHMEM_DOORBELL,
    .parent        = TYPE_IVSHMEM_COMMON,
    .instance_size = sizeof(IVShmemState),
    .instance_init = ivshmem_doorbell_init,
    .class_init    = ivshmem_doorbell_class_init,
};
1147
/*
 * Loader for version-0 migration streams of the legacy "ivshmem" device
 * (hooked up via ivshmem_vmsd.load_state_old).  The read order below must
 * match exactly what old QEMUs wrote: PCI config, then either MSI-X state
 * or the raw intrstatus/intrmask pair, depending on the MSI feature bit.
 * Returns 0 on success, negative errno on failure.
 */
static int ivshmem_load_old(QEMUFile *f, void *opaque, int version_id)
{
    IVShmemState *s = opaque;
    PCIDevice *pdev = PCI_DEVICE(s);
    int ret;

    IVSHMEM_DPRINTF("ivshmem_load_old\n");

    /* Only stream version 0 ever existed for the old format. */
    if (version_id != 0) {
        return -EINVAL;
    }

    /* Same master-only restriction as the vmsd pre_load hook. */
    ret = ivshmem_pre_load(s);
    if (ret) {
        return ret;
    }

    ret = pci_device_load(pdev, f);
    if (ret) {
        return ret;
    }

    if (ivshmem_has_feature(s, IVSHMEM_MSI)) {
        msix_load(pdev, f);
        ivshmem_msix_vector_use(s);
    } else {
        s->intrstatus = qemu_get_be32(f);
        s->intrmask = qemu_get_be32(f);
    }

    return 0;
}
1180
1181 static bool test_msix(void *opaque, int version_id)
1182 {
1183     IVShmemState *s = opaque;
1184
1185     return ivshmem_has_feature(s, IVSHMEM_MSI);
1186 }
1187
/* VMState field predicate: complement of test_msix(). */
static bool test_no_msix(void *opaque, int version_id)
{
    return !test_msix(opaque, version_id);
}
1192
/*
 * Migration state for the legacy "ivshmem" device.  Version 1 streams
 * carry either MSI-X state or the intrstatus/intrmask pair, selected by
 * the MSI feature bit; version 0 streams are handled by ivshmem_load_old.
 */
static const VMStateDescription ivshmem_vmsd = {
    .name = "ivshmem",
    .version_id = 1,
    .minimum_version_id = 1,
    .pre_load = ivshmem_pre_load,
    .post_load = ivshmem_post_load,
    .fields = (VMStateField[]) {
        VMSTATE_PCI_DEVICE(parent_obj, IVShmemState),

        VMSTATE_MSIX_TEST(parent_obj, IVShmemState, test_msix),
        VMSTATE_UINT32_TEST(intrstatus, IVShmemState, test_no_msix),
        VMSTATE_UINT32_TEST(intrmask, IVShmemState, test_no_msix),

        VMSTATE_END_OF_LIST()
    },
    .load_state_old = ivshmem_load_old,
    .minimum_version_id_old = 0
};
1211
/*
 * qdev properties of the deprecated legacy "ivshmem" device.  Exactly one
 * of 'shm' and 'chardev' must be set (checked in ivshmem_realize).
 */
static Property ivshmem_properties[] = {
    DEFINE_PROP_CHR("chardev", IVShmemState, server_chr),
    DEFINE_PROP_STRING("size", IVShmemState, sizearg),
    DEFINE_PROP_UINT32("vectors", IVShmemState, vectors, 1),
    DEFINE_PROP_BIT("ioeventfd", IVShmemState, features, IVSHMEM_IOEVENTFD,
                    false),
    DEFINE_PROP_BIT("msi", IVShmemState, features, IVSHMEM_MSI, true),
    DEFINE_PROP_STRING("shm", IVShmemState, shmobj),
    DEFINE_PROP_STRING("role", IVShmemState, role),
    DEFINE_PROP_UINT32("use64", IVShmemState, not_legacy_32bit, 1),
    DEFINE_PROP_END_OF_LIST(),
};
1224
1225 static void desugar_shm(IVShmemState *s)
1226 {
1227     Object *obj;
1228     char *path;
1229
1230     obj = object_new("memory-backend-file");
1231     path = g_strdup_printf("/dev/shm/%s", s->shmobj);
1232     object_property_set_str(obj, path, "mem-path", &error_abort);
1233     g_free(path);
1234     object_property_set_int(obj, s->legacy_size, "size", &error_abort);
1235     object_property_set_bool(obj, true, "share", &error_abort);
1236     object_property_add_child(OBJECT(s), "internal-shm-backend", obj,
1237                               &error_abort);
1238     user_creatable_complete(obj, &error_abort);
1239     s->hostmem = MEMORY_BACKEND(obj);
1240 }
1241
1242 static void ivshmem_realize(PCIDevice *dev, Error **errp)
1243 {
1244     IVShmemState *s = IVSHMEM_COMMON(dev);
1245
1246     if (!qtest_enabled()) {
1247         error_report("ivshmem is deprecated, please use ivshmem-plain"
1248                      " or ivshmem-doorbell instead");
1249     }
1250
1251     if (!!s->server_chr + !!s->shmobj != 1) {
1252         error_setg(errp, "You must specify either 'shm' or 'chardev'");
1253         return;
1254     }
1255
1256     if (s->sizearg == NULL) {
1257         s->legacy_size = 4 << 20; /* 4 MB default */
1258     } else {
1259         char *end;
1260         int64_t size = qemu_strtosz(s->sizearg, &end);
1261         if (size < 0 || (size_t)size != size || *end != '\0'
1262             || !is_power_of_2(size)) {
1263             error_setg(errp, "Invalid size %s", s->sizearg);
1264             return;
1265         }
1266         s->legacy_size = size;
1267     }
1268
1269     /* check that role is reasonable */
1270     if (s->role) {
1271         if (strncmp(s->role, "peer", 5) == 0) {
1272             s->master = ON_OFF_AUTO_OFF;
1273         } else if (strncmp(s->role, "master", 7) == 0) {
1274             s->master = ON_OFF_AUTO_ON;
1275         } else {
1276             error_setg(errp, "'role' must be 'peer' or 'master'");
1277             return;
1278         }
1279     } else {
1280         s->master = ON_OFF_AUTO_AUTO;
1281     }
1282
1283     if (s->shmobj) {
1284         desugar_shm(s);
1285     }
1286
1287     /*
1288      * Note: we don't use INTx with IVSHMEM_MSI at all, so this is a
1289      * bald-faced lie then.  But it's a backwards compatible lie.
1290      */
1291     pci_config_set_interrupt_pin(dev->config, 1);
1292
1293     ivshmem_common_realize(dev, errp);
1294 }
1295
1296 static void ivshmem_class_init(ObjectClass *klass, void *data)
1297 {
1298     DeviceClass *dc = DEVICE_CLASS(klass);
1299     PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
1300
1301     k->realize = ivshmem_realize;
1302     k->revision = 0;
1303     dc->desc = "Inter-VM shared memory (legacy)";
1304     dc->props = ivshmem_properties;
1305     dc->vmsd = &ivshmem_vmsd;
1306 }
1307
/* Concrete type: the deprecated legacy "ivshmem" device. */
static const TypeInfo ivshmem_info = {
    .name          = TYPE_IVSHMEM,
    .parent        = TYPE_IVSHMEM_COMMON,
    .instance_size = sizeof(IVShmemState),
    .class_init    = ivshmem_class_init,
};
1314
1315 static void ivshmem_register_types(void)
1316 {
1317     type_register_static(&ivshmem_common_info);
1318     type_register_static(&ivshmem_plain_info);
1319     type_register_static(&ivshmem_doorbell_info);
1320     type_register_static(&ivshmem_info);
1321 }
1322
/* Hook type registration into QEMU's module initialization machinery. */
type_init(ivshmem_register_types)