These changes are the raw update to the linux-4.4.6-rt14 kernel sources.
[kvmfornfv.git] / kernel / arch / x86 / kvm / assigned-dev.c
/*
 * Kernel-based Virtual Machine - device assignment support
 *
 * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/fs.h>
#include "irq.h"
#include "assigned-dev.h"
#include "trace/events/kvm.h"

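/*
 * Per-device state for a legacy assigned PCI device: identity of the host
 * device, host/guest interrupt wiring, the MSI-X entry tables and the
 * saved PCI configuration used to restore the device on de-assignment.
 */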
struct kvm_assigned_dev_kernel {
        struct kvm_irq_ack_notifier ack_notifier;
        struct list_head list;
        int assigned_dev_id;
        int host_segnr;
        int host_busnr;
        int host_devfn;
        unsigned int entries_nr;
        int host_irq;
        bool host_irq_disabled;
        bool pci_2_3;
        struct msix_entry *host_msix_entries;
        int guest_irq;
        struct msix_entry *guest_msix_entries;
        unsigned long irq_requested_type;
        int irq_source_id;
        int flags;
        struct pci_dev *dev;
        struct kvm *kvm;
        spinlock_t intx_lock;
        spinlock_t intx_mask_lock;
        char irq_name[32];
        struct pci_saved_state *pci_saved_state;
};

static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
                                                      int assigned_dev_id)
{
        struct list_head *ptr;
        struct kvm_assigned_dev_kernel *match;

        list_for_each(ptr, head) {
                match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
                if (match->assigned_dev_id == assigned_dev_id)
                        return match;
        }
        return NULL;
}

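/*
 * Translate a host MSI-X interrupt number back to its index in the
 * device's host_msix_entries[] table; returns -1 if no entry matches.
 */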
static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
                                    *assigned_dev, int irq)
{
        int i, index;
        struct msix_entry *host_msix_entries;

        host_msix_entries = assigned_dev->host_msix_entries;

        index = -1;
        for (i = 0; i < assigned_dev->entries_nr; i++)
                if (irq == host_msix_entries[i].vector) {
                        index = i;
                        break;
                }
        if (index < 0)
                printk(KERN_WARNING "Failed to find correlated MSI-X entry!\n");

        return index;
}

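/*
 * Hard-IRQ half of the threaded INTx handler for PCI 2.3 capable devices:
 * mask the interrupt at device level and wake the handler thread, which
 * performs the actual guest injection.
 */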
static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
{
        struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
        int ret;

        spin_lock(&assigned_dev->intx_lock);
        if (pci_check_and_mask_intx(assigned_dev->dev)) {
                assigned_dev->host_irq_disabled = true;
                ret = IRQ_WAKE_THREAD;
        } else
                ret = IRQ_NONE;
        spin_unlock(&assigned_dev->intx_lock);

        return ret;
}

static void
kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
                                 int vector)
{
        if (unlikely(assigned_dev->irq_requested_type &
                     KVM_DEV_IRQ_GUEST_INTX)) {
                spin_lock(&assigned_dev->intx_mask_lock);
                if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
                        kvm_set_irq(assigned_dev->kvm,
                                    assigned_dev->irq_source_id, vector, 1,
                                    false);
                spin_unlock(&assigned_dev->intx_mask_lock);
        } else
                kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
                            vector, 1, false);
}

static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
{
        struct kvm_assigned_dev_kernel *assigned_dev = dev_id;

        if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
                spin_lock_irq(&assigned_dev->intx_lock);
                disable_irq_nosync(irq);
                assigned_dev->host_irq_disabled = true;
                spin_unlock_irq(&assigned_dev->intx_lock);
        }

        kvm_assigned_dev_raise_guest_irq(assigned_dev,
                                         assigned_dev->guest_irq);

        return IRQ_HANDLED;
}

/*
 * Deliver an IRQ in an atomic context if we can, or return a failure so
 * the caller can retry in a process context.
 * Return value:
 *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
 *  Other values - No need to retry.
 */
static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
                                int level)
{
        struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
        struct kvm_kernel_irq_routing_entry *e;
        int ret = -EINVAL;
        int idx;

        trace_kvm_set_irq(irq, level, irq_source_id);

        /*
         * Injection into either PIC or IOAPIC might need to scan all CPUs,
         * which would need to be retried from thread context; when the same
         * GSI is connected to both PIC and IOAPIC, we'd have to report a
         * partial failure here.
         * Since there's no easy way to do this, we only support injecting
         * MSI, which is limited to a 1:1 GSI mapping.
         */
        idx = srcu_read_lock(&kvm->irq_srcu);
        if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
                e = &entries[0];
                ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
                                                irq, level);
        }
        srcu_read_unlock(&kvm->irq_srcu, idx);
        return ret;
}


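/*
 * The hard-IRQ halves of the MSI and MSI-X handlers below try the atomic
 * fast path first and only wake the handler thread when the injection
 * would have to sleep.
 */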
static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
{
        struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
        int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
                                       assigned_dev->irq_source_id,
                                       assigned_dev->guest_irq, 1);
        return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
}

static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
{
        struct kvm_assigned_dev_kernel *assigned_dev = dev_id;

        kvm_assigned_dev_raise_guest_irq(assigned_dev,
                                         assigned_dev->guest_irq);

        return IRQ_HANDLED;
}

static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
{
        struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
        int index = find_index_from_host_irq(assigned_dev, irq);
        u32 vector;
        int ret = 0;

        if (index >= 0) {
                vector = assigned_dev->guest_msix_entries[index].vector;
                ret = kvm_set_irq_inatomic(assigned_dev->kvm,
                                           assigned_dev->irq_source_id,
                                           vector, 1);
        }

        return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
}

static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
{
        struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
        int index = find_index_from_host_irq(assigned_dev, irq);
        u32 vector;

        if (index >= 0) {
                vector = assigned_dev->guest_msix_entries[index].vector;
                kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
        }

        return IRQ_HANDLED;
}

/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
        struct kvm_assigned_dev_kernel *dev =
                container_of(kian, struct kvm_assigned_dev_kernel,
                             ack_notifier);

        kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);

        spin_lock(&dev->intx_mask_lock);

        if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
                bool reassert = false;

                spin_lock_irq(&dev->intx_lock);
                /*
                 * The guest IRQ may be shared so this ack can come from an
                 * IRQ for another guest device.
                 */
                if (dev->host_irq_disabled) {
                        if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
                                enable_irq(dev->host_irq);
                        else if (!pci_check_and_unmask_intx(dev->dev))
                                reassert = true;
                        dev->host_irq_disabled = reassert;
                }
                spin_unlock_irq(&dev->intx_lock);

                if (reassert)
                        kvm_set_irq(dev->kvm, dev->irq_source_id,
                                    dev->guest_irq, 1, false);
        }

        spin_unlock(&dev->intx_mask_lock);
}

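/*
 * Drop the guest-side wiring: unregister the ack notifier, deassert any
 * pending guest IRQ and release the IRQ source id.
 */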
static void deassign_guest_irq(struct kvm *kvm,
                               struct kvm_assigned_dev_kernel *assigned_dev)
{
        if (assigned_dev->ack_notifier.gsi != -1)
                kvm_unregister_irq_ack_notifier(kvm,
                                                &assigned_dev->ack_notifier);

        kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
                    assigned_dev->guest_irq, 0, false);

        if (assigned_dev->irq_source_id != -1)
                kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
        assigned_dev->irq_source_id = -1;
        assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
}

/* This function implicitly holds the kvm->lock mutex due to cancel_work_sync() */
static void deassign_host_irq(struct kvm *kvm,
                              struct kvm_assigned_dev_kernel *assigned_dev)
{
        /*
         * We disable the irq here to prevent further events.
         *
         * Note that this may result in a nested disable if the interrupt
         * type is INTx, but that's OK since we are going to free it anyway.
         *
         * If this function is called as part of VM destruction, make sure
         * that the kvm state is still valid at this point, since we may
         * also have to wait on a currently running IRQ handler.
         */
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
                int i;
                for (i = 0; i < assigned_dev->entries_nr; i++)
                        disable_irq(assigned_dev->host_msix_entries[i].vector);

                for (i = 0; i < assigned_dev->entries_nr; i++)
                        free_irq(assigned_dev->host_msix_entries[i].vector,
                                 assigned_dev);

                assigned_dev->entries_nr = 0;
                kfree(assigned_dev->host_msix_entries);
                kfree(assigned_dev->guest_msix_entries);
                pci_disable_msix(assigned_dev->dev);
        } else {
                /* Deal with MSI and INTx */
                if ((assigned_dev->irq_requested_type &
                     KVM_DEV_IRQ_HOST_INTX) &&
                    (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
                        spin_lock_irq(&assigned_dev->intx_lock);
                        pci_intx(assigned_dev->dev, false);
                        spin_unlock_irq(&assigned_dev->intx_lock);
                        synchronize_irq(assigned_dev->host_irq);
                } else
                        disable_irq(assigned_dev->host_irq);

                free_irq(assigned_dev->host_irq, assigned_dev);

                if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
                        pci_disable_msi(assigned_dev->dev);
        }

        assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
}

static int kvm_deassign_irq(struct kvm *kvm,
                            struct kvm_assigned_dev_kernel *assigned_dev,
                            unsigned long irq_requested_type)
{
        unsigned long guest_irq_type, host_irq_type;

        if (!irqchip_in_kernel(kvm))
                return -EINVAL;
        /* no irq assignment to deassign */
        if (!assigned_dev->irq_requested_type)
                return -ENXIO;

        host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
        guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;

        if (host_irq_type)
                deassign_host_irq(kvm, assigned_dev);
        if (guest_irq_type)
                deassign_guest_irq(kvm, assigned_dev);

        return 0;
}

static void kvm_free_assigned_irq(struct kvm *kvm,
                                  struct kvm_assigned_dev_kernel *assigned_dev)
{
        kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
}

static void kvm_free_assigned_device(struct kvm *kvm,
                                     struct kvm_assigned_dev_kernel
                                     *assigned_dev)
{
        kvm_free_assigned_irq(kvm, assigned_dev);

        pci_reset_function(assigned_dev->dev);
        if (pci_load_and_free_saved_state(assigned_dev->dev,
                                          &assigned_dev->pci_saved_state))
                printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
                       __func__, dev_name(&assigned_dev->dev->dev));
        else
                pci_restore_state(assigned_dev->dev);

        pci_clear_dev_assigned(assigned_dev->dev);

        pci_release_regions(assigned_dev->dev);
        pci_disable_device(assigned_dev->dev);
        pci_dev_put(assigned_dev->dev);

        list_del(&assigned_dev->list);
        kfree(assigned_dev);
}

void kvm_free_all_assigned_devices(struct kvm *kvm)
{
        struct list_head *ptr, *ptr2;
        struct kvm_assigned_dev_kernel *assigned_dev;

        list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
                assigned_dev = list_entry(ptr,
                                          struct kvm_assigned_dev_kernel,
                                          list);

                kvm_free_assigned_device(kvm, assigned_dev);
        }
}

static int assigned_device_enable_host_intx(struct kvm *kvm,
                                            struct kvm_assigned_dev_kernel *dev)
{
        irq_handler_t irq_handler;
        unsigned long flags;

        dev->host_irq = dev->dev->irq;

        /*
         * We can only share the IRQ line with other host devices if we are
         * able to disable the IRQ source at device-level - independently of
         * the guest driver. Otherwise host devices may suffer from unbounded
         * IRQ latencies when the guest keeps the line asserted.
         */
        if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
                irq_handler = kvm_assigned_dev_intx;
                flags = IRQF_SHARED;
        } else {
                irq_handler = NULL;
                flags = IRQF_ONESHOT;
        }
        if (request_threaded_irq(dev->host_irq, irq_handler,
                                 kvm_assigned_dev_thread_intx, flags,
                                 dev->irq_name, dev))
                return -EIO;

        if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
                spin_lock_irq(&dev->intx_lock);
                pci_intx(dev->dev, true);
                spin_unlock_irq(&dev->intx_lock);
        }
        return 0;
}

static int assigned_device_enable_host_msi(struct kvm *kvm,
                                           struct kvm_assigned_dev_kernel *dev)
{
        int r;

        if (!dev->dev->msi_enabled) {
                r = pci_enable_msi(dev->dev);
                if (r)
                        return r;
        }

        dev->host_irq = dev->dev->irq;
        if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
                                 kvm_assigned_dev_thread_msi, 0,
                                 dev->irq_name, dev)) {
                pci_disable_msi(dev->dev);
                return -EIO;
        }

        return 0;
}

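/*
 * Enable MSI-X on the host device and request one threaded handler per
 * vector; entries_nr and the entry tables are populated beforehand via
 * the KVM_ASSIGN_SET_MSIX_NR and KVM_ASSIGN_SET_MSIX_ENTRY ioctls.
 */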
static int assigned_device_enable_host_msix(struct kvm *kvm,
                                            struct kvm_assigned_dev_kernel *dev)
{
        int i, r = -EINVAL;

        /*
         * host_msix_entries and guest_msix_entries should have been
         * initialized.
         */
        if (dev->entries_nr == 0)
                return r;

        r = pci_enable_msix_exact(dev->dev,
                                  dev->host_msix_entries, dev->entries_nr);
        if (r)
                return r;

        for (i = 0; i < dev->entries_nr; i++) {
                r = request_threaded_irq(dev->host_msix_entries[i].vector,
                                         kvm_assigned_dev_msix,
                                         kvm_assigned_dev_thread_msix,
                                         0, dev->irq_name, dev);
                if (r)
                        goto err;
        }

        return 0;
err:
        for (i -= 1; i >= 0; i--)
                free_irq(dev->host_msix_entries[i].vector, dev);
        pci_disable_msix(dev->dev);
        return r;
}

static int assigned_device_enable_guest_intx(struct kvm *kvm,
                                struct kvm_assigned_dev_kernel *dev,
                                struct kvm_assigned_irq *irq)
{
        dev->guest_irq = irq->guest_irq;
        dev->ack_notifier.gsi = irq->guest_irq;
        return 0;
}

static int assigned_device_enable_guest_msi(struct kvm *kvm,
                        struct kvm_assigned_dev_kernel *dev,
                        struct kvm_assigned_irq *irq)
{
        dev->guest_irq = irq->guest_irq;
        dev->ack_notifier.gsi = -1;
        return 0;
}

static int assigned_device_enable_guest_msix(struct kvm *kvm,
                        struct kvm_assigned_dev_kernel *dev,
                        struct kvm_assigned_irq *irq)
{
        dev->guest_irq = irq->guest_irq;
        dev->ack_notifier.gsi = -1;
        return 0;
}

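/*
 * Wire up the host side of the interrupt: dispatch to the setup routine
 * matching the requested type (INTx, MSI or MSI-X) and record what was
 * granted in irq_requested_type.
 */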
static int assign_host_irq(struct kvm *kvm,
                           struct kvm_assigned_dev_kernel *dev,
                           __u32 host_irq_type)
{
        int r = -EEXIST;

        if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
                return r;

        snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
                 pci_name(dev->dev));

        switch (host_irq_type) {
        case KVM_DEV_IRQ_HOST_INTX:
                r = assigned_device_enable_host_intx(kvm, dev);
                break;
        case KVM_DEV_IRQ_HOST_MSI:
                r = assigned_device_enable_host_msi(kvm, dev);
                break;
        case KVM_DEV_IRQ_HOST_MSIX:
                r = assigned_device_enable_host_msix(kvm, dev);
                break;
        default:
                r = -EINVAL;
        }
        dev->host_irq_disabled = false;

        if (!r)
                dev->irq_requested_type |= host_irq_type;

        return r;
}

static int assign_guest_irq(struct kvm *kvm,
                            struct kvm_assigned_dev_kernel *dev,
                            struct kvm_assigned_irq *irq,
                            unsigned long guest_irq_type)
{
        int id;
        int r = -EEXIST;

        if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
                return r;

        id = kvm_request_irq_source_id(kvm);
        if (id < 0)
                return id;

        dev->irq_source_id = id;

        switch (guest_irq_type) {
        case KVM_DEV_IRQ_GUEST_INTX:
                r = assigned_device_enable_guest_intx(kvm, dev, irq);
                break;
        case KVM_DEV_IRQ_GUEST_MSI:
                r = assigned_device_enable_guest_msi(kvm, dev, irq);
                break;
        case KVM_DEV_IRQ_GUEST_MSIX:
                r = assigned_device_enable_guest_msix(kvm, dev, irq);
                break;
        default:
                r = -EINVAL;
        }

        if (!r) {
                dev->irq_requested_type |= guest_irq_type;
                if (dev->ack_notifier.gsi != -1)
                        kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
        } else {
                kvm_free_irq_source_id(kvm, dev->irq_source_id);
                dev->irq_source_id = -1;
        }

        return r;
}

/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
                                   struct kvm_assigned_irq *assigned_irq)
{
        int r = -EINVAL;
        struct kvm_assigned_dev_kernel *match;
        unsigned long host_irq_type, guest_irq_type;

        if (!irqchip_in_kernel(kvm))
                return r;

        mutex_lock(&kvm->lock);
        r = -ENODEV;
        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      assigned_irq->assigned_dev_id);
        if (!match)
                goto out;

        host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
        guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);

        r = -EINVAL;
        /* can only assign one type at a time */
        if (hweight_long(host_irq_type) > 1)
                goto out;
        if (hweight_long(guest_irq_type) > 1)
                goto out;
        if (host_irq_type == 0 && guest_irq_type == 0)
                goto out;

        r = 0;
        if (host_irq_type)
                r = assign_host_irq(kvm, match, host_irq_type);
        if (r)
                goto out;

        if (guest_irq_type)
                r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
out:
        mutex_unlock(&kvm->lock);
        return r;
}

static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
                                         struct kvm_assigned_irq
                                         *assigned_irq)
{
        int r = -ENODEV;
        struct kvm_assigned_dev_kernel *match;
        unsigned long irq_type;

        mutex_lock(&kvm->lock);

        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      assigned_irq->assigned_dev_id);
        if (!match)
                goto out;

        irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
                                          KVM_DEV_IRQ_GUEST_MASK);
        r = kvm_deassign_irq(kvm, match, irq_type);
out:
        mutex_unlock(&kvm->lock);
        return r;
}

/*
 * We want to test whether the caller has been granted permissions to
 * use this device.  To be able to configure and control the device,
 * the user needs access to PCI configuration space and BAR resources.
 * These are accessed through PCI sysfs.  PCI config space is often
 * passed to the process calling this ioctl via file descriptor, so we
 * can't rely on access to that file.  We can check for permissions
 * on each of the BAR resource files, which is a pretty clear
 * indicator that the user has been granted access to the device.
 */
static int probe_sysfs_permissions(struct pci_dev *dev)
{
#ifdef CONFIG_SYSFS
        int i;
        bool bar_found = false;

        for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
                char *kpath, *syspath;
                struct path path;
                struct inode *inode;
                int r;

                if (!pci_resource_len(dev, i))
                        continue;

                kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
                if (!kpath)
                        return -ENOMEM;

                /* Per sysfs-rules, sysfs is always at /sys */
                syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
                kfree(kpath);
                if (!syspath)
                        return -ENOMEM;

                r = kern_path(syspath, LOOKUP_FOLLOW, &path);
                kfree(syspath);
                if (r)
                        return r;

                inode = d_backing_inode(path.dentry);

                r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
                path_put(&path);
                if (r)
                        return r;

                bar_found = true;
        }

        /* If no resources, probably something special */
        if (!bar_found)
                return -EPERM;

        return 0;
#else
        return -EINVAL; /* No way to control the device without sysfs */
#endif
}

static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
                                      struct kvm_assigned_pci_dev *assigned_dev)
{
        int r = 0, idx;
        struct kvm_assigned_dev_kernel *match;
        struct pci_dev *dev;

        if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
                return -EINVAL;

        mutex_lock(&kvm->lock);
        idx = srcu_read_lock(&kvm->srcu);

        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      assigned_dev->assigned_dev_id);
        if (match) {
                /* device already assigned */
                r = -EEXIST;
                goto out;
        }

        match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
        if (match == NULL) {
                printk(KERN_INFO "%s: Couldn't allocate memory\n",
                       __func__);
                r = -ENOMEM;
                goto out;
        }
        dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
                                   assigned_dev->busnr,
                                   assigned_dev->devfn);
        if (!dev) {
                printk(KERN_INFO "%s: host device not found\n", __func__);
                r = -EINVAL;
                goto out_free;
        }

        /* Don't allow bridges to be assigned */
        if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
                r = -EPERM;
                goto out_put;
        }

        r = probe_sysfs_permissions(dev);
        if (r)
                goto out_put;

        if (pci_enable_device(dev)) {
                printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
                r = -EBUSY;
                goto out_put;
        }
        r = pci_request_regions(dev, "kvm_assigned_device");
        if (r) {
                printk(KERN_INFO "%s: Could not get access to device regions\n",
                       __func__);
                goto out_disable;
        }

        pci_reset_function(dev);
        pci_save_state(dev);
        match->pci_saved_state = pci_store_saved_state(dev);
        if (!match->pci_saved_state)
                printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
                       __func__, dev_name(&dev->dev));

        if (!pci_intx_mask_supported(dev))
                assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;

        match->assigned_dev_id = assigned_dev->assigned_dev_id;
        match->host_segnr = assigned_dev->segnr;
        match->host_busnr = assigned_dev->busnr;
        match->host_devfn = assigned_dev->devfn;
        match->flags = assigned_dev->flags;
        match->dev = dev;
        spin_lock_init(&match->intx_lock);
        spin_lock_init(&match->intx_mask_lock);
        match->irq_source_id = -1;
        match->kvm = kvm;
        match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;

        list_add(&match->list, &kvm->arch.assigned_dev_head);

        if (!kvm->arch.iommu_domain) {
                r = kvm_iommu_map_guest(kvm);
                if (r)
                        goto out_list_del;
        }
        r = kvm_assign_device(kvm, match->dev);
        if (r)
                goto out_list_del;

out:
        srcu_read_unlock(&kvm->srcu, idx);
        mutex_unlock(&kvm->lock);
        return r;
out_list_del:
        if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
                printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
                       __func__, dev_name(&dev->dev));
        list_del(&match->list);
        pci_release_regions(dev);
out_disable:
        pci_disable_device(dev);
out_put:
        pci_dev_put(dev);
out_free:
        kfree(match);
        srcu_read_unlock(&kvm->srcu, idx);
        mutex_unlock(&kvm->lock);
        return r;
}

static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
                struct kvm_assigned_pci_dev *assigned_dev)
{
        int r = 0;
        struct kvm_assigned_dev_kernel *match;

        mutex_lock(&kvm->lock);

        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      assigned_dev->assigned_dev_id);
        if (!match) {
                printk(KERN_INFO "%s: device hasn't been assigned before, "
                  "so cannot be deassigned\n", __func__);
                r = -EINVAL;
                goto out;
        }

        kvm_deassign_device(kvm, match->dev);

        kvm_free_assigned_device(kvm, match);

out:
        mutex_unlock(&kvm->lock);
        return r;
}


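/*
 * MSI-X configuration is a two-step userspace protocol: the VMM first
 * fixes the number of vectors with KVM_ASSIGN_SET_MSIX_NR (which
 * allocates the host and guest entry tables below), then fills in each
 * entry/GSI pair with KVM_ASSIGN_SET_MSIX_ENTRY.
 */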
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
                                    struct kvm_assigned_msix_nr *entry_nr)
{
        int r = 0;
        struct kvm_assigned_dev_kernel *adev;

        mutex_lock(&kvm->lock);

        adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      entry_nr->assigned_dev_id);
        if (!adev) {
                r = -EINVAL;
                goto msix_nr_out;
        }

        if (adev->entries_nr == 0) {
                adev->entries_nr = entry_nr->entry_nr;
                if (adev->entries_nr == 0 ||
                    adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
                        r = -EINVAL;
                        goto msix_nr_out;
                }

                adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
                                                entry_nr->entry_nr,
                                                GFP_KERNEL);
                if (!adev->host_msix_entries) {
                        r = -ENOMEM;
                        goto msix_nr_out;
                }
                adev->guest_msix_entries =
                        kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
                                GFP_KERNEL);
                if (!adev->guest_msix_entries) {
                        kfree(adev->host_msix_entries);
                        r = -ENOMEM;
                        goto msix_nr_out;
                }
        } else /* Setting the MSI-X entry count twice is not allowed */
                r = -EINVAL;
msix_nr_out:
        mutex_unlock(&kvm->lock);
        return r;
}

static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
                                       struct kvm_assigned_msix_entry *entry)
{
        int r = 0, i;
        struct kvm_assigned_dev_kernel *adev;

        mutex_lock(&kvm->lock);

        adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      entry->assigned_dev_id);

        if (!adev) {
                r = -EINVAL;
                goto msix_entry_out;
        }

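        /*
         * Update the slot that already holds this entry, or claim the
         * first free one (vector == 0).
         */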
        for (i = 0; i < adev->entries_nr; i++)
                if (adev->guest_msix_entries[i].vector == 0 ||
                    adev->guest_msix_entries[i].entry == entry->entry) {
                        adev->guest_msix_entries[i].entry = entry->entry;
                        adev->guest_msix_entries[i].vector = entry->gsi;
                        adev->host_msix_entries[i].entry = entry->entry;
                        break;
                }
        if (i == adev->entries_nr) {
                r = -ENOSPC;
                goto msix_entry_out;
        }

msix_entry_out:
        mutex_unlock(&kvm->lock);

        return r;
}

static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
                struct kvm_assigned_pci_dev *assigned_dev)
{
        int r = 0;
        struct kvm_assigned_dev_kernel *match;

        mutex_lock(&kvm->lock);

        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      assigned_dev->assigned_dev_id);
        if (!match) {
                r = -ENODEV;
                goto out;
        }

        spin_lock(&match->intx_mask_lock);

        match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
        match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;

        if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
                if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
                        kvm_set_irq(match->kvm, match->irq_source_id,
                                    match->guest_irq, 0, false);
                        /*
                         * Masking at hardware-level is performed on demand,
                         * i.e. when an IRQ actually arrives at the host.
                         */
                } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
                        /*
                         * Unmask the IRQ line if required. Unmasking at
                         * device level will be performed by user space.
                         */
                        spin_lock_irq(&match->intx_lock);
                        if (match->host_irq_disabled) {
                                enable_irq(match->host_irq);
                                match->host_irq_disabled = false;
                        }
                        spin_unlock_irq(&match->intx_lock);
                }
        }

        spin_unlock(&match->intx_mask_lock);

out:
        mutex_unlock(&kvm->lock);
        return r;
}

long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                                  unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        int r;

        switch (ioctl) {
        case KVM_ASSIGN_PCI_DEVICE: {
                struct kvm_assigned_pci_dev assigned_dev;

                r = -EFAULT;
                if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
                        goto out;
                r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
                if (r)
                        goto out;
                break;
        }
        case KVM_ASSIGN_IRQ: {
                r = -EOPNOTSUPP;
                break;
        }
        case KVM_ASSIGN_DEV_IRQ: {
                struct kvm_assigned_irq assigned_irq;

                r = -EFAULT;
                if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
                        goto out;
                r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
                if (r)
                        goto out;
                break;
        }
        case KVM_DEASSIGN_DEV_IRQ: {
                struct kvm_assigned_irq assigned_irq;

                r = -EFAULT;
                if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
                        goto out;
                r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
                if (r)
                        goto out;
                break;
        }
        case KVM_DEASSIGN_PCI_DEVICE: {
                struct kvm_assigned_pci_dev assigned_dev;

                r = -EFAULT;
                if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
                        goto out;
                r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
                if (r)
                        goto out;
                break;
        }
        case KVM_ASSIGN_SET_MSIX_NR: {
                struct kvm_assigned_msix_nr entry_nr;
                r = -EFAULT;
                if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
                        goto out;
                r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
                if (r)
                        goto out;
                break;
        }
        case KVM_ASSIGN_SET_MSIX_ENTRY: {
                struct kvm_assigned_msix_entry entry;
                r = -EFAULT;
                if (copy_from_user(&entry, argp, sizeof entry))
                        goto out;
                r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
                if (r)
                        goto out;
                break;
        }
        case KVM_ASSIGN_SET_INTX_MASK: {
                struct kvm_assigned_pci_dev assigned_dev;

                r = -EFAULT;
                if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
                        goto out;
                r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
                break;
        }
        default:
                r = -ENOTTY;
                break;
        }
out:
        return r;
}
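
/*
 * Illustrative userspace usage (a sketch, not part of this file): a VMM
 * holding a VM file descriptor would drive the ioctls above roughly as
 * shown below. The struct layouts come from <linux/kvm.h>; the device id,
 * bus address and GSI are made-up examples and error handling is omitted.
 *
 *	struct kvm_assigned_pci_dev dev = {
 *		.assigned_dev_id = 1,
 *		.segnr = 0,
 *		.busnr = 3,
 *		.devfn = PCI_DEVFN(0, 0),
 *		.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU,
 *	};
 *	ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev);
 *
 *	struct kvm_assigned_irq irq = {
 *		.assigned_dev_id = 1,
 *		.guest_irq = guest_gsi,
 *		.flags = KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_MSI,
 *	};
 *	ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
 */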