[kvmfornfv.git] / kernel / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  */
19
20 #include <linux/init.h>
21 #include <linux/bitmap.h>
22 #include <linux/debugfs.h>
23 #include <linux/export.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/spinlock.h>
28 #include <linux/pci.h>
29 #include <linux/dmar.h>
30 #include <linux/dma-mapping.h>
31 #include <linux/mempool.h>
32 #include <linux/memory.h>
33 #include <linux/timer.h>
34 #include <linux/iova.h>
35 #include <linux/iommu.h>
36 #include <linux/intel-iommu.h>
37 #include <linux/syscore_ops.h>
38 #include <linux/tboot.h>
39 #include <linux/dmi.h>
40 #include <linux/pci-ats.h>
41 #include <linux/memblock.h>
42 #include <linux/dma-contiguous.h>
43 #include <asm/irq_remapping.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #include "irq_remapping.h"
48
49 #define ROOT_SIZE               VTD_PAGE_SIZE
50 #define CONTEXT_SIZE            VTD_PAGE_SIZE
51
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
54 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
55 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56
57 #define IOAPIC_RANGE_START      (0xfee00000)
58 #define IOAPIC_RANGE_END        (0xfeefffff)
59 #define IOVA_START_ADDR         (0x1000)
60
61 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62
63 #define MAX_AGAW_WIDTH 64
64 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
65
66 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
67 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68
69 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
70    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
71 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
72                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
73 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
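
/*
 * A quick worked example of the macros above, assuming the default 48-bit
 * guest address width and VTD_PAGE_SHIFT == 12 on a 64-bit kernel:
 *
 *	__DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1 == 0xFFFFFFFFFULL
 *	DOMAIN_MAX_PFN(48)   == 0xFFFFFFFFFUL	(already fits an unsigned long)
 *	DOMAIN_MAX_ADDR(48)  == 0xFFFFFFFFFULL << 12 == 0xFFFFFFFFF000ULL
 *
 * On 32-bit, DOMAIN_MAX_PFN() is clamped to (unsigned long)-1 instead.
 */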
74
75 /* IO virtual address start page frame number */
76 #define IOVA_START_PFN          (1)
77
78 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
79 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
80 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
81
82 /* page table handling */
83 #define LEVEL_STRIDE            (9)
84 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
85
86 /*
 87  * This bitmap is used to advertise the page sizes our hardware supports
88  * to the IOMMU core, which will then use this information to split
89  * physically contiguous memory regions it is mapping into page sizes
90  * that we support.
91  *
92  * Traditionally the IOMMU core just handed us the mappings directly,
93  * after making sure the size is an order of a 4KiB page and that the
94  * mapping has natural alignment.
95  *
96  * To retain this behavior, we currently advertise that we support
97  * all page sizes that are an order of 4KiB.
98  *
99  * If at some point we'd like to utilize the IOMMU core's new behavior,
100  * we could change this to advertise the real page sizes we support.
101  */
102 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
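
/*
 * To make the encoding above concrete: bit N set in the bitmap advertises
 * support for a page size of (1 << N) bytes, so ~0xFFFUL sets every bit
 * from 12 upwards, i.e. 4KiB, 8KiB, 16KiB and so on - every size that is
 * an order of 4KiB, which is what lets the core keep handing us naturally
 * aligned regions of any such size.
 */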
103
104 static inline int agaw_to_level(int agaw)
105 {
106         return agaw + 2;
107 }
108
109 static inline int agaw_to_width(int agaw)
110 {
111         return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
112 }
113
114 static inline int width_to_agaw(int width)
115 {
116         return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
117 }
118
119 static inline unsigned int level_to_offset_bits(int level)
120 {
121         return (level - 1) * LEVEL_STRIDE;
122 }
123
124 static inline int pfn_level_offset(unsigned long pfn, int level)
125 {
126         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
127 }
128
129 static inline unsigned long level_mask(int level)
130 {
131         return -1UL << level_to_offset_bits(level);
132 }
133
134 static inline unsigned long level_size(int level)
135 {
136         return 1UL << level_to_offset_bits(level);
137 }
138
139 static inline unsigned long align_to_level(unsigned long pfn, int level)
140 {
141         return (pfn + level_size(level) - 1) & level_mask(level);
142 }
143
144 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
145 {
146         return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
147 }
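
/*
 * A worked example of the level arithmetic above, assuming the default
 * 48-bit address width:
 *
 *	width_to_agaw(48)  == DIV_ROUND_UP(18, 9) == 2
 *	agaw_to_level(2)   == 4		(four-level page table)
 *	agaw_to_width(2)   == 48
 *	level_size(1)      == 1		(one 4KiB page per level-1 pte)
 *	level_size(2)      == 512	(2MiB per level-2 pte)
 *	level_size(3)      == 512 * 512	(1GiB per level-3 pte)
 *
 * pfn_level_offset() simply picks the 9 index bits for a level, i.e.
 * (pfn >> ((level - 1) * 9)) & 0x1ff.
 */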
148
149 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
150    are never going to work. */
151 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
152 {
153         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
154 }
155
156 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
157 {
158         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
159 }
160 static inline unsigned long page_to_dma_pfn(struct page *pg)
161 {
162         return mm_to_dma_pfn(page_to_pfn(pg));
163 }
164 static inline unsigned long virt_to_dma_pfn(void *p)
165 {
166         return page_to_dma_pfn(virt_to_page(p));
167 }
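
/*
 * A small illustration of the conversions above: with 4KiB MM pages
 * (PAGE_SHIFT == VTD_PAGE_SHIFT == 12), which is always the case on x86,
 * the shift is zero and mm pfns equal dma pfns.  With hypothetical 16KiB
 * MM pages, mm_to_dma_pfn() would shift left by 2, i.e. one MM page would
 * cover four VT-d pages.
 */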
168
169 /* global iommu list, set NULL for ignored DMAR units */
170 static struct intel_iommu **g_iommus;
171
172 static void __init check_tylersburg_isoch(void);
173 static int rwbf_quirk;
174
175 /*
 176  * set to 1 to panic the kernel if VT-d can't be successfully enabled
177  * (used when kernel is launched w/ TXT)
178  */
179 static int force_on = 0;
180
181 /*
182  * 0: Present
183  * 1-11: Reserved
184  * 12-63: Context Ptr (12 - (haw-1))
185  * 64-127: Reserved
186  */
187 struct root_entry {
188         u64     lo;
189         u64     hi;
190 };
191 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
192
193
194 /*
195  * low 64 bits:
196  * 0: present
197  * 1: fault processing disable
198  * 2-3: translation type
199  * 12-63: address space root
200  * high 64 bits:
201  * 0-2: address width
 202  * 3-6: avail
203  * 8-23: domain id
204  */
205 struct context_entry {
206         u64 lo;
207         u64 hi;
208 };
209
210 static inline bool context_present(struct context_entry *context)
211 {
212         return (context->lo & 1);
213 }
214 static inline void context_set_present(struct context_entry *context)
215 {
216         context->lo |= 1;
217 }
218
219 static inline void context_set_fault_enable(struct context_entry *context)
220 {
221         context->lo &= (((u64)-1) << 2) | 1;
222 }
223
224 static inline void context_set_translation_type(struct context_entry *context,
225                                                 unsigned long value)
226 {
227         context->lo &= (((u64)-1) << 4) | 3;
228         context->lo |= (value & 3) << 2;
229 }
230
231 static inline void context_set_address_root(struct context_entry *context,
232                                             unsigned long value)
233 {
234         context->lo &= ~VTD_PAGE_MASK;
235         context->lo |= value & VTD_PAGE_MASK;
236 }
237
238 static inline void context_set_address_width(struct context_entry *context,
239                                              unsigned long value)
240 {
241         context->hi |= value & 7;
242 }
243
244 static inline void context_set_domain_id(struct context_entry *context,
245                                          unsigned long value)
246 {
247         context->hi |= (value & ((1 << 16) - 1)) << 8;
248 }
249
250 static inline void context_clear_entry(struct context_entry *context)
251 {
252         context->lo = 0;
253         context->hi = 0;
254 }
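
/*
 * A minimal sketch (not built, and not part of the driver) of how the
 * helpers above compose a context entry; the parameters are illustrative,
 * and translation type 0 is assumed to mean the ordinary multi-level
 * page-table type.
 */
#if 0
static void example_fill_context(struct context_entry *context, u16 domain_id,
				 int agaw, unsigned long pgd_phys)
{
	context_clear_entry(context);
	context_set_domain_id(context, domain_id);
	context_set_address_width(context, agaw);
	context_set_address_root(context, pgd_phys);	/* page-aligned PA */
	context_set_translation_type(context, 0);
	context_set_fault_enable(context);		/* clears the FPD bit */
	context_set_present(context);
}
#endif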
255
256 /*
257  * 0: readable
258  * 1: writable
259  * 2-6: reserved
260  * 7: super page
261  * 8-10: available
262  * 11: snoop behavior
 263  * 12-63: Host physical address
264  */
265 struct dma_pte {
266         u64 val;
267 };
268
269 static inline void dma_clear_pte(struct dma_pte *pte)
270 {
271         pte->val = 0;
272 }
273
274 static inline u64 dma_pte_addr(struct dma_pte *pte)
275 {
276 #ifdef CONFIG_64BIT
277         return pte->val & VTD_PAGE_MASK;
278 #else
279         /* Must have a full atomic 64-bit read */
280         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
281 #endif
282 }
283
284 static inline bool dma_pte_present(struct dma_pte *pte)
285 {
286         return (pte->val & 3) != 0;
287 }
288
289 static inline bool dma_pte_superpage(struct dma_pte *pte)
290 {
291         return (pte->val & DMA_PTE_LARGE_PAGE);
292 }
293
294 static inline int first_pte_in_page(struct dma_pte *pte)
295 {
296         return !((unsigned long)pte & ~VTD_PAGE_MASK);
297 }
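
/*
 * Note on first_pte_in_page(): a pte table occupies one VTD_PAGE_SIZE page
 * holding 512 eight-byte entries, so the test above is true exactly when a
 * struct dma_pte pointer is page aligned.  The walk loops further down use
 * !first_pte_in_page(++pte) as their "still inside this table" condition.
 */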
298
299 /*
300  * This domain is a statically identity mapping domain.
 301  *      1. This domain creates a static 1:1 mapping to all usable memory.
 302  *      2. It maps to each iommu if successful.
 303  *      3. Each iommu maps to this domain if successful.
304  */
305 static struct dmar_domain *si_domain;
306 static int hw_pass_through = 1;
307
 308 /* domain represents a virtual machine; more than one device
 309  * across iommus may be owned by one domain, e.g. a kvm guest.
310  */
311 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
312
 313 /* si_domain contains multiple devices */
314 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
315
316 struct dmar_domain {
317         int     id;                     /* domain id */
318         int     nid;                    /* node id */
319         DECLARE_BITMAP(iommu_bmp, DMAR_UNITS_SUPPORTED);
320                                         /* bitmap of iommus this domain uses*/
321
322         struct list_head devices;       /* all devices' list */
323         struct iova_domain iovad;       /* iova's that belong to this domain */
324
325         struct dma_pte  *pgd;           /* virtual address */
326         int             gaw;            /* max guest address width */
327
328         /* adjusted guest address width, 0 is level 2 30-bit */
329         int             agaw;
330
331         int             flags;          /* flags to find out type of domain */
332
333         int             iommu_coherency;/* indicate coherency of iommu access */
334         int             iommu_snooping; /* indicate snooping control feature*/
335         int             iommu_count;    /* reference count of iommu */
336         int             iommu_superpage;/* Level of superpages supported:
337                                            0 == 4KiB (no superpages), 1 == 2MiB,
338                                            2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
339         spinlock_t      iommu_lock;     /* protect iommu set in domain */
340         u64             max_addr;       /* maximum mapped address */
341
342         struct iommu_domain domain;     /* generic domain data structure for
343                                            iommu core */
344 };
345
346 /* PCI domain-device relationship */
347 struct device_domain_info {
348         struct list_head link;  /* link to domain siblings */
349         struct list_head global; /* link to global list */
350         u8 bus;                 /* PCI bus number */
351         u8 devfn;               /* PCI devfn number */
352         struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
353         struct intel_iommu *iommu; /* IOMMU used by this device */
354         struct dmar_domain *domain; /* pointer to domain */
355 };
356
357 struct dmar_rmrr_unit {
358         struct list_head list;          /* list of rmrr units   */
359         struct acpi_dmar_header *hdr;   /* ACPI header          */
360         u64     base_address;           /* reserved base address*/
361         u64     end_address;            /* reserved end address */
362         struct dmar_dev_scope *devices; /* target devices */
363         int     devices_cnt;            /* target device count */
364 };
365
366 struct dmar_atsr_unit {
367         struct list_head list;          /* list of ATSR units */
368         struct acpi_dmar_header *hdr;   /* ACPI header */
369         struct dmar_dev_scope *devices; /* target devices */
370         int devices_cnt;                /* target device count */
371         u8 include_all:1;               /* include all ports */
372 };
373
374 static LIST_HEAD(dmar_atsr_units);
375 static LIST_HEAD(dmar_rmrr_units);
376
377 #define for_each_rmrr_units(rmrr) \
378         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
379
380 static void flush_unmaps_timeout(unsigned long data);
381
382 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
383
384 #define HIGH_WATER_MARK 250
385 struct deferred_flush_tables {
386         int next;
387         struct iova *iova[HIGH_WATER_MARK];
388         struct dmar_domain *domain[HIGH_WATER_MARK];
389         struct page *freelist[HIGH_WATER_MARK];
390 };
391
392 static struct deferred_flush_tables *deferred_flush;
393
394 /* bitmap for indexing intel_iommus */
395 static int g_num_of_iommus;
396
397 static DEFINE_SPINLOCK(async_umap_flush_lock);
398 static LIST_HEAD(unmaps_to_do);
399
400 static int timer_on;
401 static long list_size;
402
403 static void domain_exit(struct dmar_domain *domain);
404 static void domain_remove_dev_info(struct dmar_domain *domain);
405 static void domain_remove_one_dev_info(struct dmar_domain *domain,
406                                        struct device *dev);
407 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
408                                            struct device *dev);
409 static int domain_detach_iommu(struct dmar_domain *domain,
410                                struct intel_iommu *iommu);
411
412 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
413 int dmar_disabled = 0;
414 #else
415 int dmar_disabled = 1;
416 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
417
418 int intel_iommu_enabled = 0;
419 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
420
421 static int dmar_map_gfx = 1;
422 static int dmar_forcedac;
423 static int intel_iommu_strict;
424 static int intel_iommu_superpage = 1;
425 static int intel_iommu_ecs = 1;
426
427 /* We only actually use ECS when PASID support (on the new bit 40)
428  * is also advertised. Some early implementations — the ones with
429  * PASID support on bit 28 — have issues even when we *only* use
430  * extended root/context tables. */
431 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
432                             ecap_pasid(iommu->ecap))
433
434 int intel_iommu_gfx_mapped;
435 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
436
437 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
438 static DEFINE_SPINLOCK(device_domain_lock);
439 static LIST_HEAD(device_domain_list);
440
441 static const struct iommu_ops intel_iommu_ops;
442
 443 /* Convert generic 'struct iommu_domain' to private struct dmar_domain */
444 static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
445 {
446         return container_of(dom, struct dmar_domain, domain);
447 }
448
449 static int __init intel_iommu_setup(char *str)
450 {
451         if (!str)
452                 return -EINVAL;
453         while (*str) {
454                 if (!strncmp(str, "on", 2)) {
455                         dmar_disabled = 0;
456                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
457                 } else if (!strncmp(str, "off", 3)) {
458                         dmar_disabled = 1;
459                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
460                 } else if (!strncmp(str, "igfx_off", 8)) {
461                         dmar_map_gfx = 0;
462                         printk(KERN_INFO
463                                 "Intel-IOMMU: disable GFX device mapping\n");
464                 } else if (!strncmp(str, "forcedac", 8)) {
465                         printk(KERN_INFO
466                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
467                         dmar_forcedac = 1;
468                 } else if (!strncmp(str, "strict", 6)) {
469                         printk(KERN_INFO
470                                 "Intel-IOMMU: disable batched IOTLB flush\n");
471                         intel_iommu_strict = 1;
472                 } else if (!strncmp(str, "sp_off", 6)) {
473                         printk(KERN_INFO
474                                 "Intel-IOMMU: disable supported super page\n");
475                         intel_iommu_superpage = 0;
476                 } else if (!strncmp(str, "ecs_off", 7)) {
477                         printk(KERN_INFO
478                                 "Intel-IOMMU: disable extended context table support\n");
479                         intel_iommu_ecs = 0;
480                 }
481
482                 str += strcspn(str, ",");
483                 while (*str == ',')
484                         str++;
485         }
486         return 0;
487 }
488 __setup("intel_iommu=", intel_iommu_setup);
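
/*
 * Usage sketch for the parser above: options are comma separated on the
 * kernel command line, e.g.
 *
 *	intel_iommu=on,strict,sp_off
 *
 * which enables the IOMMU, disables batched IOTLB flushing and disables
 * super page support.
 */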
489
490 static struct kmem_cache *iommu_domain_cache;
491 static struct kmem_cache *iommu_devinfo_cache;
492
493 static inline void *alloc_pgtable_page(int node)
494 {
495         struct page *page;
496         void *vaddr = NULL;
497
498         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
499         if (page)
500                 vaddr = page_address(page);
501         return vaddr;
502 }
503
504 static inline void free_pgtable_page(void *vaddr)
505 {
506         free_page((unsigned long)vaddr);
507 }
508
509 static inline void *alloc_domain_mem(void)
510 {
511         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
512 }
513
514 static void free_domain_mem(void *vaddr)
515 {
516         kmem_cache_free(iommu_domain_cache, vaddr);
517 }
518
519 static inline void * alloc_devinfo_mem(void)
520 {
521         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
522 }
523
524 static inline void free_devinfo_mem(void *vaddr)
525 {
526         kmem_cache_free(iommu_devinfo_cache, vaddr);
527 }
528
529 static inline int domain_type_is_vm(struct dmar_domain *domain)
530 {
531         return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
532 }
533
534 static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
535 {
536         return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
537                                 DOMAIN_FLAG_STATIC_IDENTITY);
538 }
539
540 static inline int domain_pfn_supported(struct dmar_domain *domain,
541                                        unsigned long pfn)
542 {
543         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
544
545         return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
546 }
547
548 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
549 {
550         unsigned long sagaw;
551         int agaw = -1;
552
553         sagaw = cap_sagaw(iommu->cap);
554         for (agaw = width_to_agaw(max_gaw);
555              agaw >= 0; agaw--) {
556                 if (test_bit(agaw, &sagaw))
557                         break;
558         }
559
560         return agaw;
561 }
562
563 /*
564  * Calculate max SAGAW for each iommu.
565  */
566 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
567 {
568         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
569 }
570
571 /*
 572  * Calculate agaw for each iommu.
 573  * "SAGAW" may be different across iommus; use a default agaw, and
 574  * fall back to a smaller supported agaw for iommus that don't support it.
575  */
576 int iommu_calculate_agaw(struct intel_iommu *iommu)
577 {
578         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
579 }
580
 581 /* This function only returns a single iommu in a domain */
582 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
583 {
584         int iommu_id;
585
586         /* si_domain and vm domain should not get here. */
587         BUG_ON(domain_type_is_vm_or_si(domain));
588         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
589         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
590                 return NULL;
591
592         return g_iommus[iommu_id];
593 }
594
595 static void domain_update_iommu_coherency(struct dmar_domain *domain)
596 {
597         struct dmar_drhd_unit *drhd;
598         struct intel_iommu *iommu;
599         bool found = false;
600         int i;
601
602         domain->iommu_coherency = 1;
603
604         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
605                 found = true;
606                 if (!ecap_coherent(g_iommus[i]->ecap)) {
607                         domain->iommu_coherency = 0;
608                         break;
609                 }
610         }
611         if (found)
612                 return;
613
614         /* No hardware attached; use lowest common denominator */
615         rcu_read_lock();
616         for_each_active_iommu(iommu, drhd) {
617                 if (!ecap_coherent(iommu->ecap)) {
618                         domain->iommu_coherency = 0;
619                         break;
620                 }
621         }
622         rcu_read_unlock();
623 }
624
625 static int domain_update_iommu_snooping(struct intel_iommu *skip)
626 {
627         struct dmar_drhd_unit *drhd;
628         struct intel_iommu *iommu;
629         int ret = 1;
630
631         rcu_read_lock();
632         for_each_active_iommu(iommu, drhd) {
633                 if (iommu != skip) {
634                         if (!ecap_sc_support(iommu->ecap)) {
635                                 ret = 0;
636                                 break;
637                         }
638                 }
639         }
640         rcu_read_unlock();
641
642         return ret;
643 }
644
645 static int domain_update_iommu_superpage(struct intel_iommu *skip)
646 {
647         struct dmar_drhd_unit *drhd;
648         struct intel_iommu *iommu;
649         int mask = 0xf;
650
651         if (!intel_iommu_superpage) {
652                 return 0;
653         }
654
655         /* set iommu_superpage to the smallest common denominator */
656         rcu_read_lock();
657         for_each_active_iommu(iommu, drhd) {
658                 if (iommu != skip) {
659                         mask &= cap_super_page_val(iommu->cap);
660                         if (!mask)
661                                 break;
662                 }
663         }
664         rcu_read_unlock();
665
666         return fls(mask);
667 }
668
669 /* Some capabilities may be different across iommus */
670 static void domain_update_iommu_cap(struct dmar_domain *domain)
671 {
672         domain_update_iommu_coherency(domain);
673         domain->iommu_snooping = domain_update_iommu_snooping(NULL);
674         domain->iommu_superpage = domain_update_iommu_superpage(NULL);
675 }
676
677 static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
678                                                        u8 bus, u8 devfn, int alloc)
679 {
680         struct root_entry *root = &iommu->root_entry[bus];
681         struct context_entry *context;
682         u64 *entry;
683
684         entry = &root->lo;
685         if (ecs_enabled(iommu)) {
686                 if (devfn >= 0x80) {
687                         devfn -= 0x80;
688                         entry = &root->hi;
689                 }
690                 devfn *= 2;
691         }
692         if (*entry & 1)
693                 context = phys_to_virt(*entry & VTD_PAGE_MASK);
694         else {
695                 unsigned long phy_addr;
696                 if (!alloc)
697                         return NULL;
698
699                 context = alloc_pgtable_page(iommu->node);
700                 if (!context)
701                         return NULL;
702
703                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
704                 phy_addr = virt_to_phys((void *)context);
705                 *entry = phy_addr | 1;
706                 __iommu_flush_cache(iommu, entry, sizeof(*entry));
707         }
708         return &context[devfn];
709 }
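
/*
 * A short illustration of the lookup above in extended (ECS) mode: each
 * root entry then covers only 128 devfns - root->lo for devfn 0x00-0x7f,
 * root->hi for devfn 0x80-0xff - and extended context entries are twice
 * the size, so the index is doubled to land on the right pair of 128-bit
 * slots.  In legacy mode root->lo alone points to a table of 256 context
 * entries.
 */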
710
711 static int iommu_dummy(struct device *dev)
712 {
713         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
714 }
715
716 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
717 {
718         struct dmar_drhd_unit *drhd = NULL;
719         struct intel_iommu *iommu;
720         struct device *tmp;
721         struct pci_dev *ptmp, *pdev = NULL;
722         u16 segment = 0;
723         int i;
724
725         if (iommu_dummy(dev))
726                 return NULL;
727
728         if (dev_is_pci(dev)) {
729                 pdev = to_pci_dev(dev);
730                 segment = pci_domain_nr(pdev->bus);
731         } else if (has_acpi_companion(dev))
732                 dev = &ACPI_COMPANION(dev)->dev;
733
734         rcu_read_lock();
735         for_each_active_iommu(iommu, drhd) {
736                 if (pdev && segment != drhd->segment)
737                         continue;
738
739                 for_each_active_dev_scope(drhd->devices,
740                                           drhd->devices_cnt, i, tmp) {
741                         if (tmp == dev) {
742                                 *bus = drhd->devices[i].bus;
743                                 *devfn = drhd->devices[i].devfn;
744                                 goto out;
745                         }
746
747                         if (!pdev || !dev_is_pci(tmp))
748                                 continue;
749
750                         ptmp = to_pci_dev(tmp);
751                         if (ptmp->subordinate &&
752                             ptmp->subordinate->number <= pdev->bus->number &&
753                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
754                                 goto got_pdev;
755                 }
756
757                 if (pdev && drhd->include_all) {
758                 got_pdev:
759                         *bus = pdev->bus->number;
760                         *devfn = pdev->devfn;
761                         goto out;
762                 }
763         }
764         iommu = NULL;
765  out:
766         rcu_read_unlock();
767
768         return iommu;
769 }
770
771 static void domain_flush_cache(struct dmar_domain *domain,
772                                void *addr, int size)
773 {
774         if (!domain->iommu_coherency)
775                 clflush_cache_range(addr, size);
776 }
777
778 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
779 {
780         struct context_entry *context;
781         int ret = 0;
782         unsigned long flags;
783
784         spin_lock_irqsave(&iommu->lock, flags);
785         context = iommu_context_addr(iommu, bus, devfn, 0);
786         if (context)
787                 ret = context_present(context);
788         spin_unlock_irqrestore(&iommu->lock, flags);
789         return ret;
790 }
791
792 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
793 {
794         struct context_entry *context;
795         unsigned long flags;
796
797         spin_lock_irqsave(&iommu->lock, flags);
798         context = iommu_context_addr(iommu, bus, devfn, 0);
799         if (context) {
800                 context_clear_entry(context);
801                 __iommu_flush_cache(iommu, context, sizeof(*context));
802         }
803         spin_unlock_irqrestore(&iommu->lock, flags);
804 }
805
806 static void free_context_table(struct intel_iommu *iommu)
807 {
808         int i;
809         unsigned long flags;
810         struct context_entry *context;
811
812         spin_lock_irqsave(&iommu->lock, flags);
813         if (!iommu->root_entry) {
814                 goto out;
815         }
816         for (i = 0; i < ROOT_ENTRY_NR; i++) {
817                 context = iommu_context_addr(iommu, i, 0, 0);
818                 if (context)
819                         free_pgtable_page(context);
820
821                 if (!ecs_enabled(iommu))
822                         continue;
823
824                 context = iommu_context_addr(iommu, i, 0x80, 0);
825                 if (context)
826                         free_pgtable_page(context);
827
828         }
829         free_pgtable_page(iommu->root_entry);
830         iommu->root_entry = NULL;
831 out:
832         spin_unlock_irqrestore(&iommu->lock, flags);
833 }
834
835 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
836                                       unsigned long pfn, int *target_level)
837 {
838         struct dma_pte *parent, *pte = NULL;
839         int level = agaw_to_level(domain->agaw);
840         int offset;
841
842         BUG_ON(!domain->pgd);
843
844         if (!domain_pfn_supported(domain, pfn))
845                 /* Address beyond IOMMU's addressing capabilities. */
846                 return NULL;
847
848         parent = domain->pgd;
849
850         while (1) {
851                 void *tmp_page;
852
853                 offset = pfn_level_offset(pfn, level);
854                 pte = &parent[offset];
855                 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
856                         break;
857                 if (level == *target_level)
858                         break;
859
860                 if (!dma_pte_present(pte)) {
861                         uint64_t pteval;
862
863                         tmp_page = alloc_pgtable_page(domain->nid);
864
865                         if (!tmp_page)
866                                 return NULL;
867
868                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
869                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
870                         if (cmpxchg64(&pte->val, 0ULL, pteval))
871                                 /* Someone else set it while we were thinking; use theirs. */
872                                 free_pgtable_page(tmp_page);
873                         else
874                                 domain_flush_cache(domain, pte, sizeof(*pte));
875                 }
876                 if (level == 1)
877                         break;
878
879                 parent = phys_to_virt(dma_pte_addr(pte));
880                 level--;
881         }
882
883         if (!*target_level)
884                 *target_level = level;
885
886         return pte;
887 }
888
889
890 /* return address's pte at specific level */
891 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
892                                          unsigned long pfn,
893                                          int level, int *large_page)
894 {
895         struct dma_pte *parent, *pte = NULL;
896         int total = agaw_to_level(domain->agaw);
897         int offset;
898
899         parent = domain->pgd;
900         while (level <= total) {
901                 offset = pfn_level_offset(pfn, total);
902                 pte = &parent[offset];
903                 if (level == total)
904                         return pte;
905
906                 if (!dma_pte_present(pte)) {
907                         *large_page = total;
908                         break;
909                 }
910
911                 if (dma_pte_superpage(pte)) {
912                         *large_page = total;
913                         return pte;
914                 }
915
916                 parent = phys_to_virt(dma_pte_addr(pte));
917                 total--;
918         }
919         return NULL;
920 }
921
 922 /* clear last level pte; a tlb flush should follow */
923 static void dma_pte_clear_range(struct dmar_domain *domain,
924                                 unsigned long start_pfn,
925                                 unsigned long last_pfn)
926 {
927         unsigned int large_page = 1;
928         struct dma_pte *first_pte, *pte;
929
930         BUG_ON(!domain_pfn_supported(domain, start_pfn));
931         BUG_ON(!domain_pfn_supported(domain, last_pfn));
932         BUG_ON(start_pfn > last_pfn);
933
934         /* we don't need lock here; nobody else touches the iova range */
935         do {
936                 large_page = 1;
937                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
938                 if (!pte) {
939                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
940                         continue;
941                 }
942                 do {
943                         dma_clear_pte(pte);
944                         start_pfn += lvl_to_nr_pages(large_page);
945                         pte++;
946                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
947
948                 domain_flush_cache(domain, first_pte,
949                                    (void *)pte - (void *)first_pte);
950
951         } while (start_pfn && start_pfn <= last_pfn);
952 }
953
954 static void dma_pte_free_level(struct dmar_domain *domain, int level,
955                                struct dma_pte *pte, unsigned long pfn,
956                                unsigned long start_pfn, unsigned long last_pfn)
957 {
958         pfn = max(start_pfn, pfn);
959         pte = &pte[pfn_level_offset(pfn, level)];
960
961         do {
962                 unsigned long level_pfn;
963                 struct dma_pte *level_pte;
964
965                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
966                         goto next;
967
968                 level_pfn = pfn & level_mask(level - 1);
969                 level_pte = phys_to_virt(dma_pte_addr(pte));
970
971                 if (level > 2)
972                         dma_pte_free_level(domain, level - 1, level_pte,
973                                            level_pfn, start_pfn, last_pfn);
974
975                 /* If range covers entire pagetable, free it */
976                 if (!(start_pfn > level_pfn ||
977                       last_pfn < level_pfn + level_size(level) - 1)) {
978                         dma_clear_pte(pte);
979                         domain_flush_cache(domain, pte, sizeof(*pte));
980                         free_pgtable_page(level_pte);
981                 }
982 next:
983                 pfn += level_size(level);
984         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
985 }
986
987 /* free page table pages. last level pte should already be cleared */
988 static void dma_pte_free_pagetable(struct dmar_domain *domain,
989                                    unsigned long start_pfn,
990                                    unsigned long last_pfn)
991 {
992         BUG_ON(!domain_pfn_supported(domain, start_pfn));
993         BUG_ON(!domain_pfn_supported(domain, last_pfn));
994         BUG_ON(start_pfn > last_pfn);
995
996         dma_pte_clear_range(domain, start_pfn, last_pfn);
997
998         /* We don't need lock here; nobody else touches the iova range */
999         dma_pte_free_level(domain, agaw_to_level(domain->agaw),
1000                            domain->pgd, 0, start_pfn, last_pfn);
1001
1002         /* free pgd */
1003         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1004                 free_pgtable_page(domain->pgd);
1005                 domain->pgd = NULL;
1006         }
1007 }
1008
1009 /* When a page at a given level is being unlinked from its parent, we don't
1010    need to *modify* it at all. All we need to do is make a list of all the
1011    pages which can be freed just as soon as we've flushed the IOTLB and we
1012    know the hardware page-walk will no longer touch them.
1013    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1014    be freed. */
1015 static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
1016                                             int level, struct dma_pte *pte,
1017                                             struct page *freelist)
1018 {
1019         struct page *pg;
1020
1021         pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1022         pg->freelist = freelist;
1023         freelist = pg;
1024
1025         if (level == 1)
1026                 return freelist;
1027
1028         pte = page_address(pg);
1029         do {
1030                 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1031                         freelist = dma_pte_list_pagetables(domain, level - 1,
1032                                                            pte, freelist);
1033                 pte++;
1034         } while (!first_pte_in_page(pte));
1035
1036         return freelist;
1037 }
1038
1039 static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
1040                                         struct dma_pte *pte, unsigned long pfn,
1041                                         unsigned long start_pfn,
1042                                         unsigned long last_pfn,
1043                                         struct page *freelist)
1044 {
1045         struct dma_pte *first_pte = NULL, *last_pte = NULL;
1046
1047         pfn = max(start_pfn, pfn);
1048         pte = &pte[pfn_level_offset(pfn, level)];
1049
1050         do {
1051                 unsigned long level_pfn;
1052
1053                 if (!dma_pte_present(pte))
1054                         goto next;
1055
1056                 level_pfn = pfn & level_mask(level);
1057
1058                 /* If range covers entire pagetable, free it */
1059                 if (start_pfn <= level_pfn &&
1060                     last_pfn >= level_pfn + level_size(level) - 1) {
 1061                         /* These subordinate page tables are going away entirely. Don't
1062                            bother to clear them; we're just going to *free* them. */
1063                         if (level > 1 && !dma_pte_superpage(pte))
1064                                 freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1065
1066                         dma_clear_pte(pte);
1067                         if (!first_pte)
1068                                 first_pte = pte;
1069                         last_pte = pte;
1070                 } else if (level > 1) {
1071                         /* Recurse down into a level that isn't *entirely* obsolete */
1072                         freelist = dma_pte_clear_level(domain, level - 1,
1073                                                        phys_to_virt(dma_pte_addr(pte)),
1074                                                        level_pfn, start_pfn, last_pfn,
1075                                                        freelist);
1076                 }
1077 next:
1078                 pfn += level_size(level);
1079         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1080
1081         if (first_pte)
1082                 domain_flush_cache(domain, first_pte,
1083                                    (void *)++last_pte - (void *)first_pte);
1084
1085         return freelist;
1086 }
1087
1088 /* We can't just free the pages because the IOMMU may still be walking
1089    the page tables, and may have cached the intermediate levels. The
1090    pages can only be freed after the IOTLB flush has been done. */
1091 struct page *domain_unmap(struct dmar_domain *domain,
1092                           unsigned long start_pfn,
1093                           unsigned long last_pfn)
1094 {
1095         struct page *freelist = NULL;
1096
1097         BUG_ON(!domain_pfn_supported(domain, start_pfn));
1098         BUG_ON(!domain_pfn_supported(domain, last_pfn));
1099         BUG_ON(start_pfn > last_pfn);
1100
1101         /* we don't need lock here; nobody else touches the iova range */
1102         freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1103                                        domain->pgd, 0, start_pfn, last_pfn, NULL);
1104
1105         /* free pgd */
1106         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1107                 struct page *pgd_page = virt_to_page(domain->pgd);
1108                 pgd_page->freelist = freelist;
1109                 freelist = pgd_page;
1110
1111                 domain->pgd = NULL;
1112         }
1113
1114         return freelist;
1115 }
1116
1117 void dma_free_pagelist(struct page *freelist)
1118 {
1119         struct page *pg;
1120
1121         while ((pg = freelist)) {
1122                 freelist = pg->freelist;
1123                 free_pgtable_page(page_address(pg));
1124         }
1125 }
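
/*
 * The intended calling pattern for the two functions above, as a sketch:
 * collect the page-table pages with domain_unmap(), flush the IOTLB so the
 * hardware can no longer walk them, and only then hand the pages back:
 *
 *	freelist = domain_unmap(domain, start_pfn, last_pfn);
 *	iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
 *			      last_pfn - start_pfn + 1, 0, 0);
 *	dma_free_pagelist(freelist);
 */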
1126
1127 /* iommu handling */
1128 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1129 {
1130         struct root_entry *root;
1131         unsigned long flags;
1132
1133         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1134         if (!root) {
1135                 pr_err("IOMMU: allocating root entry for %s failed\n",
1136                         iommu->name);
1137                 return -ENOMEM;
1138         }
1139
1140         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1141
1142         spin_lock_irqsave(&iommu->lock, flags);
1143         iommu->root_entry = root;
1144         spin_unlock_irqrestore(&iommu->lock, flags);
1145
1146         return 0;
1147 }
1148
1149 static void iommu_set_root_entry(struct intel_iommu *iommu)
1150 {
1151         u64 addr;
1152         u32 sts;
1153         unsigned long flag;
1154
1155         addr = virt_to_phys(iommu->root_entry);
1156         if (ecs_enabled(iommu))
1157                 addr |= DMA_RTADDR_RTT;
1158
1159         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1160         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1161
1162         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1163
 1164         /* Make sure hardware completes it */
1165         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1166                       readl, (sts & DMA_GSTS_RTPS), sts);
1167
1168         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1169 }
1170
1171 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
1172 {
1173         u32 val;
1174         unsigned long flag;
1175
1176         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1177                 return;
1178
1179         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1180         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1181
 1182         /* Make sure hardware completes it */
1183         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1184                       readl, (!(val & DMA_GSTS_WBFS)), val);
1185
1186         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1187 }
1188
 1189 /* return value determines if we need a write buffer flush */
1190 static void __iommu_flush_context(struct intel_iommu *iommu,
1191                                   u16 did, u16 source_id, u8 function_mask,
1192                                   u64 type)
1193 {
1194         u64 val = 0;
1195         unsigned long flag;
1196
1197         switch (type) {
1198         case DMA_CCMD_GLOBAL_INVL:
1199                 val = DMA_CCMD_GLOBAL_INVL;
1200                 break;
1201         case DMA_CCMD_DOMAIN_INVL:
1202                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1203                 break;
1204         case DMA_CCMD_DEVICE_INVL:
1205                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1206                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1207                 break;
1208         default:
1209                 BUG();
1210         }
1211         val |= DMA_CCMD_ICC;
1212
1213         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1214         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1215
 1216         /* Make sure hardware completes it */
1217         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1218                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1219
1220         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1221 }
1222
 1223 /* return value determines if we need a write buffer flush */
1224 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1225                                 u64 addr, unsigned int size_order, u64 type)
1226 {
1227         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1228         u64 val = 0, val_iva = 0;
1229         unsigned long flag;
1230
1231         switch (type) {
1232         case DMA_TLB_GLOBAL_FLUSH:
 1233                 /* global flush doesn't need to set IVA_REG */
1234                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1235                 break;
1236         case DMA_TLB_DSI_FLUSH:
1237                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1238                 break;
1239         case DMA_TLB_PSI_FLUSH:
1240                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1241                 /* IH bit is passed in as part of address */
1242                 val_iva = size_order | addr;
1243                 break;
1244         default:
1245                 BUG();
1246         }
1247         /* Note: set drain read/write */
1248 #if 0
1249         /*
1250          * This is probably to be super secure.. Looks like we can
1251          * ignore it without any impact.
1252          */
1253         if (cap_read_drain(iommu->cap))
1254                 val |= DMA_TLB_READ_DRAIN;
1255 #endif
1256         if (cap_write_drain(iommu->cap))
1257                 val |= DMA_TLB_WRITE_DRAIN;
1258
1259         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1260         /* Note: Only uses first TLB reg currently */
1261         if (val_iva)
1262                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1263         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1264
 1265         /* Make sure hardware completes it */
1266         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1267                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1268
1269         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1270
1271         /* check IOTLB invalidation granularity */
1272         if (DMA_TLB_IAIG(val) == 0)
1273                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1274         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1275                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1276                         (unsigned long long)DMA_TLB_IIRG(type),
1277                         (unsigned long long)DMA_TLB_IAIG(val));
1278 }
1279
1280 static struct device_domain_info *
1281 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1282                          u8 bus, u8 devfn)
1283 {
1284         bool found = false;
1285         unsigned long flags;
1286         struct device_domain_info *info;
1287         struct pci_dev *pdev;
1288
1289         if (!ecap_dev_iotlb_support(iommu->ecap))
1290                 return NULL;
1291
1292         if (!iommu->qi)
1293                 return NULL;
1294
1295         spin_lock_irqsave(&device_domain_lock, flags);
1296         list_for_each_entry(info, &domain->devices, link)
1297                 if (info->iommu == iommu && info->bus == bus &&
1298                     info->devfn == devfn) {
1299                         found = true;
1300                         break;
1301                 }
1302         spin_unlock_irqrestore(&device_domain_lock, flags);
1303
1304         if (!found || !info->dev || !dev_is_pci(info->dev))
1305                 return NULL;
1306
1307         pdev = to_pci_dev(info->dev);
1308
1309         if (!pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS))
1310                 return NULL;
1311
1312         if (!dmar_find_matched_atsr_unit(pdev))
1313                 return NULL;
1314
1315         return info;
1316 }
1317
1318 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1319 {
1320         if (!info || !dev_is_pci(info->dev))
1321                 return;
1322
1323         pci_enable_ats(to_pci_dev(info->dev), VTD_PAGE_SHIFT);
1324 }
1325
1326 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1327 {
1328         if (!info->dev || !dev_is_pci(info->dev) ||
1329             !pci_ats_enabled(to_pci_dev(info->dev)))
1330                 return;
1331
1332         pci_disable_ats(to_pci_dev(info->dev));
1333 }
1334
1335 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1336                                   u64 addr, unsigned mask)
1337 {
1338         u16 sid, qdep;
1339         unsigned long flags;
1340         struct device_domain_info *info;
1341
1342         spin_lock_irqsave(&device_domain_lock, flags);
1343         list_for_each_entry(info, &domain->devices, link) {
1344                 struct pci_dev *pdev;
1345                 if (!info->dev || !dev_is_pci(info->dev))
1346                         continue;
1347
1348                 pdev = to_pci_dev(info->dev);
1349                 if (!pci_ats_enabled(pdev))
1350                         continue;
1351
1352                 sid = info->bus << 8 | info->devfn;
1353                 qdep = pci_ats_queue_depth(pdev);
1354                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1355         }
1356         spin_unlock_irqrestore(&device_domain_lock, flags);
1357 }
1358
1359 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1360                                   unsigned long pfn, unsigned int pages, int ih, int map)
1361 {
1362         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1363         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1364
1365         BUG_ON(pages == 0);
1366
1367         if (ih)
1368                 ih = 1 << 6;
1369         /*
 1370          * Fall back to domain-selective flush if there is no PSI support or
 1371          * the size is too big.
 1372          * PSI requires the page size to be a power of two, and the base address
 1373          * to be naturally aligned to that size.
1374          */
1375         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1376                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1377                                                 DMA_TLB_DSI_FLUSH);
1378         else
1379                 iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1380                                                 DMA_TLB_PSI_FLUSH);
1381
1382         /*
1383          * In caching mode, changes of pages from non-present to present require
1384          * flush. However, device IOTLB doesn't need to be flushed in this case.
1385          */
1386         if (!cap_caching_mode(iommu->cap) || !map)
1387                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1388 }
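
/*
 * A quick worked example of the mask computation above: for pages == 9,
 * __roundup_pow_of_two(9) == 16 and ilog2(16) == 4, so the PSI request
 * invalidates a naturally aligned 16-page (64KiB) region; if the mask
 * exceeded cap_max_amask_val() we would have fallen back to the
 * domain-selective flush instead.
 */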
1389
1390 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1391 {
1392         u32 pmen;
1393         unsigned long flags;
1394
1395         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1396         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1397         pmen &= ~DMA_PMEN_EPM;
1398         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1399
1400         /* wait for the protected region status bit to clear */
1401         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1402                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1403
1404         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1405 }
1406
1407 static void iommu_enable_translation(struct intel_iommu *iommu)
1408 {
1409         u32 sts;
1410         unsigned long flags;
1411
1412         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1413         iommu->gcmd |= DMA_GCMD_TE;
1414         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1415
 1416         /* Make sure hardware completes it */
1417         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1418                       readl, (sts & DMA_GSTS_TES), sts);
1419
1420         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1421 }
1422
1423 static void iommu_disable_translation(struct intel_iommu *iommu)
1424 {
1425         u32 sts;
1426         unsigned long flag;
1427
1428         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1429         iommu->gcmd &= ~DMA_GCMD_TE;
1430         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1431
 1432         /* Make sure hardware completes it */
1433         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1434                       readl, (!(sts & DMA_GSTS_TES)), sts);
1435
1436         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1437 }
1438
1439
1440 static int iommu_init_domains(struct intel_iommu *iommu)
1441 {
1442         unsigned long ndomains;
1443         unsigned long nlongs;
1444
1445         ndomains = cap_ndoms(iommu->cap);
1446         pr_debug("IOMMU%d: Number of Domains supported <%ld>\n",
1447                  iommu->seq_id, ndomains);
1448         nlongs = BITS_TO_LONGS(ndomains);
1449
1450         spin_lock_init(&iommu->lock);
1451
1452         /* TBD: there might be 64K domains,
1453          * consider other allocation for future chip
1454          */
1455         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1456         if (!iommu->domain_ids) {
1457                 pr_err("IOMMU%d: allocating domain id array failed\n",
1458                        iommu->seq_id);
1459                 return -ENOMEM;
1460         }
1461         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1462                         GFP_KERNEL);
1463         if (!iommu->domains) {
1464                 pr_err("IOMMU%d: allocating domain array failed\n",
1465                        iommu->seq_id);
1466                 kfree(iommu->domain_ids);
1467                 iommu->domain_ids = NULL;
1468                 return -ENOMEM;
1469         }
1470
1471         /*
 1472          * If caching mode is set, then invalid translations are tagged
 1473          * with domain id 0. Hence we need to pre-allocate it.
1474          */
1475         if (cap_caching_mode(iommu->cap))
1476                 set_bit(0, iommu->domain_ids);
1477         return 0;
1478 }
1479
1480 static void disable_dmar_iommu(struct intel_iommu *iommu)
1481 {
1482         struct dmar_domain *domain;
1483         int i;
1484
1485         if ((iommu->domains) && (iommu->domain_ids)) {
1486                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1487                         /*
1488                          * Domain id 0 is reserved for invalid translation
1489                          * if hardware supports caching mode.
1490                          */
1491                         if (cap_caching_mode(iommu->cap) && i == 0)
1492                                 continue;
1493
1494                         domain = iommu->domains[i];
1495                         clear_bit(i, iommu->domain_ids);
1496                         if (domain_detach_iommu(domain, iommu) == 0 &&
1497                             !domain_type_is_vm(domain))
1498                                 domain_exit(domain);
1499                 }
1500         }
1501
1502         if (iommu->gcmd & DMA_GCMD_TE)
1503                 iommu_disable_translation(iommu);
1504 }
1505
1506 static void free_dmar_iommu(struct intel_iommu *iommu)
1507 {
1508         if ((iommu->domains) && (iommu->domain_ids)) {
1509                 kfree(iommu->domains);
1510                 kfree(iommu->domain_ids);
1511                 iommu->domains = NULL;
1512                 iommu->domain_ids = NULL;
1513         }
1514
1515         g_iommus[iommu->seq_id] = NULL;
1516
1517         /* free context mapping */
1518         free_context_table(iommu);
1519 }
1520
1521 static struct dmar_domain *alloc_domain(int flags)
1522 {
 1523         /* domain id for a virtual machine; it won't be set in a context entry */
1524         static atomic_t vm_domid = ATOMIC_INIT(0);
1525         struct dmar_domain *domain;
1526
1527         domain = alloc_domain_mem();
1528         if (!domain)
1529                 return NULL;
1530
1531         memset(domain, 0, sizeof(*domain));
1532         domain->nid = -1;
1533         domain->flags = flags;
1534         spin_lock_init(&domain->iommu_lock);
1535         INIT_LIST_HEAD(&domain->devices);
1536         if (flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1537                 domain->id = atomic_inc_return(&vm_domid);
1538
1539         return domain;
1540 }
1541
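/*
 * Reserve a free domain id on @iommu for @domain.  Returns the id, or
 * -ENOSPC when all cap_ndoms() ids are in use.  Callers hold iommu->lock.
 */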
1542 static int __iommu_attach_domain(struct dmar_domain *domain,
1543                                  struct intel_iommu *iommu)
1544 {
1545         int num;
1546         unsigned long ndomains;
1547
1548         ndomains = cap_ndoms(iommu->cap);
1549         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1550         if (num < ndomains) {
1551                 set_bit(num, iommu->domain_ids);
1552                 iommu->domains[num] = domain;
1553         } else {
1554                 num = -ENOSPC;
1555         }
1556
1557         return num;
1558 }
1559
1560 static int iommu_attach_domain(struct dmar_domain *domain,
1561                                struct intel_iommu *iommu)
1562 {
1563         int num;
1564         unsigned long flags;
1565
1566         spin_lock_irqsave(&iommu->lock, flags);
1567         num = __iommu_attach_domain(domain, iommu);
1568         spin_unlock_irqrestore(&iommu->lock, flags);
1569         if (num < 0)
1570                 pr_err("IOMMU: no free domain ids\n");
1571
1572         return num;
1573 }
1574
1575 static int iommu_attach_vm_domain(struct dmar_domain *domain,
1576                                   struct intel_iommu *iommu)
1577 {
1578         int num;
1579         unsigned long ndomains;
1580
1581         ndomains = cap_ndoms(iommu->cap);
1582         for_each_set_bit(num, iommu->domain_ids, ndomains)
1583                 if (iommu->domains[num] == domain)
1584                         return num;
1585
1586         return __iommu_attach_domain(domain, iommu);
1587 }
1588
1589 static void iommu_detach_domain(struct dmar_domain *domain,
1590                                 struct intel_iommu *iommu)
1591 {
1592         unsigned long flags;
1593         int num, ndomains;
1594
1595         spin_lock_irqsave(&iommu->lock, flags);
1596         if (domain_type_is_vm_or_si(domain)) {
1597                 ndomains = cap_ndoms(iommu->cap);
1598                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1599                         if (iommu->domains[num] == domain) {
1600                                 clear_bit(num, iommu->domain_ids);
1601                                 iommu->domains[num] = NULL;
1602                                 break;
1603                         }
1604                 }
1605         } else {
1606                 clear_bit(domain->id, iommu->domain_ids);
1607                 iommu->domains[domain->id] = NULL;
1608         }
1609         spin_unlock_irqrestore(&iommu->lock, flags);
1610 }
1611
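/*
 * Track which hardware units a domain is attached to.  The first attach
 * also picks the domain's NUMA node; domain_detach_iommu() returns the
 * remaining attachment count (INT_MAX if this iommu was not attached),
 * which callers use to decide when the domain can be torn down.
 */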
1612 static void domain_attach_iommu(struct dmar_domain *domain,
1613                                struct intel_iommu *iommu)
1614 {
1615         unsigned long flags;
1616
1617         spin_lock_irqsave(&domain->iommu_lock, flags);
1618         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1619                 domain->iommu_count++;
1620                 if (domain->iommu_count == 1)
1621                         domain->nid = iommu->node;
1622                 domain_update_iommu_cap(domain);
1623         }
1624         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1625 }
1626
1627 static int domain_detach_iommu(struct dmar_domain *domain,
1628                                struct intel_iommu *iommu)
1629 {
1630         unsigned long flags;
1631         int count = INT_MAX;
1632
1633         spin_lock_irqsave(&domain->iommu_lock, flags);
1634         if (test_and_clear_bit(iommu->seq_id, domain->iommu_bmp)) {
1635                 count = --domain->iommu_count;
1636                 domain_update_iommu_cap(domain);
1637         }
1638         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1639
1640         return count;
1641 }
1642
1643 static struct iova_domain reserved_iova_list;
1644 static struct lock_class_key reserved_rbtree_key;
1645
1646 static int dmar_init_reserved_ranges(void)
1647 {
1648         struct pci_dev *pdev = NULL;
1649         struct iova *iova;
1650         int i;
1651
1652         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1653                         DMA_32BIT_PFN);
1654
1655         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1656                 &reserved_rbtree_key);
1657
1658         /* IOAPIC ranges shouldn't be accessed by DMA */
1659         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1660                 IOVA_PFN(IOAPIC_RANGE_END));
1661         if (!iova) {
1662                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1663                 return -ENODEV;
1664         }
1665
1666         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1667         for_each_pci_dev(pdev) {
1668                 struct resource *r;
1669
1670                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1671                         r = &pdev->resource[i];
1672                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1673                                 continue;
1674                         iova = reserve_iova(&reserved_iova_list,
1675                                             IOVA_PFN(r->start),
1676                                             IOVA_PFN(r->end));
1677                         if (!iova) {
1678                                 printk(KERN_ERR "Reserve iova failed\n");
1679                                 return -ENODEV;
1680                         }
1681                 }
1682         }
1683         return 0;
1684 }
1685
1686 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1687 {
1688         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1689 }
1690
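/*
 * Round a guest address width up to an adjusted width the page-table
 * levels can express: 12 bits of page offset plus a multiple of 9 bits
 * per level, e.g. gaw 48 -> agaw 48, gaw 35 -> agaw 39 (capped at 64).
 */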
1691 static inline int guestwidth_to_adjustwidth(int gaw)
1692 {
1693         int agaw;
1694         int r = (gaw - 12) % 9;
1695
1696         if (r == 0)
1697                 agaw = gaw;
1698         else
1699                 agaw = gaw + 9 - r;
1700         if (agaw > 64)
1701                 agaw = 64;
1702         return agaw;
1703 }
1704
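/*
 * First-time setup of a freshly attached domain: reserve the special
 * IOVA ranges, derive agaw from the requested guest width and the
 * hardware SAGAW capability, cache the coherency/snooping/superpage
 * capabilities, and allocate the top-level page directory.
 */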
1705 static int domain_init(struct dmar_domain *domain, int guest_width)
1706 {
1707         struct intel_iommu *iommu;
1708         int adjust_width, agaw;
1709         unsigned long sagaw;
1710
1711         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
1712                         DMA_32BIT_PFN);
1713         domain_reserve_special_ranges(domain);
1714
1715         /* calculate AGAW */
1716         iommu = domain_get_iommu(domain);
1717         if (guest_width > cap_mgaw(iommu->cap))
1718                 guest_width = cap_mgaw(iommu->cap);
1719         domain->gaw = guest_width;
1720         adjust_width = guestwidth_to_adjustwidth(guest_width);
1721         agaw = width_to_agaw(adjust_width);
1722         sagaw = cap_sagaw(iommu->cap);
1723         if (!test_bit(agaw, &sagaw)) {
1724                 /* hardware doesn't support it, choose a bigger one */
1725                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1726                 agaw = find_next_bit(&sagaw, 5, agaw);
1727                 if (agaw >= 5)
1728                         return -ENODEV;
1729         }
1730         domain->agaw = agaw;
1731
1732         if (ecap_coherent(iommu->ecap))
1733                 domain->iommu_coherency = 1;
1734         else
1735                 domain->iommu_coherency = 0;
1736
1737         if (ecap_sc_support(iommu->ecap))
1738                 domain->iommu_snooping = 1;
1739         else
1740                 domain->iommu_snooping = 0;
1741
1742         if (intel_iommu_superpage)
1743                 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1744         else
1745                 domain->iommu_superpage = 0;
1746
1747         domain->nid = iommu->node;
1748
1749         /* always allocate the top pgd */
1750         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1751         if (!domain->pgd)
1752                 return -ENOMEM;
1753         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1754         return 0;
1755 }
1756
1757 static void domain_exit(struct dmar_domain *domain)
1758 {
1759         struct dmar_drhd_unit *drhd;
1760         struct intel_iommu *iommu;
1761         struct page *freelist = NULL;
1762
1763         /* Domain 0 is reserved, so don't process it */
1764         if (!domain)
1765                 return;
1766
1767         /* Flush any lazy unmaps that may reference this domain */
1768         if (!intel_iommu_strict)
1769                 flush_unmaps_timeout(0);
1770
1771         /* remove associated devices */
1772         domain_remove_dev_info(domain);
1773
1774         /* destroy iovas */
1775         put_iova_domain(&domain->iovad);
1776
1777         freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1778
1779         /* clear attached or cached domains */
1780         rcu_read_lock();
1781         for_each_active_iommu(iommu, drhd)
1782                 if (domain_type_is_vm(domain) ||
1783                     test_bit(iommu->seq_id, domain->iommu_bmp))
1784                         iommu_detach_domain(domain, iommu);
1785         rcu_read_unlock();
1786
1787         dma_free_pagelist(freelist);
1788
1789         free_domain_mem(domain);
1790 }
1791
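/*
 * Install the context entry for (@bus, @devfn) on @iommu so it points
 * at @domain's page tables (or is marked pass-through), flush the
 * context/IOTLB caches as required by Caching Mode, and record the
 * domain<->iommu attachment.
 */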
1792 static int domain_context_mapping_one(struct dmar_domain *domain,
1793                                       struct intel_iommu *iommu,
1794                                       u8 bus, u8 devfn, int translation)
1795 {
1796         struct context_entry *context;
1797         unsigned long flags;
1798         struct dma_pte *pgd;
1799         int id;
1800         int agaw;
1801         struct device_domain_info *info = NULL;
1802
1803         pr_debug("Set context mapping for %02x:%02x.%d\n",
1804                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1805
1806         BUG_ON(!domain->pgd);
1807         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1808                translation != CONTEXT_TT_MULTI_LEVEL);
1809
1810         spin_lock_irqsave(&iommu->lock, flags);
1811         context = iommu_context_addr(iommu, bus, devfn, 1);
1812         spin_unlock_irqrestore(&iommu->lock, flags);
1813         if (!context)
1814                 return -ENOMEM;
1815         spin_lock_irqsave(&iommu->lock, flags);
1816         if (context_present(context)) {
1817                 spin_unlock_irqrestore(&iommu->lock, flags);
1818                 return 0;
1819         }
1820
1821         id = domain->id;
1822         pgd = domain->pgd;
1823
1824         if (domain_type_is_vm_or_si(domain)) {
1825                 if (domain_type_is_vm(domain)) {
1826                         id = iommu_attach_vm_domain(domain, iommu);
1827                         if (id < 0) {
1828                                 spin_unlock_irqrestore(&iommu->lock, flags);
1829                                 pr_err("IOMMU: no free domain ids\n");
1830                                 return -EFAULT;
1831                         }
1832                 }
1833
1834                 /* Skip top levels of page tables for an iommu
1835                  * whose agaw is smaller than the domain's agaw.
1836                  * Unnecessary for PT mode.
1837                  */
1838                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1839                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1840                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1841                                 if (!dma_pte_present(pgd)) {
1842                                         spin_unlock_irqrestore(&iommu->lock, flags);
1843                                         return -ENOMEM;
1844                                 }
1845                         }
1846                 }
1847         }
1848
1849         context_set_domain_id(context, id);
1850
1851         if (translation != CONTEXT_TT_PASS_THROUGH) {
1852                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
1853                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1854                                      CONTEXT_TT_MULTI_LEVEL;
1855         }
1856         /*
1857          * In pass through mode, AW must be programmed to indicate the largest
1858          * AGAW value supported by hardware. And ASR is ignored by hardware.
1859          */
1860         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1861                 context_set_address_width(context, iommu->msagaw);
1862         else {
1863                 context_set_address_root(context, virt_to_phys(pgd));
1864                 context_set_address_width(context, iommu->agaw);
1865         }
1866
1867         context_set_translation_type(context, translation);
1868         context_set_fault_enable(context);
1869         context_set_present(context);
1870         domain_flush_cache(domain, context, sizeof(*context));
1871
1872         /*
1873          * It's a non-present to present mapping. If the hardware doesn't cache
1874          * non-present entries we only need to flush the write-buffer. If it
1875          * _does_ cache non-present entries, then it does so in the special
1876          * domain #0, which we have to flush:
1877          */
1878         if (cap_caching_mode(iommu->cap)) {
1879                 iommu->flush.flush_context(iommu, 0,
1880                                            (((u16)bus) << 8) | devfn,
1881                                            DMA_CCMD_MASK_NOBIT,
1882                                            DMA_CCMD_DEVICE_INVL);
1883                 iommu->flush.flush_iotlb(iommu, id, 0, 0, DMA_TLB_DSI_FLUSH);
1884         } else {
1885                 iommu_flush_write_buffer(iommu);
1886         }
1887         iommu_enable_dev_iotlb(info);
1888         spin_unlock_irqrestore(&iommu->lock, flags);
1889
1890         domain_attach_iommu(domain, iommu);
1891
1892         return 0;
1893 }
1894
1895 struct domain_context_mapping_data {
1896         struct dmar_domain *domain;
1897         struct intel_iommu *iommu;
1898         int translation;
1899 };
1900
1901 static int domain_context_mapping_cb(struct pci_dev *pdev,
1902                                      u16 alias, void *opaque)
1903 {
1904         struct domain_context_mapping_data *data = opaque;
1905
1906         return domain_context_mapping_one(data->domain, data->iommu,
1907                                           PCI_BUS_NUM(alias), alias & 0xff,
1908                                           data->translation);
1909 }
1910
1911 static int
1912 domain_context_mapping(struct dmar_domain *domain, struct device *dev,
1913                        int translation)
1914 {
1915         struct intel_iommu *iommu;
1916         u8 bus, devfn;
1917         struct domain_context_mapping_data data;
1918
1919         iommu = device_to_iommu(dev, &bus, &devfn);
1920         if (!iommu)
1921                 return -ENODEV;
1922
1923         if (!dev_is_pci(dev))
1924                 return domain_context_mapping_one(domain, iommu, bus, devfn,
1925                                                   translation);
1926
1927         data.domain = domain;
1928         data.iommu = iommu;
1929         data.translation = translation;
1930
1931         return pci_for_each_dma_alias(to_pci_dev(dev),
1932                                       &domain_context_mapping_cb, &data);
1933 }
1934
1935 static int domain_context_mapped_cb(struct pci_dev *pdev,
1936                                     u16 alias, void *opaque)
1937 {
1938         struct intel_iommu *iommu = opaque;
1939
1940         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
1941 }
1942
1943 static int domain_context_mapped(struct device *dev)
1944 {
1945         struct intel_iommu *iommu;
1946         u8 bus, devfn;
1947
1948         iommu = device_to_iommu(dev, &bus, &devfn);
1949         if (!iommu)
1950                 return -ENODEV;
1951
1952         if (!dev_is_pci(dev))
1953                 return device_context_mapped(iommu, bus, devfn);
1954
1955         return !pci_for_each_dma_alias(to_pci_dev(dev),
1956                                        domain_context_mapped_cb, iommu);
1957 }
1958
1959 /* Returns a number of VTD pages, but aligned to MM page size */
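/* e.g. host_addr 0x1234 with size 0x2000 spans 3 VTD (4KiB) pages */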
1960 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1961                                             size_t size)
1962 {
1963         host_addr &= ~PAGE_MASK;
1964         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1965 }
1966
1967 /* Return largest possible superpage level for a given mapping */
1968 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1969                                           unsigned long iov_pfn,
1970                                           unsigned long phy_pfn,
1971                                           unsigned long pages)
1972 {
1973         int support, level = 1;
1974         unsigned long pfnmerge;
1975
1976         support = domain->iommu_superpage;
1977
1978         /* To use a large page, the virtual *and* physical addresses
1979            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1980            of them will mean we have to use smaller pages. So just
1981            merge them and check both at once. */
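        /* e.g. a 2MiB (level 2) superpage needs the low 9 bits of both
           PFNs clear and at least 1 << VTD_STRIDE_SHIFT pages left. */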
1982         pfnmerge = iov_pfn | phy_pfn;
1983
1984         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1985                 pages >>= VTD_STRIDE_SHIFT;
1986                 if (!pages)
1987                         break;
1988                 pfnmerge >>= VTD_STRIDE_SHIFT;
1989                 level++;
1990                 support--;
1991         }
1992         return level;
1993 }
1994
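/*
 * Core mapping loop: populate @nr_pages PTEs starting at @iov_pfn,
 * taking physical pages either from @sg or from the contiguous range
 * at @phys_pfn, and using superpages whenever alignment and run length
 * allow.
 */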
1995 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1996                             struct scatterlist *sg, unsigned long phys_pfn,
1997                             unsigned long nr_pages, int prot)
1998 {
1999         struct dma_pte *first_pte = NULL, *pte = NULL;
2000         phys_addr_t uninitialized_var(pteval);
2001         unsigned long sg_res = 0;
2002         unsigned int largepage_lvl = 0;
2003         unsigned long lvl_pages = 0;
2004
2005         BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2006
2007         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2008                 return -EINVAL;
2009
2010         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
2011
2012         if (!sg) {
2013                 sg_res = nr_pages;
2014                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
2015         }
2016
2017         while (nr_pages > 0) {
2018                 uint64_t tmp;
2019
2020                 if (!sg_res) {
2021                         sg_res = aligned_nrpages(sg->offset, sg->length);
2022                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
2023                         sg->dma_length = sg->length;
2024                         pteval = page_to_phys(sg_page(sg)) | prot;
2025                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
2026                 }
2027
2028                 if (!pte) {
2029                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
2030
2031                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2032                         if (!pte)
2033                                 return -ENOMEM;
2034                         /* It is a large page */
2035                         if (largepage_lvl > 1) {
2036                                 pteval |= DMA_PTE_LARGE_PAGE;
2037                                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2038                                 /*
2039                                  * Ensure that old small page tables are
2040                                  * removed to make room for superpage,
2041                                  * if they exist.
2042                                  */
2043                                 dma_pte_free_pagetable(domain, iov_pfn,
2044                                                        iov_pfn + lvl_pages - 1);
2045                         } else {
2046                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2047                         }
2048
2049                 }
2050                 /* We don't need a lock here; nobody else
2051                  * touches this iova range.
2052                  */
2053                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2054                 if (tmp) {
2055                         static int dumps = 5;
2056                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2057                                iov_pfn, tmp, (unsigned long long)pteval);
2058                         if (dumps) {
2059                                 dumps--;
2060                                 debug_dma_dump_mappings(NULL);
2061                         }
2062                         WARN_ON(1);
2063                 }
2064
2065                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
2066
2067                 BUG_ON(nr_pages < lvl_pages);
2068                 BUG_ON(sg_res < lvl_pages);
2069
2070                 nr_pages -= lvl_pages;
2071                 iov_pfn += lvl_pages;
2072                 phys_pfn += lvl_pages;
2073                 pteval += lvl_pages * VTD_PAGE_SIZE;
2074                 sg_res -= lvl_pages;
2075
2076                 /* If the next PTE would be the first in a new page, then we
2077                    need to flush the cache on the entries we've just written.
2078                    And then we'll need to recalculate 'pte', so clear it and
2079                    let it get set again in the if (!pte) block above.
2080
2081                    If we're done (!nr_pages) we need to flush the cache too.
2082
2083                    Also if we've been setting superpages, we may need to
2084                    recalculate 'pte' and switch back to smaller pages for the
2085                    end of the mapping, if the trailing size is not enough to
2086                    use another superpage (i.e. sg_res < lvl_pages). */
2087                 pte++;
2088                 if (!nr_pages || first_pte_in_page(pte) ||
2089                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
2090                         domain_flush_cache(domain, first_pte,
2091                                            (void *)pte - (void *)first_pte);
2092                         pte = NULL;
2093                 }
2094
2095                 if (!sg_res && nr_pages)
2096                         sg = sg_next(sg);
2097         }
2098         return 0;
2099 }
2100
2101 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2102                                     struct scatterlist *sg, unsigned long nr_pages,
2103                                     int prot)
2104 {
2105         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
2106 }
2107
2108 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2109                                      unsigned long phys_pfn, unsigned long nr_pages,
2110                                      int prot)
2111 {
2112         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
2113 }
2114
2115 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
2116 {
2117         if (!iommu)
2118                 return;
2119
2120         clear_context_table(iommu, bus, devfn);
2121         iommu->flush.flush_context(iommu, 0, 0, 0,
2122                                            DMA_CCMD_GLOBAL_INVL);
2123         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2124 }
2125
2126 static inline void unlink_domain_info(struct device_domain_info *info)
2127 {
2128         assert_spin_locked(&device_domain_lock);
2129         list_del(&info->link);
2130         list_del(&info->global);
2131         if (info->dev)
2132                 info->dev->archdata.iommu = NULL;
2133 }
2134
2135 static void domain_remove_dev_info(struct dmar_domain *domain)
2136 {
2137         struct device_domain_info *info, *tmp;
2138         unsigned long flags;
2139
2140         spin_lock_irqsave(&device_domain_lock, flags);
2141         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
2142                 unlink_domain_info(info);
2143                 spin_unlock_irqrestore(&device_domain_lock, flags);
2144
2145                 iommu_disable_dev_iotlb(info);
2146                 iommu_detach_dev(info->iommu, info->bus, info->devfn);
2147
2148                 if (domain_type_is_vm(domain)) {
2149                         iommu_detach_dependent_devices(info->iommu, info->dev);
2150                         domain_detach_iommu(domain, info->iommu);
2151                 }
2152
2153                 free_devinfo_mem(info);
2154                 spin_lock_irqsave(&device_domain_lock, flags);
2155         }
2156         spin_unlock_irqrestore(&device_domain_lock, flags);
2157 }
2158
2159 /*
2160  * find_domain
2161  * Note: we use struct device->archdata.iommu to store the domain info
2162  */
2163 static struct dmar_domain *find_domain(struct device *dev)
2164 {
2165         struct device_domain_info *info;
2166
2167         /* No lock here, assumes no domain exit in normal case */
2168         info = dev->archdata.iommu;
2169         if (info)
2170                 return info->domain;
2171         return NULL;
2172 }
2173
2174 static inline struct device_domain_info *
2175 dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
2176 {
2177         struct device_domain_info *info;
2178
2179         list_for_each_entry(info, &device_domain_list, global)
2180                 if (info->iommu->segment == segment && info->bus == bus &&
2181                     info->devfn == devfn)
2182                         return info;
2183
2184         return NULL;
2185 }
2186
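/*
 * Bind a device (or a bare bus/devfn alias with no struct device) to
 * @domain.  If the device is already bound, the existing domain is
 * returned and the caller must free the domain it passed in.
 */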
2187 static struct dmar_domain *dmar_insert_dev_info(struct intel_iommu *iommu,
2188                                                 int bus, int devfn,
2189                                                 struct device *dev,
2190                                                 struct dmar_domain *domain)
2191 {
2192         struct dmar_domain *found = NULL;
2193         struct device_domain_info *info;
2194         unsigned long flags;
2195
2196         info = alloc_devinfo_mem();
2197         if (!info)
2198                 return NULL;
2199
2200         info->bus = bus;
2201         info->devfn = devfn;
2202         info->dev = dev;
2203         info->domain = domain;
2204         info->iommu = iommu;
2205
2206         spin_lock_irqsave(&device_domain_lock, flags);
2207         if (dev)
2208                 found = find_domain(dev);
2209         else {
2210                 struct device_domain_info *info2;
2211                 info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
2212                 if (info2)
2213                         found = info2->domain;
2214         }
2215         if (found) {
2216                 spin_unlock_irqrestore(&device_domain_lock, flags);
2217                 free_devinfo_mem(info);
2218                 /* Caller must free the original domain */
2219                 return found;
2220         }
2221
2222         list_add(&info->link, &domain->devices);
2223         list_add(&info->global, &device_domain_list);
2224         if (dev)
2225                 dev->archdata.iommu = info;
2226         spin_unlock_irqrestore(&device_domain_lock, flags);
2227
2228         return domain;
2229 }
2230
2231 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2232 {
2233         *(u16 *)opaque = alias;
2234         return 0;
2235 }
2236
2237 /* Find or allocate a domain for @dev; the returned domain is initialized. */
2238 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2239 {
2240         struct dmar_domain *domain, *tmp;
2241         struct intel_iommu *iommu;
2242         struct device_domain_info *info;
2243         u16 dma_alias;
2244         unsigned long flags;
2245         u8 bus, devfn;
2246
2247         domain = find_domain(dev);
2248         if (domain)
2249                 return domain;
2250
2251         iommu = device_to_iommu(dev, &bus, &devfn);
2252         if (!iommu)
2253                 return NULL;
2254
2255         if (dev_is_pci(dev)) {
2256                 struct pci_dev *pdev = to_pci_dev(dev);
2257
2258                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2259
2260                 spin_lock_irqsave(&device_domain_lock, flags);
2261                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2262                                                       PCI_BUS_NUM(dma_alias),
2263                                                       dma_alias & 0xff);
2264                 if (info) {
2265                         iommu = info->iommu;
2266                         domain = info->domain;
2267                 }
2268                 spin_unlock_irqrestore(&device_domain_lock, flags);
2269
2270                 /* DMA alias already has a domain, use it */
2271                 if (info)
2272                         goto found_domain;
2273         }
2274
2275         /* Allocate and initialize new domain for the device */
2276         domain = alloc_domain(0);
2277         if (!domain)
2278                 return NULL;
2279         domain->id = iommu_attach_domain(domain, iommu);
2280         if (domain->id < 0) {
2281                 free_domain_mem(domain);
2282                 return NULL;
2283         }
2284         domain_attach_iommu(domain, iommu);
2285         if (domain_init(domain, gaw)) {
2286                 domain_exit(domain);
2287                 return NULL;
2288         }
2289
2290         /* register PCI DMA alias device */
2291         if (dev_is_pci(dev)) {
2292                 tmp = dmar_insert_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2293                                            dma_alias & 0xff, NULL, domain);
2294
2295                 if (!tmp || tmp != domain) {
2296                         domain_exit(domain);
2297                         domain = tmp;
2298                 }
2299
2300                 if (!domain)
2301                         return NULL;
2302         }
2303
2304 found_domain:
2305         tmp = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2306
2307         if (!tmp || tmp != domain) {
2308                 domain_exit(domain);
2309                 domain = tmp;
2310         }
2311
2312         return domain;
2313 }
2314
2315 static int iommu_identity_mapping;
2316 #define IDENTMAP_ALL            1
2317 #define IDENTMAP_GFX            2
2318 #define IDENTMAP_AZALIA         4
2319
2320 static int iommu_domain_identity_map(struct dmar_domain *domain,
2321                                      unsigned long long start,
2322                                      unsigned long long end)
2323 {
2324         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2325         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2326
2327         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2328                           dma_to_mm_pfn(last_vpfn))) {
2329                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2330                 return -ENOMEM;
2331         }
2332
2333         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2334                  start, end, domain->id);
2335         /*
2336          * RMRR range might have overlap with physical memory range,
2337          * clear it first
2338          */
2339         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2340
2341         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2342                                   last_vpfn - first_vpfn + 1,
2343                                   DMA_PTE_READ|DMA_PTE_WRITE);
2344 }
2345
2346 static int iommu_prepare_identity_map(struct device *dev,
2347                                       unsigned long long start,
2348                                       unsigned long long end)
2349 {
2350         struct dmar_domain *domain;
2351         int ret;
2352
2353         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2354         if (!domain)
2355                 return -ENOMEM;
2356
2357         /* For _hardware_ passthrough, don't bother. But for software
2358            passthrough, we do it anyway -- it may indicate a memory
2359            range which is reserved in E820 and so didn't get set
2360            up to start with in si_domain */
2361         if (domain == si_domain && hw_pass_through) {
2362                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2363                        dev_name(dev), start, end);
2364                 return 0;
2365         }
2366
2367         printk(KERN_INFO
2368                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2369                dev_name(dev), start, end);
2370
2371         if (end < start) {
2372                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2373                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2374                         dmi_get_system_info(DMI_BIOS_VENDOR),
2375                         dmi_get_system_info(DMI_BIOS_VERSION),
2376                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2377                 ret = -EIO;
2378                 goto error;
2379         }
2380
2381         if (end >> agaw_to_width(domain->agaw)) {
2382                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2383                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2384                      agaw_to_width(domain->agaw),
2385                      dmi_get_system_info(DMI_BIOS_VENDOR),
2386                      dmi_get_system_info(DMI_BIOS_VERSION),
2387                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2388                 ret = -EIO;
2389                 goto error;
2390         }
2391
2392         ret = iommu_domain_identity_map(domain, start, end);
2393         if (ret)
2394                 goto error;
2395
2396         /* context entry init */
2397         ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2398         if (ret)
2399                 goto error;
2400
2401         return 0;
2402
2403  error:
2404         domain_exit(domain);
2405         return ret;
2406 }
2407
2408 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2409                                          struct device *dev)
2410 {
2411         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2412                 return 0;
2413         return iommu_prepare_identity_map(dev, rmrr->base_address,
2414                                           rmrr->end_address);
2415 }
2416
2417 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2418 static inline void iommu_prepare_isa(void)
2419 {
2420         struct pci_dev *pdev;
2421         int ret;
2422
2423         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2424         if (!pdev)
2425                 return;
2426
2427         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2428         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2429
2430         if (ret)
2431                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2432                        "floppy might not work\n");
2433
2434         pci_dev_put(pdev);
2435 }
2436 #else
2437 static inline void iommu_prepare_isa(void)
2438 {
2439         return;
2440 }
2441 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2442
2443 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2444
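/*
 * Create the static identity (si) domain, attach it to every active
 * IOMMU and, unless hardware pass-through is in use, map every usable
 * physical memory range into it 1:1.
 */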
2445 static int __init si_domain_init(int hw)
2446 {
2447         struct dmar_drhd_unit *drhd;
2448         struct intel_iommu *iommu;
2449         int nid, ret = 0;
2450         bool first = true;
2451
2452         si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
2453         if (!si_domain)
2454                 return -EFAULT;
2455
2456         for_each_active_iommu(iommu, drhd) {
2457                 ret = iommu_attach_domain(si_domain, iommu);
2458                 if (ret < 0) {
2459                         domain_exit(si_domain);
2460                         return -EFAULT;
2461                 } else if (first) {
2462                         si_domain->id = ret;
2463                         first = false;
2464                 } else if (si_domain->id != ret) {
2465                         domain_exit(si_domain);
2466                         return -EFAULT;
2467                 }
2468                 domain_attach_iommu(si_domain, iommu);
2469         }
2470
2471         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2472                 domain_exit(si_domain);
2473                 return -EFAULT;
2474         }
2475
2476         pr_debug("IOMMU: identity mapping domain is domain %d\n",
2477                  si_domain->id);
2478
2479         if (hw)
2480                 return 0;
2481
2482         for_each_online_node(nid) {
2483                 unsigned long start_pfn, end_pfn;
2484                 int i;
2485
2486                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2487                         ret = iommu_domain_identity_map(si_domain,
2488                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2489                         if (ret)
2490                                 return ret;
2491                 }
2492         }
2493
2494         return 0;
2495 }
2496
2497 static int identity_mapping(struct device *dev)
2498 {
2499         struct device_domain_info *info;
2500
2501         if (likely(!iommu_identity_mapping))
2502                 return 0;
2503
2504         info = dev->archdata.iommu;
2505         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2506                 return (info->domain == si_domain);
2507
2508         return 0;
2509 }
2510
2511 static int domain_add_dev_info(struct dmar_domain *domain,
2512                                struct device *dev, int translation)
2513 {
2514         struct dmar_domain *ndomain;
2515         struct intel_iommu *iommu;
2516         u8 bus, devfn;
2517         int ret;
2518
2519         iommu = device_to_iommu(dev, &bus, &devfn);
2520         if (!iommu)
2521                 return -ENODEV;
2522
2523         ndomain = dmar_insert_dev_info(iommu, bus, devfn, dev, domain);
2524         if (ndomain != domain)
2525                 return -EBUSY;
2526
2527         ret = domain_context_mapping(domain, dev, translation);
2528         if (ret) {
2529                 domain_remove_one_dev_info(domain, dev);
2530                 return ret;
2531         }
2532
2533         return 0;
2534 }
2535
2536 static bool device_has_rmrr(struct device *dev)
2537 {
2538         struct dmar_rmrr_unit *rmrr;
2539         struct device *tmp;
2540         int i;
2541
2542         rcu_read_lock();
2543         for_each_rmrr_units(rmrr) {
2544                 /*
2545                  * Return TRUE if this RMRR contains the device that
2546                  * is passed in.
2547                  */
2548                 for_each_active_dev_scope(rmrr->devices,
2549                                           rmrr->devices_cnt, i, tmp)
2550                         if (tmp == dev) {
2551                                 rcu_read_unlock();
2552                                 return true;
2553                         }
2554         }
2555         rcu_read_unlock();
2556         return false;
2557 }
2558
2559 /*
2560  * There are a couple cases where we need to restrict the functionality of
2561  * devices associated with RMRRs.  The first is when evaluating a device for
2562  * identity mapping because problems exist when devices are moved in and out
2563  * of domains and their respective RMRR information is lost.  This means that
2564  * a device with associated RMRRs will never be in a "passthrough" domain.
2565  * The second is use of the device through the IOMMU API.  This interface
2566  * expects to have full control of the IOVA space for the device.  We cannot
2567  * satisfy both the requirement that RMRR access is maintained and have an
2568  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2569  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2570  * We therefore prevent devices associated with an RMRR from participating in
2571  * the IOMMU API, which eliminates them from device assignment.
2572  *
2573  * In both cases we assume that PCI USB devices with RMRRs have them largely
2574  * for historical reasons and that the RMRR space is not actively used post
2575  * boot.  This exclusion may change if vendors begin to abuse it.
2576  *
2577  * The same exception is made for graphics devices, with the requirement that
2578  * any use of the RMRR regions will be torn down before assigning the device
2579  * to a guest.
2580  */
2581 static bool device_is_rmrr_locked(struct device *dev)
2582 {
2583         if (!device_has_rmrr(dev))
2584                 return false;
2585
2586         if (dev_is_pci(dev)) {
2587                 struct pci_dev *pdev = to_pci_dev(dev);
2588
2589                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2590                         return false;
2591         }
2592
2593         return true;
2594 }
2595
2596 static int iommu_should_identity_map(struct device *dev, int startup)
2597 {
2598
2599         if (dev_is_pci(dev)) {
2600                 struct pci_dev *pdev = to_pci_dev(dev);
2601
2602                 if (device_is_rmrr_locked(dev))
2603                         return 0;
2604
2605                 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2606                         return 1;
2607
2608                 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2609                         return 1;
2610
2611                 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2612                         return 0;
2613
2614                 /*
2615                  * We want to start off with all devices in the 1:1 domain, and
2616                  * take them out later if we find they can't access all of memory.
2617                  *
2618                  * However, we can't do this for PCI devices behind bridges,
2619                  * because all PCI devices behind the same bridge will end up
2620                  * with the same source-id on their transactions.
2621                  *
2622                  * Practically speaking, we can't change things around for these
2623                  * devices at run-time, because we can't be sure there'll be no
2624                  * DMA transactions in flight for any of their siblings.
2625                  *
2626                  * So PCI devices (unless they're on the root bus) as well as
2627                  * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2628                  * the 1:1 domain, just in _case_ one of their siblings turns out
2629                  * not to be able to map all of memory.
2630                  */
2631                 if (!pci_is_pcie(pdev)) {
2632                         if (!pci_is_root_bus(pdev->bus))
2633                                 return 0;
2634                         if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2635                                 return 0;
2636                 } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2637                         return 0;
2638         } else {
2639                 if (device_has_rmrr(dev))
2640                         return 0;
2641         }
2642
2643         /*
2644          * At boot time, we don't yet know if devices will be 64-bit capable.
2645          * Assume that they will -- if they turn out not to be, then we can
2646          * take them out of the 1:1 domain later.
2647          */
2648         if (!startup) {
2649                 /*
2650                  * If the device's dma_mask is less than the system's memory
2651                  * size then this is not a candidate for identity mapping.
2652                  */
2653                 u64 dma_mask = *dev->dma_mask;
2654
2655                 if (dev->coherent_dma_mask &&
2656                     dev->coherent_dma_mask < dma_mask)
2657                         dma_mask = dev->coherent_dma_mask;
2658
2659                 return dma_mask >= dma_get_required_mask(dev);
2660         }
2661
2662         return 1;
2663 }
2664
2665 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2666 {
2667         int ret;
2668
2669         if (!iommu_should_identity_map(dev, 1))
2670                 return 0;
2671
2672         ret = domain_add_dev_info(si_domain, dev,
2673                                   hw ? CONTEXT_TT_PASS_THROUGH :
2674                                        CONTEXT_TT_MULTI_LEVEL);
2675         if (!ret)
2676                 pr_info("IOMMU: %s identity mapping for device %s\n",
2677                         hw ? "hardware" : "software", dev_name(dev));
2678         else if (ret == -ENODEV)
2679                 /* device not associated with an iommu */
2680                 ret = 0;
2681
2682         return ret;
2683 }
2684
2685
2686 static int __init iommu_prepare_static_identity_mapping(int hw)
2687 {
2688         struct pci_dev *pdev = NULL;
2689         struct dmar_drhd_unit *drhd;
2690         struct intel_iommu *iommu;
2691         struct device *dev;
2692         int i;
2693         int ret = 0;
2694
2695         ret = si_domain_init(hw);
2696         if (ret)
2697                 return -EFAULT;
2698
2699         for_each_pci_dev(pdev) {
2700                 ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
2701                 if (ret)
2702                         return ret;
2703         }
2704
2705         for_each_active_iommu(iommu, drhd)
2706                 for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
2707                         struct acpi_device_physical_node *pn;
2708                         struct acpi_device *adev;
2709
2710                         if (dev->bus != &acpi_bus_type)
2711                                 continue;
2712
2713                         adev = to_acpi_device(dev);
2714                         mutex_lock(&adev->physical_node_lock);
2715                         list_for_each_entry(pn, &adev->physical_node_list, node) {
2716                                 ret = dev_prepare_static_identity_mapping(pn->dev, hw);
2717                                 if (ret)
2718                                         break;
2719                         }
2720                         mutex_unlock(&adev->physical_node_lock);
2721                         if (ret)
2722                                 return ret;
2723                 }
2724
2725         return 0;
2726 }
2727
2728 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2729 {
2730         /*
2731          * Start from the sane iommu hardware state.
2732          * If the queued invalidation is already initialized by us
2733          * (for example, while enabling interrupt-remapping) then
2734          * things are already rolling from a sane state.
2735          */
2736         if (!iommu->qi) {
2737                 /*
2738                  * Clear any previous faults.
2739                  */
2740                 dmar_fault(-1, iommu);
2741                 /*
2742                  * Disable queued invalidation if supported and already enabled
2743                  * before OS handover.
2744                  */
2745                 dmar_disable_qi(iommu);
2746         }
2747
2748         if (dmar_enable_qi(iommu)) {
2749                 /*
2750                  * Queued invalidation not enabled; use register-based invalidation
2751                  */
2752                 iommu->flush.flush_context = __iommu_flush_context;
2753                 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2754                 pr_info("IOMMU: %s using Register based invalidation\n",
2755                         iommu->name);
2756         } else {
2757                 iommu->flush.flush_context = qi_flush_context;
2758                 iommu->flush.flush_iotlb = qi_flush_iotlb;
2759                 pr_info("IOMMU: %s using Queued invalidation\n", iommu->name);
2760         }
2761 }
2762
2763 static int __init init_dmars(void)
2764 {
2765         struct dmar_drhd_unit *drhd;
2766         struct dmar_rmrr_unit *rmrr;
2767         struct device *dev;
2768         struct intel_iommu *iommu;
2769         int i, ret;
2770
2771         /*
2772          * for each drhd
2773          *    allocate root
2774          *    initialize and program root entry to not present
2775          * endfor
2776          */
2777         for_each_drhd_unit(drhd) {
2778                 /*
2779                  * No lock is needed as this is only incremented in the
2780                  * single-threaded kernel __init code path; all other
2781                  * accesses are read only.
2782                  */
2783                 if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
2784                         g_num_of_iommus++;
2785                         continue;
2786                 }
2787                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2788                           DMAR_UNITS_SUPPORTED);
2789         }
2790
2791         /* Preallocate enough resources for IOMMU hot-addition */
2792         if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
2793                 g_num_of_iommus = DMAR_UNITS_SUPPORTED;
2794
2795         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2796                         GFP_KERNEL);
2797         if (!g_iommus) {
2798                 printk(KERN_ERR "Allocating global iommu array failed\n");
2799                 ret = -ENOMEM;
2800                 goto error;
2801         }
2802
2803         deferred_flush = kzalloc(g_num_of_iommus *
2804                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2805         if (!deferred_flush) {
2806                 ret = -ENOMEM;
2807                 goto free_g_iommus;
2808         }
2809
2810         for_each_active_iommu(iommu, drhd) {
2811                 g_iommus[iommu->seq_id] = iommu;
2812
2813                 ret = iommu_init_domains(iommu);
2814                 if (ret)
2815                         goto free_iommu;
2816
2817                 /*
2818                  * TBD:
2819                  * we could share the same root & context tables
2820                  * among all IOMMUs. Need to split them later.
2821                  */
2822                 ret = iommu_alloc_root_entry(iommu);
2823                 if (ret)
2824                         goto free_iommu;
2825                 if (!ecap_pass_through(iommu->ecap))
2826                         hw_pass_through = 0;
2827         }
2828
2829         for_each_active_iommu(iommu, drhd)
2830                 intel_iommu_init_qi(iommu);
2831
2832         if (iommu_pass_through)
2833                 iommu_identity_mapping |= IDENTMAP_ALL;
2834
2835 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2836         iommu_identity_mapping |= IDENTMAP_GFX;
2837 #endif
2838
2839         check_tylersburg_isoch();
2840
2841         /*
2842          * If pass-through is not set or not enabled, set up context entries
2843          * for identity mappings for RMRR, GFX and ISA, and may fall back to
2844          * static identity mapping if iommu_identity_mapping is set.
2845          */
2846         if (iommu_identity_mapping) {
2847                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2848                 if (ret) {
2849                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2850                         goto free_iommu;
2851                 }
2852         }
2853         /*
2854          * For each rmrr
2855          *   for each dev attached to rmrr
2856          *   do
2857          *     locate drhd for dev, alloc domain for dev
2858          *     allocate free domain
2859          *     allocate page table entries for rmrr
2860          *     if context not allocated for bus
2861          *           allocate and init context
2862          *           set present in root table for this bus
2863          *     init context with domain, translation etc
2864          *    endfor
2865          * endfor
2866          */
2867         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2868         for_each_rmrr_units(rmrr) {
2869                 /* Some BIOSes list non-existent devices in the DMAR table. */
2870                 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2871                                           i, dev) {
2872                         ret = iommu_prepare_rmrr_dev(rmrr, dev);
2873                         if (ret)
2874                                 printk(KERN_ERR
2875                                        "IOMMU: mapping reserved region failed\n");
2876                 }
2877         }
2878
2879         iommu_prepare_isa();
2880
2881         /*
2882          * for each drhd
2883          *   enable fault log
2884          *   global invalidate context cache
2885          *   global invalidate iotlb
2886          *   enable translation
2887          */
2888         for_each_iommu(iommu, drhd) {
2889                 if (drhd->ignored) {
2890                         /*
2891                          * we always have to disable PMRs or DMA may fail on
2892                          * this device
2893                          */
2894                         if (force_on)
2895                                 iommu_disable_protect_mem_regions(iommu);
2896                         continue;
2897                 }
2898
2899                 iommu_flush_write_buffer(iommu);
2900
2901                 ret = dmar_set_interrupt(iommu);
2902                 if (ret)
2903                         goto free_iommu;
2904
2905                 iommu_set_root_entry(iommu);
2906
2907                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2908                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2909                 iommu_enable_translation(iommu);
2910                 iommu_disable_protect_mem_regions(iommu);
2911         }
2912
2913         return 0;
2914
2915 free_iommu:
2916         for_each_active_iommu(iommu, drhd) {
2917                 disable_dmar_iommu(iommu);
2918                 free_dmar_iommu(iommu);
2919         }
2920         kfree(deferred_flush);
2921 free_g_iommus:
2922         kfree(g_iommus);
2923 error:
2924         return ret;
2925 }
2926
2927 /* This takes a number of _MM_ pages, not VTD pages */
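/* Callers with VTD page counts convert them via dma_to_mm_pfn(),
   see __intel_map_single() below. */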
2928 static struct iova *intel_alloc_iova(struct device *dev,
2929                                      struct dmar_domain *domain,
2930                                      unsigned long nrpages, uint64_t dma_mask)
2931 {
2932         struct iova *iova = NULL;
2933
2934         /* Restrict dma_mask to the width that the iommu can handle */
2935         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2936
2937         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2938                 /*
2939                  * First try to allocate an io virtual address in
2940                  * DMA_BIT_MASK(32) and if that fails then try allocating
2941                  * from the higher range.
2942                  */
2943                 iova = alloc_iova(&domain->iovad, nrpages,
2944                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2945                 if (iova)
2946                         return iova;
2947         }
2948         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2949         if (unlikely(!iova)) {
2950                 printk(KERN_ERR "Allocating %ld-page iova for %s failed\n",
2951                        nrpages, dev_name(dev));
2952                 return NULL;
2953         }
2954
2955         return iova;
2956 }
2957
2958 static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
2959 {
2960         struct dmar_domain *domain;
2961         int ret;
2962
2963         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2964         if (!domain) {
2965                 printk(KERN_ERR "Allocating domain for %s failed\n",
2966                        dev_name(dev));
2967                 return NULL;
2968         }
2969
2970         /* make sure context mapping is ok */
2971         if (unlikely(!domain_context_mapped(dev))) {
2972                 ret = domain_context_mapping(domain, dev, CONTEXT_TT_MULTI_LEVEL);
2973                 if (ret) {
2974                         printk(KERN_ERR "Domain context map for %s failed\n",
2975                                dev_name(dev));
2976                         return NULL;
2977                 }
2978         }
2979
2980         return domain;
2981 }
2982
2983 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
2984 {
2985         struct device_domain_info *info;
2986
2987         /* No lock here, assumes no domain exit in normal case */
2988         info = dev->archdata.iommu;
2989         if (likely(info))
2990                 return info->domain;
2991
2992         return __get_valid_domain_for_dev(dev);
2993 }
2994
2995 /* Check if the dev needs to go through the non-identity map and unmap process. */
2996 static int iommu_no_mapping(struct device *dev)
2997 {
2998         int found;
2999
3000         if (iommu_dummy(dev))
3001                 return 1;
3002
3003         if (!iommu_identity_mapping)
3004                 return 0;
3005
3006         found = identity_mapping(dev);
3007         if (found) {
3008                 if (iommu_should_identity_map(dev, 0))
3009                         return 1;
3010                 else {
3011                         /*
3012                          * The 32 bit DMA device is removed from si_domain,
3013                          * so fall back to non-identity mapping.
3014                          */
3015                         domain_remove_one_dev_info(si_domain, dev);
3016                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
3017                                dev_name(dev));
3018                         return 0;
3019                 }
3020         } else {
3021                 /*
3022                  * In case a 64 bit DMA device has been detached from a VM,
3023                  * the device is put back into si_domain for identity mapping.
3024                  */
3025                 if (iommu_should_identity_map(dev, 0)) {
3026                         int ret;
3027                         ret = domain_add_dev_info(si_domain, dev,
3028                                                   hw_pass_through ?
3029                                                   CONTEXT_TT_PASS_THROUGH :
3030                                                   CONTEXT_TT_MULTI_LEVEL);
3031                         if (!ret) {
3032                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
3033                                        dev_name(dev));
3034                                 return 1;
3035                         }
3036                 }
3037         }
3038
3039         return 0;
3040 }
3041
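/*
 * Map one physically contiguous buffer for DMA: allocate an IOVA range big
 * enough for the (page-aligned) request, install the page table entries with
 * the protection implied by @dir, flush the IOTLB (or just the write buffer
 * when not in caching mode), and return the resulting bus address including
 * the offset into the first page.  Returns 0 on failure, or @paddr untouched
 * for devices that bypass translation.
 */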
3042 static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
3043                                      size_t size, int dir, u64 dma_mask)
3044 {
3045         struct dmar_domain *domain;
3046         phys_addr_t start_paddr;
3047         struct iova *iova;
3048         int prot = 0;
3049         int ret;
3050         struct intel_iommu *iommu;
3051         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
3052
3053         BUG_ON(dir == DMA_NONE);
3054
3055         if (iommu_no_mapping(dev))
3056                 return paddr;
3057
3058         domain = get_valid_domain_for_dev(dev);
3059         if (!domain)
3060                 return 0;
3061
3062         iommu = domain_get_iommu(domain);
3063         size = aligned_nrpages(paddr, size);
3064
3065         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
3066         if (!iova)
3067                 goto error;
3068
3069         /*
3070          * Check if DMAR supports zero-length reads on write-only
3071          * mappings.
3072          */
3073         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3074                         !cap_zlr(iommu->cap))
3075                 prot |= DMA_PTE_READ;
3076         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3077                 prot |= DMA_PTE_WRITE;
3078         /*
3079          * paddr .. (paddr + size) might cover a partial page, so we should
3080          * map the whole page.  Note: if two parts of one page are mapped
3081          * separately, we might have two guest_addr mappings to the same host
3082          * paddr, but this is not a big problem.
3083          */
3084         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
3085                                  mm_to_dma_pfn(paddr_pfn), size, prot);
3086         if (ret)
3087                 goto error;
3088
3089         /* it's a non-present to present mapping. Only flush if caching mode */
3090         if (cap_caching_mode(iommu->cap))
3091                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 0, 1);
3092         else
3093                 iommu_flush_write_buffer(iommu);
3094
3095         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
3096         start_paddr += paddr & ~PAGE_MASK;
3097         return start_paddr;
3098
3099 error:
3100         if (iova)
3101                 __free_iova(&domain->iovad, iova);
3102         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
3103                 dev_name(dev), size, (unsigned long long)paddr, dir);
3104         return 0;
3105 }
3106
3107 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3108                                  unsigned long offset, size_t size,
3109                                  enum dma_data_direction dir,
3110                                  struct dma_attrs *attrs)
3111 {
3112         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3113                                   dir, *dev->dma_mask);
3114 }
3115
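/*
 * Drain the per-IOMMU deferred-unmap queues.  In caching mode each queued
 * entry gets a page-selective IOTLB invalidation; otherwise one global IOTLB
 * flush per IOMMU is issued and only the device IOTLBs are invalidated per
 * entry.  The queued IOVAs and page-table freelists are then released.
 * Called with async_umap_flush_lock held.
 */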
3116 static void flush_unmaps(void)
3117 {
3118         int i, j;
3119
3120         timer_on = 0;
3121
3122         /* just flush them all */
3123         for (i = 0; i < g_num_of_iommus; i++) {
3124                 struct intel_iommu *iommu = g_iommus[i];
3125                 if (!iommu)
3126                         continue;
3127
3128                 if (!deferred_flush[i].next)
3129                         continue;
3130
3131                 /* In caching mode, global flushes make emulation expensive */
3132                 if (!cap_caching_mode(iommu->cap))
3133                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3134                                          DMA_TLB_GLOBAL_FLUSH);
3135                 for (j = 0; j < deferred_flush[i].next; j++) {
3136                         unsigned long mask;
3137                         struct iova *iova = deferred_flush[i].iova[j];
3138                         struct dmar_domain *domain = deferred_flush[i].domain[j];
3139
3140                         /* On real hardware multiple invalidations are expensive */
3141                         if (cap_caching_mode(iommu->cap))
3142                                 iommu_flush_iotlb_psi(iommu, domain->id,
3143                                         iova->pfn_lo, iova_size(iova),
3144                                         !deferred_flush[i].freelist[j], 0);
3145                         else {
3146                                 mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
3147                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
3148                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
3149                         }
3150                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
3151                         if (deferred_flush[i].freelist[j])
3152                                 dma_free_pagelist(deferred_flush[i].freelist[j]);
3153                 }
3154                 deferred_flush[i].next = 0;
3155         }
3156
3157         list_size = 0;
3158 }
3159
3160 static void flush_unmaps_timeout(unsigned long data)
3161 {
3162         unsigned long flags;
3163
3164         spin_lock_irqsave(&async_umap_flush_lock, flags);
3165         flush_unmaps();
3166         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3167 }
3168
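/*
 * Queue a just-unmapped IOVA (plus the page-table pages collected in
 * @freelist) on the owning IOMMU's deferred-flush list.  The queue is drained
 * when it reaches HIGH_WATER_MARK entries or when the 10ms unmap_timer fires.
 */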
3169 static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
3170 {
3171         unsigned long flags;
3172         int next, iommu_id;
3173         struct intel_iommu *iommu;
3174
3175         spin_lock_irqsave(&async_umap_flush_lock, flags);
3176         if (list_size == HIGH_WATER_MARK)
3177                 flush_unmaps();
3178
3179         iommu = domain_get_iommu(dom);
3180         iommu_id = iommu->seq_id;
3181
3182         next = deferred_flush[iommu_id].next;
3183         deferred_flush[iommu_id].domain[next] = dom;
3184         deferred_flush[iommu_id].iova[next] = iova;
3185         deferred_flush[iommu_id].freelist[next] = freelist;
3186         deferred_flush[iommu_id].next++;
3187
3188         if (!timer_on) {
3189                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
3190                 timer_on = 1;
3191         }
3192         list_size++;
3193         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3194 }
3195
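/*
 * Tear down the mapping that starts at @dev_addr: clear the page table
 * entries and either flush and free immediately (intel_iommu_strict) or hand
 * the IOVA and freelist to add_unmap() for batched, deferred flushing.
 */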
3196 static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
3197 {
3198         struct dmar_domain *domain;
3199         unsigned long start_pfn, last_pfn;
3200         struct iova *iova;
3201         struct intel_iommu *iommu;
3202         struct page *freelist;
3203
3204         if (iommu_no_mapping(dev))
3205                 return;
3206
3207         domain = find_domain(dev);
3208         BUG_ON(!domain);
3209
3210         iommu = domain_get_iommu(domain);
3211
3212         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
3213         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
3214                       (unsigned long long)dev_addr))
3215                 return;
3216
3217         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3218         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3219
3220         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
3221                  dev_name(dev), start_pfn, last_pfn);
3222
3223         freelist = domain_unmap(domain, start_pfn, last_pfn);
3224
3225         if (intel_iommu_strict) {
3226                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3227                                       last_pfn - start_pfn + 1, !freelist, 0);
3228                 /* free iova */
3229                 __free_iova(&domain->iovad, iova);
3230                 dma_free_pagelist(freelist);
3231         } else {
3232                 add_unmap(domain, iova, freelist);
3233                 /*
3234                  * queue up the release of the unmap to save the 1/6th of the
3235                  * cpu used up by the iotlb flush operation...
3236                  */
3237         }
3238 }
3239
3240 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
3241                              size_t size, enum dma_data_direction dir,
3242                              struct dma_attrs *attrs)
3243 {
3244         intel_unmap(dev, dev_addr);
3245 }
3246
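/*
 * dma_alloc_coherent() backend: allocate pages (from CMA when possible),
 * zero them and map them bidirectionally via __intel_map_single().  The
 * GFP_DMA/GFP_DMA32 hints are only honoured when the device bypasses
 * translation, since otherwise the IOVA allocator already respects the
 * device's coherent DMA mask.
 */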
3247 static void *intel_alloc_coherent(struct device *dev, size_t size,
3248                                   dma_addr_t *dma_handle, gfp_t flags,
3249                                   struct dma_attrs *attrs)
3250 {
3251         struct page *page = NULL;
3252         int order;
3253
3254         size = PAGE_ALIGN(size);
3255         order = get_order(size);
3256
3257         if (!iommu_no_mapping(dev))
3258                 flags &= ~(GFP_DMA | GFP_DMA32);
3259         else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
3260                 if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
3261                         flags |= GFP_DMA;
3262                 else
3263                         flags |= GFP_DMA32;
3264         }
3265
3266         if (flags & __GFP_WAIT) {
3267                 unsigned int count = size >> PAGE_SHIFT;
3268
3269                 page = dma_alloc_from_contiguous(dev, count, order);
3270                 if (page && iommu_no_mapping(dev) &&
3271                     page_to_phys(page) + size > dev->coherent_dma_mask) {
3272                         dma_release_from_contiguous(dev, page, count);
3273                         page = NULL;
3274                 }
3275         }
3276
3277         if (!page)
3278                 page = alloc_pages(flags, order);
3279         if (!page)
3280                 return NULL;
3281         memset(page_address(page), 0, size);
3282
3283         *dma_handle = __intel_map_single(dev, page_to_phys(page), size,
3284                                          DMA_BIDIRECTIONAL,
3285                                          dev->coherent_dma_mask);
3286         if (*dma_handle)
3287                 return page_address(page);
3288         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3289                 __free_pages(page, order);
3290
3291         return NULL;
3292 }
3293
3294 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3295                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3296 {
3297         int order;
3298         struct page *page = virt_to_page(vaddr);
3299
3300         size = PAGE_ALIGN(size);
3301         order = get_order(size);
3302
3303         intel_unmap(dev, dma_handle);
3304         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3305                 __free_pages(page, order);
3306 }
3307
3308 static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
3309                            int nelems, enum dma_data_direction dir,
3310                            struct dma_attrs *attrs)
3311 {
3312         intel_unmap(dev, sglist[0].dma_address);
3313 }
3314
3315 static int intel_nontranslate_map_sg(struct device *hddev,
3316         struct scatterlist *sglist, int nelems, int dir)
3317 {
3318         int i;
3319         struct scatterlist *sg;
3320
3321         for_each_sg(sglist, sg, nelems, i) {
3322                 BUG_ON(!sg_page(sg));
3323                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3324                 sg->dma_length = sg->length;
3325         }
3326         return nelems;
3327 }
3328
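/*
 * Map a scatterlist using a single IOVA allocation that covers all segments,
 * then fill in the page tables segment by segment via domain_sg_mapping().
 * Devices that bypass translation get a plain 1:1 physical mapping instead.
 */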
3329 static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
3330                         enum dma_data_direction dir, struct dma_attrs *attrs)
3331 {
3332         int i;
3333         struct dmar_domain *domain;
3334         size_t size = 0;
3335         int prot = 0;
3336         struct iova *iova = NULL;
3337         int ret;
3338         struct scatterlist *sg;
3339         unsigned long start_vpfn;
3340         struct intel_iommu *iommu;
3341
3342         BUG_ON(dir == DMA_NONE);
3343         if (iommu_no_mapping(dev))
3344                 return intel_nontranslate_map_sg(dev, sglist, nelems, dir);
3345
3346         domain = get_valid_domain_for_dev(dev);
3347         if (!domain)
3348                 return 0;
3349
3350         iommu = domain_get_iommu(domain);
3351
3352         for_each_sg(sglist, sg, nelems, i)
3353                 size += aligned_nrpages(sg->offset, sg->length);
3354
3355         iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
3356                                 *dev->dma_mask);
3357         if (!iova) {
3358                 sglist->dma_length = 0;
3359                 return 0;
3360         }
3361
3362         /*
3363          * Check if DMAR supports zero-length reads on write-only
3364          * mappings.
3365          */
3366         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3367                         !cap_zlr(iommu->cap))
3368                 prot |= DMA_PTE_READ;
3369         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3370                 prot |= DMA_PTE_WRITE;
3371
3372         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3373
3374         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3375         if (unlikely(ret)) {
3376                 dma_pte_free_pagetable(domain, start_vpfn,
3377                                        start_vpfn + size - 1);
3378                 __free_iova(&domain->iovad, iova);
3379                 return 0;
3380         }
3381
3382         /* it's a non-present to present mapping. Only flush if caching mode */
3383         if (cap_caching_mode(iommu->cap))
3384                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 0, 1);
3385         else
3386                 iommu_flush_write_buffer(iommu);
3387
3388         return nelems;
3389 }
3390
3391 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3392 {
3393         return !dma_addr;
3394 }
3395
3396 struct dma_map_ops intel_dma_ops = {
3397         .alloc = intel_alloc_coherent,
3398         .free = intel_free_coherent,
3399         .map_sg = intel_map_sg,
3400         .unmap_sg = intel_unmap_sg,
3401         .map_page = intel_map_page,
3402         .unmap_page = intel_unmap_page,
3403         .mapping_error = intel_mapping_error,
3404 };
3405
3406 static inline int iommu_domain_cache_init(void)
3407 {
3408         int ret = 0;
3409
3410         iommu_domain_cache = kmem_cache_create("iommu_domain",
3411                                          sizeof(struct dmar_domain),
3412                                          0,
3413                                          SLAB_HWCACHE_ALIGN,
3414                                          NULL);
3415
3416         if (!iommu_domain_cache) {
3417                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3418                 ret = -ENOMEM;
3419         }
3420
3421         return ret;
3422 }
3423
3424 static inline int iommu_devinfo_cache_init(void)
3425 {
3426         int ret = 0;
3427
3428         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3429                                          sizeof(struct device_domain_info),
3430                                          0,
3431                                          SLAB_HWCACHE_ALIGN,
3432                                          NULL);
3433         if (!iommu_devinfo_cache) {
3434                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3435                 ret = -ENOMEM;
3436         }
3437
3438         return ret;
3439 }
3440
3441 static int __init iommu_init_mempool(void)
3442 {
3443         int ret;
3444         ret = iommu_iova_cache_init();
3445         if (ret)
3446                 return ret;
3447
3448         ret = iommu_domain_cache_init();
3449         if (ret)
3450                 goto domain_error;
3451
3452         ret = iommu_devinfo_cache_init();
3453         if (!ret)
3454                 return ret;
3455
3456         kmem_cache_destroy(iommu_domain_cache);
3457 domain_error:
3458         iommu_iova_cache_destroy();
3459
3460         return -ENOMEM;
3461 }
3462
3463 static void __init iommu_exit_mempool(void)
3464 {
3465         kmem_cache_destroy(iommu_devinfo_cache);
3466         kmem_cache_destroy(iommu_domain_cache);
3467         iommu_iova_cache_destroy();
3468 }
3469
3470 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3471 {
3472         struct dmar_drhd_unit *drhd;
3473         u32 vtbar;
3474         int rc;
3475
3476         /* We know that this device on this chipset has its own IOMMU.
3477          * If we find it under a different IOMMU, then the BIOS is lying
3478          * to us. Hope that the IOMMU for this device is actually
3479          * disabled, and it needs no translation...
3480          */
3481         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3482         if (rc) {
3483                 /* "can't" happen */
3484                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3485                 return;
3486         }
3487         vtbar &= 0xffff0000;
3488
3489         /* we know that this iommu should be at offset 0xa000 from vtbar */
3490         drhd = dmar_find_matched_drhd_unit(pdev);
3491         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3492                             TAINT_FIRMWARE_WORKAROUND,
3493                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3494                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3495 }
3496 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3497
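/*
 * Mark DRHD units we do not need to handle: units whose device scope turned
 * out to be empty, and - unless dmar_map_gfx - units that cover nothing but
 * graphics devices, whose devices are then flagged with
 * DUMMY_DEVICE_DOMAIN_INFO so they skip translation entirely.
 */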
3498 static void __init init_no_remapping_devices(void)
3499 {
3500         struct dmar_drhd_unit *drhd;
3501         struct device *dev;
3502         int i;
3503
3504         for_each_drhd_unit(drhd) {
3505                 if (!drhd->include_all) {
3506                         for_each_active_dev_scope(drhd->devices,
3507                                                   drhd->devices_cnt, i, dev)
3508                                 break;
3509                         /* ignore DMAR unit if no devices exist */
3510                         if (i == drhd->devices_cnt)
3511                                 drhd->ignored = 1;
3512                 }
3513         }
3514
3515         for_each_active_drhd_unit(drhd) {
3516                 if (drhd->include_all)
3517                         continue;
3518
3519                 for_each_active_dev_scope(drhd->devices,
3520                                           drhd->devices_cnt, i, dev)
3521                         if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3522                                 break;
3523                 if (i < drhd->devices_cnt)
3524                         continue;
3525
3526                 /* This IOMMU has *only* gfx devices. Either bypass it or
3527                    set the gfx_mapped flag, as appropriate */
3528                 if (dmar_map_gfx) {
3529                         intel_iommu_gfx_mapped = 1;
3530                 } else {
3531                         drhd->ignored = 1;
3532                         for_each_active_dev_scope(drhd->devices,
3533                                                   drhd->devices_cnt, i, dev)
3534                                 dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3535                 }
3536         }
3537 }
3538
3539 #ifdef CONFIG_SUSPEND
3540 static int init_iommu_hw(void)
3541 {
3542         struct dmar_drhd_unit *drhd;
3543         struct intel_iommu *iommu = NULL;
3544
3545         for_each_active_iommu(iommu, drhd)
3546                 if (iommu->qi)
3547                         dmar_reenable_qi(iommu);
3548
3549         for_each_iommu(iommu, drhd) {
3550                 if (drhd->ignored) {
3551                         /*
3552                          * we always have to disable PMRs or DMA may fail on
3553                          * this device
3554                          */
3555                         if (force_on)
3556                                 iommu_disable_protect_mem_regions(iommu);
3557                         continue;
3558                 }
3559
3560                 iommu_flush_write_buffer(iommu);
3561
3562                 iommu_set_root_entry(iommu);
3563
3564                 iommu->flush.flush_context(iommu, 0, 0, 0,
3565                                            DMA_CCMD_GLOBAL_INVL);
3566                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3567                 iommu_enable_translation(iommu);
3568                 iommu_disable_protect_mem_regions(iommu);
3569         }
3570
3571         return 0;
3572 }
3573
3574 static void iommu_flush_all(void)
3575 {
3576         struct dmar_drhd_unit *drhd;
3577         struct intel_iommu *iommu;
3578
3579         for_each_active_iommu(iommu, drhd) {
3580                 iommu->flush.flush_context(iommu, 0, 0, 0,
3581                                            DMA_CCMD_GLOBAL_INVL);
3582                 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3583                                          DMA_TLB_GLOBAL_FLUSH);
3584         }
3585 }
3586
3587 static int iommu_suspend(void)
3588 {
3589         struct dmar_drhd_unit *drhd;
3590         struct intel_iommu *iommu = NULL;
3591         unsigned long flag;
3592
3593         for_each_active_iommu(iommu, drhd) {
3594                 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3595                                                  GFP_ATOMIC);
3596                 if (!iommu->iommu_state)
3597                         goto nomem;
3598         }
3599
3600         iommu_flush_all();
3601
3602         for_each_active_iommu(iommu, drhd) {
3603                 iommu_disable_translation(iommu);
3604
3605                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3606
3607                 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3608                         readl(iommu->reg + DMAR_FECTL_REG);
3609                 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3610                         readl(iommu->reg + DMAR_FEDATA_REG);
3611                 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3612                         readl(iommu->reg + DMAR_FEADDR_REG);
3613                 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3614                         readl(iommu->reg + DMAR_FEUADDR_REG);
3615
3616                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3617         }
3618         return 0;
3619
3620 nomem:
3621         for_each_active_iommu(iommu, drhd)
3622                 kfree(iommu->iommu_state);
3623
3624         return -ENOMEM;
3625 }
3626
3627 static void iommu_resume(void)
3628 {
3629         struct dmar_drhd_unit *drhd;
3630         struct intel_iommu *iommu = NULL;
3631         unsigned long flag;
3632
3633         if (init_iommu_hw()) {
3634                 if (force_on)
3635                         panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3636                 else
3637                         WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3638                 return;
3639         }
3640
3641         for_each_active_iommu(iommu, drhd) {
3642
3643                 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3644
3645                 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3646                         iommu->reg + DMAR_FECTL_REG);
3647                 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3648                         iommu->reg + DMAR_FEDATA_REG);
3649                 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3650                         iommu->reg + DMAR_FEADDR_REG);
3651                 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3652                         iommu->reg + DMAR_FEUADDR_REG);
3653
3654                 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3655         }
3656
3657         for_each_active_iommu(iommu, drhd)
3658                 kfree(iommu->iommu_state);
3659 }
3660
3661 static struct syscore_ops iommu_syscore_ops = {
3662         .resume         = iommu_resume,
3663         .suspend        = iommu_suspend,
3664 };
3665
3666 static void __init init_iommu_pm_ops(void)
3667 {
3668         register_syscore_ops(&iommu_syscore_ops);
3669 }
3670
3671 #else
3672 static inline void init_iommu_pm_ops(void) {}
3673 #endif  /* CONFIG_SUSPEND */
3674
3675
3676 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3677 {
3678         struct acpi_dmar_reserved_memory *rmrr;
3679         struct dmar_rmrr_unit *rmrru;
3680
3681         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3682         if (!rmrru)
3683                 return -ENOMEM;
3684
3685         rmrru->hdr = header;
3686         rmrr = (struct acpi_dmar_reserved_memory *)header;
3687         rmrru->base_address = rmrr->base_address;
3688         rmrru->end_address = rmrr->end_address;
3689         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3690                                 ((void *)rmrr) + rmrr->header.length,
3691                                 &rmrru->devices_cnt);
3692         if (rmrru->devices_cnt && rmrru->devices == NULL) {
3693                 kfree(rmrru);
3694                 return -ENOMEM;
3695         }
3696
3697         list_add(&rmrru->list, &dmar_rmrr_units);
3698
3699         return 0;
3700 }
3701
3702 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3703 {
3704         struct dmar_atsr_unit *atsru;
3705         struct acpi_dmar_atsr *tmp;
3706
3707         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3708                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3709                 if (atsr->segment != tmp->segment)
3710                         continue;
3711                 if (atsr->header.length != tmp->header.length)
3712                         continue;
3713                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
3714                         return atsru;
3715         }
3716
3717         return NULL;
3718 }
3719
3720 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3721 {
3722         struct acpi_dmar_atsr *atsr;
3723         struct dmar_atsr_unit *atsru;
3724
3725         if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
3726                 return 0;
3727
3728         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3729         atsru = dmar_find_atsr(atsr);
3730         if (atsru)
3731                 return 0;
3732
3733         atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3734         if (!atsru)
3735                 return -ENOMEM;
3736
3737         /*
3738          * If memory is allocated from slab by ACPI _DSM method, we need to
3739          * copy the memory content because the memory buffer will be freed
3740          * on return.
3741          */
3742         atsru->hdr = (void *)(atsru + 1);
3743         memcpy(atsru->hdr, hdr, hdr->length);
3744         atsru->include_all = atsr->flags & 0x1;
3745         if (!atsru->include_all) {
3746                 atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3747                                 (void *)atsr + atsr->header.length,
3748                                 &atsru->devices_cnt);
3749                 if (atsru->devices_cnt && atsru->devices == NULL) {
3750                         kfree(atsru);
3751                         return -ENOMEM;
3752                 }
3753         }
3754
3755         list_add_rcu(&atsru->list, &dmar_atsr_units);
3756
3757         return 0;
3758 }
3759
3760 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3761 {
3762         dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3763         kfree(atsru);
3764 }
3765
3766 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3767 {
3768         struct acpi_dmar_atsr *atsr;
3769         struct dmar_atsr_unit *atsru;
3770
3771         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3772         atsru = dmar_find_atsr(atsr);
3773         if (atsru) {
3774                 list_del_rcu(&atsru->list);
3775                 synchronize_rcu();
3776                 intel_iommu_free_atsr(atsru);
3777         }
3778
3779         return 0;
3780 }
3781
3782 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3783 {
3784         int i;
3785         struct device *dev;
3786         struct acpi_dmar_atsr *atsr;
3787         struct dmar_atsr_unit *atsru;
3788
3789         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3790         atsru = dmar_find_atsr(atsr);
3791         if (!atsru)
3792                 return 0;
3793
3794         if (!atsru->include_all && atsru->devices && atsru->devices_cnt)
3795                 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3796                                           i, dev)
3797                         return -EBUSY;
3798
3799         return 0;
3800 }
3801
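/*
 * Bring up a hot-added DMAR unit: verify it supports the features the
 * running configuration already relies on (pass-through, snooping, large
 * pages), allocate its domain bookkeeping and root entry, program and enable
 * it, and attach it to si_domain when identity mapping is in use.
 */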
3802 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3803 {
3804         int sp, ret = 0;
3805         struct intel_iommu *iommu = dmaru->iommu;
3806
3807         if (g_iommus[iommu->seq_id])
3808                 return 0;
3809
3810         if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3811                 pr_warn("IOMMU: %s doesn't support hardware pass through.\n",
3812                         iommu->name);
3813                 return -ENXIO;
3814         }
3815         if (!ecap_sc_support(iommu->ecap) &&
3816             domain_update_iommu_snooping(iommu)) {
3817                 pr_warn("IOMMU: %s doesn't support snooping.\n",
3818                         iommu->name);
3819                 return -ENXIO;
3820         }
3821         sp = domain_update_iommu_superpage(iommu) - 1;
3822         if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3823                 pr_warn("IOMMU: %s doesn't support large page.\n",
3824                         iommu->name);
3825                 return -ENXIO;
3826         }
3827
3828         /*
3829          * Disable translation if already enabled prior to OS handover.
3830          */
3831         if (iommu->gcmd & DMA_GCMD_TE)
3832                 iommu_disable_translation(iommu);
3833
3834         g_iommus[iommu->seq_id] = iommu;
3835         ret = iommu_init_domains(iommu);
3836         if (ret == 0)
3837                 ret = iommu_alloc_root_entry(iommu);
3838         if (ret)
3839                 goto out;
3840
3841         if (dmaru->ignored) {
3842                 /*
3843                  * we always have to disable PMRs or DMA may fail on this device
3844                  */
3845                 if (force_on)
3846                         iommu_disable_protect_mem_regions(iommu);
3847                 return 0;
3848         }
3849
3850         intel_iommu_init_qi(iommu);
3851         iommu_flush_write_buffer(iommu);
3852         ret = dmar_set_interrupt(iommu);
3853         if (ret)
3854                 goto disable_iommu;
3855
3856         iommu_set_root_entry(iommu);
3857         iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
3858         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
3859         iommu_enable_translation(iommu);
3860
3861         if (si_domain) {
3862                 ret = iommu_attach_domain(si_domain, iommu);
3863                 if (ret < 0 || si_domain->id != ret)
3864                         goto disable_iommu;
3865                 domain_attach_iommu(si_domain, iommu);
3866         }
3867
3868         iommu_disable_protect_mem_regions(iommu);
3869         return 0;
3870
3871 disable_iommu:
3872         disable_dmar_iommu(iommu);
3873 out:
3874         free_dmar_iommu(iommu);
3875         return ret;
3876 }
3877
3878 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3879 {
3880         int ret = 0;
3881         struct intel_iommu *iommu = dmaru->iommu;
3882
3883         if (!intel_iommu_enabled)
3884                 return 0;
3885         if (iommu == NULL)
3886                 return -EINVAL;
3887
3888         if (insert) {
3889                 ret = intel_iommu_add(dmaru);
3890         } else {
3891                 disable_dmar_iommu(iommu);
3892                 free_dmar_iommu(iommu);
3893         }
3894
3895         return ret;
3896 }
3897
3898 static void intel_iommu_free_dmars(void)
3899 {
3900         struct dmar_rmrr_unit *rmrru, *rmrr_n;
3901         struct dmar_atsr_unit *atsru, *atsr_n;
3902
3903         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3904                 list_del(&rmrru->list);
3905                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3906                 kfree(rmrru);
3907         }
3908
3909         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3910                 list_del(&atsru->list);
3911                 intel_iommu_free_atsr(atsru);
3912         }
3913 }
3914
3915 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3916 {
3917         int i, ret = 1;
3918         struct pci_bus *bus;
3919         struct pci_dev *bridge = NULL;
3920         struct device *tmp;
3921         struct acpi_dmar_atsr *atsr;
3922         struct dmar_atsr_unit *atsru;
3923
3924         dev = pci_physfn(dev);
3925         for (bus = dev->bus; bus; bus = bus->parent) {
3926                 bridge = bus->self;
3927                 if (!bridge || !pci_is_pcie(bridge) ||
3928                     pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3929                         return 0;
3930                 if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3931                         break;
3932         }
3933         if (!bridge)
3934                 return 0;
3935
3936         rcu_read_lock();
3937         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3938                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3939                 if (atsr->segment != pci_domain_nr(dev->bus))
3940                         continue;
3941
3942                 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3943                         if (tmp == &bridge->dev)
3944                                 goto out;
3945
3946                 if (atsru->include_all)
3947                         goto out;
3948         }
3949         ret = 0;
3950 out:
3951         rcu_read_unlock();
3952
3953         return ret;
3954 }
3955
3956 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3957 {
3958         int ret = 0;
3959         struct dmar_rmrr_unit *rmrru;
3960         struct dmar_atsr_unit *atsru;
3961         struct acpi_dmar_atsr *atsr;
3962         struct acpi_dmar_reserved_memory *rmrr;
3963
3964         if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
3965                 return 0;
3966
3967         list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3968                 rmrr = container_of(rmrru->hdr,
3969                                     struct acpi_dmar_reserved_memory, header);
3970                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3971                         ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3972                                 ((void *)rmrr) + rmrr->header.length,
3973                                 rmrr->segment, rmrru->devices,
3974                                 rmrru->devices_cnt);
3975                         if (ret < 0)
3976                                 return ret;
3977                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3978                         dmar_remove_dev_scope(info, rmrr->segment,
3979                                 rmrru->devices, rmrru->devices_cnt);
3980                 }
3981         }
3982
3983         list_for_each_entry(atsru, &dmar_atsr_units, list) {
3984                 if (atsru->include_all)
3985                         continue;
3986
3987                 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3988                 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3989                         ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3990                                         (void *)atsr + atsr->header.length,
3991                                         atsr->segment, atsru->devices,
3992                                         atsru->devices_cnt);
3993                         if (ret > 0)
3994                                 break;
3995                         else if (ret < 0)
3996                                 return ret;
3997                 } else if (info->event == BUS_NOTIFY_DEL_DEVICE) {
3998                         if (dmar_remove_dev_scope(info, atsr->segment,
3999                                         atsru->devices, atsru->devices_cnt))
4000                                 break;
4001                 }
4002         }
4003
4004         return 0;
4005 }
4006
4007 /*
4008  * Here we only respond to a device being unbound from its driver.
4009  *
4010  * A newly added device is not attached to its DMAR domain here yet. That will
4011  * happen when the device is mapped to an iova.
4012  */
4013 static int device_notifier(struct notifier_block *nb,
4014                                   unsigned long action, void *data)
4015 {
4016         struct device *dev = data;
4017         struct dmar_domain *domain;
4018
4019         if (iommu_dummy(dev))
4020                 return 0;
4021
4022         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4023                 return 0;
4024
4025         domain = find_domain(dev);
4026         if (!domain)
4027                 return 0;
4028
4029         down_read(&dmar_global_lock);
4030         domain_remove_one_dev_info(domain, dev);
4031         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4032                 domain_exit(domain);
4033         up_read(&dmar_global_lock);
4034
4035         return 0;
4036 }
4037
4038 static struct notifier_block device_nb = {
4039         .notifier_call = device_notifier,
4040 };
4041
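/*
 * Keep the static identity map (si_domain) in sync with memory hotplug: add
 * an identity mapping for memory going online, and unmap it again - with an
 * IOTLB flush on every active IOMMU - when memory goes offline or onlining
 * is cancelled.
 */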
4042 static int intel_iommu_memory_notifier(struct notifier_block *nb,
4043                                        unsigned long val, void *v)
4044 {
4045         struct memory_notify *mhp = v;
4046         unsigned long long start, end;
4047         unsigned long start_vpfn, last_vpfn;
4048
4049         switch (val) {
4050         case MEM_GOING_ONLINE:
4051                 start = mhp->start_pfn << PAGE_SHIFT;
4052                 end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
4053                 if (iommu_domain_identity_map(si_domain, start, end)) {
4054                         pr_warn("dmar: failed to build identity map for [%llx-%llx]\n",
4055                                 start, end);
4056                         return NOTIFY_BAD;
4057                 }
4058                 break;
4059
4060         case MEM_OFFLINE:
4061         case MEM_CANCEL_ONLINE:
4062                 start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
4063                 last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
4064                 while (start_vpfn <= last_vpfn) {
4065                         struct iova *iova;
4066                         struct dmar_drhd_unit *drhd;
4067                         struct intel_iommu *iommu;
4068                         struct page *freelist;
4069
4070                         iova = find_iova(&si_domain->iovad, start_vpfn);
4071                         if (iova == NULL) {
4072                                 pr_debug("dmar: failed to get IOVA for PFN %lx\n",
4073                                          start_vpfn);
4074                                 break;
4075                         }
4076
4077                         iova = split_and_remove_iova(&si_domain->iovad, iova,
4078                                                      start_vpfn, last_vpfn);
4079                         if (iova == NULL) {
4080                                 pr_warn("dmar: failed to split IOVA PFN [%lx-%lx]\n",
4081                                         start_vpfn, last_vpfn);
4082                                 return NOTIFY_BAD;
4083                         }
4084
4085                         freelist = domain_unmap(si_domain, iova->pfn_lo,
4086                                                iova->pfn_hi);
4087
4088                         rcu_read_lock();
4089                         for_each_active_iommu(iommu, drhd)
4090                                 iommu_flush_iotlb_psi(iommu, si_domain->id,
4091                                         iova->pfn_lo, iova_size(iova),
4092                                         !freelist, 0);
4093                         rcu_read_unlock();
4094                         dma_free_pagelist(freelist);
4095
4096                         start_vpfn = iova->pfn_hi + 1;
4097                         free_iova_mem(iova);
4098                 }
4099                 break;
4100         }
4101
4102         return NOTIFY_OK;
4103 }
4104
4105 static struct notifier_block intel_iommu_memory_nb = {
4106         .notifier_call = intel_iommu_memory_notifier,
4107         .priority = 0
4108 };
4109
4110
4111 static ssize_t intel_iommu_show_version(struct device *dev,
4112                                         struct device_attribute *attr,
4113                                         char *buf)
4114 {
4115         struct intel_iommu *iommu = dev_get_drvdata(dev);
4116         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4117         return sprintf(buf, "%d:%d\n",
4118                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4119 }
4120 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4121
4122 static ssize_t intel_iommu_show_address(struct device *dev,
4123                                         struct device_attribute *attr,
4124                                         char *buf)
4125 {
4126         struct intel_iommu *iommu = dev_get_drvdata(dev);
4127         return sprintf(buf, "%llx\n", iommu->reg_phys);
4128 }
4129 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4130
4131 static ssize_t intel_iommu_show_cap(struct device *dev,
4132                                     struct device_attribute *attr,
4133                                     char *buf)
4134 {
4135         struct intel_iommu *iommu = dev_get_drvdata(dev);
4136         return sprintf(buf, "%llx\n", iommu->cap);
4137 }
4138 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4139
4140 static ssize_t intel_iommu_show_ecap(struct device *dev,
4141                                     struct device_attribute *attr,
4142                                     char *buf)
4143 {
4144         struct intel_iommu *iommu = dev_get_drvdata(dev);
4145         return sprintf(buf, "%llx\n", iommu->ecap);
4146 }
4147 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4148
4149 static struct attribute *intel_iommu_attrs[] = {
4150         &dev_attr_version.attr,
4151         &dev_attr_address.attr,
4152         &dev_attr_cap.attr,
4153         &dev_attr_ecap.attr,
4154         NULL,
4155 };
4156
4157 static struct attribute_group intel_iommu_group = {
4158         .name = "intel-iommu",
4159         .attrs = intel_iommu_attrs,
4160 };
4161
4162 const struct attribute_group *intel_iommu_groups[] = {
4163         &intel_iommu_group,
4164         NULL,
4165 };
4166
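/*
 * Late VT-d initialisation: parse the DMAR/RMRR/ATSR tables, build the
 * domains and page tables via init_dmars(), install intel_dma_ops as the DMA
 * API backend, and register the suspend/resume, device, memory-hotplug and
 * sysfs hooks.
 */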
4167 int __init intel_iommu_init(void)
4168 {
4169         int ret = -ENODEV;
4170         struct dmar_drhd_unit *drhd;
4171         struct intel_iommu *iommu;
4172
4173         /* VT-d is required for a TXT/tboot launch, so enforce that */
4174         force_on = tboot_force_iommu();
4175
4176         if (iommu_init_mempool()) {
4177                 if (force_on)
4178                         panic("tboot: Failed to initialize iommu memory\n");
4179                 return -ENOMEM;
4180         }
4181
4182         down_write(&dmar_global_lock);
4183         if (dmar_table_init()) {
4184                 if (force_on)
4185                         panic("tboot: Failed to initialize DMAR table\n");
4186                 goto out_free_dmar;
4187         }
4188
4189         /*
4190          * Disable translation if already enabled prior to OS handover.
4191          */
4192         for_each_active_iommu(iommu, drhd)
4193                 if (iommu->gcmd & DMA_GCMD_TE)
4194                         iommu_disable_translation(iommu);
4195
4196         if (dmar_dev_scope_init() < 0) {
4197                 if (force_on)
4198                         panic("tboot: Failed to initialize DMAR device scope\n");
4199                 goto out_free_dmar;
4200         }
4201
4202         if (no_iommu || dmar_disabled)
4203                 goto out_free_dmar;
4204
4205         if (list_empty(&dmar_rmrr_units))
4206                 printk(KERN_INFO "DMAR: No RMRR found\n");
4207
4208         if (list_empty(&dmar_atsr_units))
4209                 printk(KERN_INFO "DMAR: No ATSR found\n");
4210
4211         if (dmar_init_reserved_ranges()) {
4212                 if (force_on)
4213                         panic("tboot: Failed to reserve iommu ranges\n");
4214                 goto out_free_reserved_range;
4215         }
4216
4217         init_no_remapping_devices();
4218
4219         ret = init_dmars();
4220         if (ret) {
4221                 if (force_on)
4222                         panic("tboot: Failed to initialize DMARs\n");
4223                 printk(KERN_ERR "IOMMU: dmar init failed\n");
4224                 goto out_free_reserved_range;
4225         }
4226         up_write(&dmar_global_lock);
4227         printk(KERN_INFO
4228         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
4229
4230         init_timer(&unmap_timer);
4231 #ifdef CONFIG_SWIOTLB
4232         swiotlb = 0;
4233 #endif
4234         dma_ops = &intel_dma_ops;
4235
4236         init_iommu_pm_ops();
4237
4238         for_each_active_iommu(iommu, drhd)
4239                 iommu->iommu_dev = iommu_device_create(NULL, iommu,
4240                                                        intel_iommu_groups,
4241                                                        iommu->name);
4242
4243         bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
4244         bus_register_notifier(&pci_bus_type, &device_nb);
4245         if (si_domain && !hw_pass_through)
4246                 register_memory_notifier(&intel_iommu_memory_nb);
4247
4248         intel_iommu_enabled = 1;
4249
4250         return 0;
4251
4252 out_free_reserved_range:
4253         put_iova_domain(&reserved_iova_list);
4254 out_free_dmar:
4255         intel_iommu_free_dmars();
4256         up_write(&dmar_global_lock);
4257         iommu_exit_mempool();
4258         return ret;
4259 }
4260
4261 static int iommu_detach_dev_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4262 {
4263         struct intel_iommu *iommu = opaque;
4264
4265         iommu_detach_dev(iommu, PCI_BUS_NUM(alias), alias & 0xff);
4266         return 0;
4267 }
4268
4269 /*
4270  * NB - intel-iommu lacks any sort of reference counting for the users of
4271  * dependent devices.  If multiple endpoints have intersecting dependent
4272  * devices, unbinding the driver from any one of them will possibly leave
4273  * the others unable to operate.
4274  */
4275 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
4276                                            struct device *dev)
4277 {
4278         if (!iommu || !dev || !dev_is_pci(dev))
4279                 return;
4280
4281         pci_for_each_dma_alias(to_pci_dev(dev), &iommu_detach_dev_cb, iommu);
4282 }
4283
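/*
 * Detach @dev from @domain: unlink its device_domain_info, clear its context
 * entry (and those of its DMA aliases), and if it was the last device behind
 * that IOMMU, drop the domain's reference to the IOMMU as well.
 */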
4284 static void domain_remove_one_dev_info(struct dmar_domain *domain,
4285                                        struct device *dev)
4286 {
4287         struct device_domain_info *info, *tmp;
4288         struct intel_iommu *iommu;
4289         unsigned long flags;
4290         bool found = false;
4291         u8 bus, devfn;
4292
4293         iommu = device_to_iommu(dev, &bus, &devfn);
4294         if (!iommu)
4295                 return;
4296
4297         spin_lock_irqsave(&device_domain_lock, flags);
4298         list_for_each_entry_safe(info, tmp, &domain->devices, link) {
4299                 if (info->iommu == iommu && info->bus == bus &&
4300                     info->devfn == devfn) {
4301                         unlink_domain_info(info);
4302                         spin_unlock_irqrestore(&device_domain_lock, flags);
4303
4304                         iommu_disable_dev_iotlb(info);
4305                         iommu_detach_dev(iommu, info->bus, info->devfn);
4306                         iommu_detach_dependent_devices(iommu, dev);
4307                         free_devinfo_mem(info);
4308
4309                         spin_lock_irqsave(&device_domain_lock, flags);
4310
4311                         if (found)
4312                                 break;
4313                         else
4314                                 continue;
4315                 }
4316
4317                 /* if there are no other devices under the same iommu
4318                  * owned by this domain, clear this iommu in iommu_bmp,
4319                  * and update the iommu count and coherency
4320                  */
4321                 if (info->iommu == iommu)
4322                         found = true;
4323         }
4324
4325         spin_unlock_irqrestore(&device_domain_lock, flags);
4326
4327         if (!found) {
4328                 domain_detach_iommu(domain, iommu);
4329                 if (!domain_type_is_vm_or_si(domain))
4330                         iommu_detach_domain(domain, iommu);
4331         }
4332 }
4333
4334 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4335 {
4336         int adjust_width;
4337
4338         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4339                         DMA_32BIT_PFN);
4340         domain_reserve_special_ranges(domain);
4341
4342         /* calculate AGAW */
4343         domain->gaw = guest_width;
4344         adjust_width = guestwidth_to_adjustwidth(guest_width);
4345         domain->agaw = width_to_agaw(adjust_width);
4346
4347         domain->iommu_coherency = 0;
4348         domain->iommu_snooping = 0;
4349         domain->iommu_superpage = 0;
4350         domain->max_addr = 0;
4351
4352         /* always allocate the top pgd */
4353         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4354         if (!domain->pgd)
4355                 return -ENOMEM;
4356         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4357         return 0;
4358 }
4359
4360 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4361 {
4362         struct dmar_domain *dmar_domain;
4363         struct iommu_domain *domain;
4364
4365         if (type != IOMMU_DOMAIN_UNMANAGED)
4366                 return NULL;
4367
4368         dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
4369         if (!dmar_domain) {
4370                 printk(KERN_ERR
4371                         "intel_iommu_domain_alloc: dmar_domain == NULL\n");
4372                 return NULL;
4373         }
4374         if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4375                 printk(KERN_ERR
4376                         "intel_iommu_domain_alloc: md_domain_init() failed\n");
4377                 domain_exit(dmar_domain);
4378                 return NULL;
4379         }
4380         domain_update_iommu_cap(dmar_domain);
4381
4382         domain = &dmar_domain->domain;
4383         domain->geometry.aperture_start = 0;
4384         domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
4385         domain->geometry.force_aperture = true;
4386
4387         return domain;
4388 }
4389
4390 static void intel_iommu_domain_free(struct iommu_domain *domain)
4391 {
4392         domain_exit(to_dmar_domain(domain));
4393 }
4394
4395 static int intel_iommu_attach_device(struct iommu_domain *domain,
4396                                      struct device *dev)
4397 {
4398         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4399         struct intel_iommu *iommu;
4400         int addr_width;
4401         u8 bus, devfn;
4402
4403         if (device_is_rmrr_locked(dev)) {
4404                 dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4405                 return -EPERM;
4406         }
4407
4408         /* normally dev is not mapped */
4409         if (unlikely(domain_context_mapped(dev))) {
4410                 struct dmar_domain *old_domain;
4411
4412                 old_domain = find_domain(dev);
4413                 if (old_domain) {
4414                         if (domain_type_is_vm_or_si(dmar_domain))
4415                                 domain_remove_one_dev_info(old_domain, dev);
4416                         else
4417                                 domain_remove_dev_info(old_domain);
4418
4419                         if (!domain_type_is_vm_or_si(old_domain) &&
4420                              list_empty(&old_domain->devices))
4421                                 domain_exit(old_domain);
4422                 }
4423         }
4424
4425         iommu = device_to_iommu(dev, &bus, &devfn);
4426         if (!iommu)
4427                 return -ENODEV;
4428
4429         /* check if this iommu agaw is sufficient for max mapped address */
4430         addr_width = agaw_to_width(iommu->agaw);
4431         if (addr_width > cap_mgaw(iommu->cap))
4432                 addr_width = cap_mgaw(iommu->cap);
4433
4434         if (dmar_domain->max_addr > (1LL << addr_width)) {
4435                 printk(KERN_ERR "%s: iommu width (%d) is not "
4436                        "sufficient for the mapped address (%llx)\n",
4437                        __func__, addr_width, dmar_domain->max_addr);
4438                 return -EFAULT;
4439         }
4440         dmar_domain->gaw = addr_width;
4441
4442         /*
4443          * Knock out extra levels of page tables if necessary
4444          */
4445         while (iommu->agaw < dmar_domain->agaw) {
4446                 struct dma_pte *pte;
4447
4448                 pte = dmar_domain->pgd;
4449                 if (dma_pte_present(pte)) {
4450                         dmar_domain->pgd = (struct dma_pte *)
4451                                 phys_to_virt(dma_pte_addr(pte));
4452                         free_pgtable_page(pte);
4453                 }
4454                 dmar_domain->agaw--;
4455         }
4456
4457         return domain_add_dev_info(dmar_domain, dev, CONTEXT_TT_MULTI_LEVEL);
4458 }
4459
4460 static void intel_iommu_detach_device(struct iommu_domain *domain,
4461                                       struct device *dev)
4462 {
4463         domain_remove_one_dev_info(to_dmar_domain(domain), dev);
4464 }
4465
4466 static int intel_iommu_map(struct iommu_domain *domain,
4467                            unsigned long iova, phys_addr_t hpa,
4468                            size_t size, int iommu_prot)
4469 {
4470         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4471         u64 max_addr;
4472         int prot = 0;
4473         int ret;
4474
4475         if (iommu_prot & IOMMU_READ)
4476                 prot |= DMA_PTE_READ;
4477         if (iommu_prot & IOMMU_WRITE)
4478                 prot |= DMA_PTE_WRITE;
4479         if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
4480                 prot |= DMA_PTE_SNP;
4481
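             /*
              * Track the highest address ever mapped in this domain; it is
              * checked again in attach_dev when another IOMMU (with a
              * possibly smaller address width) joins the domain.
              */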
4482         max_addr = iova + size;
4483         if (dmar_domain->max_addr < max_addr) {
4484                 u64 end;
4485
4486                 /* check that the domain's current address width covers this mapping */
4487                 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4488                 if (end < max_addr) {
4489                         printk(KERN_ERR "%s: iommu width (%d) is not "
4490                                "sufficient for the mapped address (%llx)\n",
4491                                __func__, dmar_domain->gaw, max_addr);
4492                         return -EFAULT;
4493                 }
4494                 dmar_domain->max_addr = max_addr;
4495         }
4496         /* Convert size into a count of VTD pages, rounding up if the low
4497            bits of hpa plus the size spill onto another page. */
4498         size = aligned_nrpages(hpa, size);
4499         ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4500                                  hpa >> VTD_PAGE_SHIFT, size, prot);
4501         return ret;
4502 }
4503
4504 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4505                                 unsigned long iova, size_t size)
4506 {
4507         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4508         struct page *freelist = NULL;
4509         struct intel_iommu *iommu;
4510         unsigned long start_pfn, last_pfn;
4511         unsigned int npages;
4512         int iommu_id, num, ndomains, level = 0;
4513
4514         /* Cope with horrid API which requires us to unmap more than the
4515            size argument if it happens to be a large-page mapping. */
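             /*
              * pfn_to_dma_pte() also reports the level of the leaf PTE; a
              * leaf at that level covers VTD_PAGE_SIZE << level_to_offset_bits()
              * bytes (4KiB, 2MiB, 1GiB, ...), so round the request up to that.
              */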
4516         if (!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level))
4517                 BUG();
4518
4519         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4520                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4521
4522         start_pfn = iova >> VTD_PAGE_SHIFT;
4523         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4524
4525         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4526
4527         npages = last_pfn - start_pfn + 1;
4528
4529         for_each_set_bit(iommu_id, dmar_domain->iommu_bmp, g_num_of_iommus) {
4530                 iommu = g_iommus[iommu_id];
4531
4532                 /* Find this domain's ID on this IOMMU: the ID can differ
4533                  * from unit to unit, and the page-selective IOTLB flush
4534                  * needs the local one. */
4535                 ndomains = cap_ndoms(iommu->cap);
4536                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
4537                         if (iommu->domains[num] == dmar_domain)
4538                                 iommu_flush_iotlb_psi(iommu, num, start_pfn,
4539                                                       npages, !freelist, 0);
4540                 }
4541
4542         }
4543
4544         dma_free_pagelist(freelist);
4545
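             /* If this unmap removed the highest mapping, pull max_addr back. */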
4546         if (dmar_domain->max_addr == iova + size)
4547                 dmar_domain->max_addr = iova;
4548
4549         return size;
4550 }
4551
4552 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4553                                             dma_addr_t iova)
4554 {
4555         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4556         struct dma_pte *pte;
4557         int level = 0;
4558         u64 phys = 0;
4559
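             /*
              * Walk down to the leaf PTE for this IOVA; if one exists,
              * dma_pte_addr() gives the physical address it points to,
              * otherwise 0 is returned.
              */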
4560         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4561         if (pte)
4562                 phys = dma_pte_addr(pte);
4563
4564         return phys;
4565 }
4566
4567 static bool intel_iommu_capable(enum iommu_cap cap)
4568 {
4569         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4570                 return domain_update_iommu_snooping(NULL) == 1;
4571         if (cap == IOMMU_CAP_INTR_REMAP)
4572                 return irq_remapping_enabled == 1;
4573
4574         return false;
4575 }
4576
4577 static int intel_iommu_add_device(struct device *dev)
4578 {
4579         struct intel_iommu *iommu;
4580         struct iommu_group *group;
4581         u8 bus, devfn;
4582
4583         iommu = device_to_iommu(dev, &bus, &devfn);
4584         if (!iommu)
4585                 return -ENODEV;
4586
4587         iommu_device_link(iommu->iommu_dev, dev);
4588
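             /*
              * Find (or create) the isolation group for this device and add
              * it; for PCI devices the grouping follows DMA-alias and ACS
              * isolation rules.
              */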
4589         group = iommu_group_get_for_dev(dev);
4590
4591         if (IS_ERR(group))
4592                 return PTR_ERR(group);
4593
4594         iommu_group_put(group);
4595         return 0;
4596 }
4597
4598 static void intel_iommu_remove_device(struct device *dev)
4599 {
4600         struct intel_iommu *iommu;
4601         u8 bus, devfn;
4602
4603         iommu = device_to_iommu(dev, &bus, &devfn);
4604         if (!iommu)
4605                 return;
4606
4607         iommu_group_remove_device(dev);
4608
4609         iommu_device_unlink(iommu->iommu_dev, dev);
4610 }
4611
4612 static const struct iommu_ops intel_iommu_ops = {
4613         .capable        = intel_iommu_capable,
4614         .domain_alloc   = intel_iommu_domain_alloc,
4615         .domain_free    = intel_iommu_domain_free,
4616         .attach_dev     = intel_iommu_attach_device,
4617         .detach_dev     = intel_iommu_detach_device,
4618         .map            = intel_iommu_map,
4619         .unmap          = intel_iommu_unmap,
4620         .map_sg         = default_iommu_map_sg,
4621         .iova_to_phys   = intel_iommu_iova_to_phys,
4622         .add_device     = intel_iommu_add_device,
4623         .remove_device  = intel_iommu_remove_device,
4624         .pgsize_bitmap  = INTEL_IOMMU_PGSIZES,
4625 };
4626
4627 static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
4628 {
4629         /* G4x/GM45 integrated gfx dmar support is totally busted. */
4630         printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
4631         dmar_map_gfx = 0;
4632 }
4633
4634 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
4635 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
4636 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
4637 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
4638 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
4639 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
4640 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
4641
4642 static void quirk_iommu_rwbf(struct pci_dev *dev)
4643 {
4644         /*
4645          * Mobile 4 Series Chipset neglects to set RWBF capability,
4646          * but needs it. Same seems to hold for the desktop versions.
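              * (RWBF is the "required write-buffer flushing" bit in the VT-d
              * capability register; rwbf_quirk makes the driver flush the
              * write buffer explicitly even though the bit is clear.)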
4647          */
4648         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
4649         rwbf_quirk = 1;
4650 }
4651
4652 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4653 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4654 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4655 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4656 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4657 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4658 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4659
4660 #define GGC 0x52
4661 #define GGC_MEMORY_SIZE_MASK    (0xf << 8)
4662 #define GGC_MEMORY_SIZE_NONE    (0x0 << 8)
4663 #define GGC_MEMORY_SIZE_1M      (0x1 << 8)
4664 #define GGC_MEMORY_SIZE_2M      (0x3 << 8)
4665 #define GGC_MEMORY_VT_ENABLED   (0x8 << 8)
4666 #define GGC_MEMORY_SIZE_2M_VT   (0x9 << 8)
4667 #define GGC_MEMORY_SIZE_3M_VT   (0xa << 8)
4668 #define GGC_MEMORY_SIZE_4M_VT   (0xb << 8)
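     /*
      * GGC is the graphics control register in the host bridge's PCI config
      * space; the *_VT values indicate that the BIOS reserved graphics
      * translation (shadow GTT) space for use with VT-d, which the quirk
      * below checks for.
      */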
4669
4670 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4671 {
4672         unsigned short ggc;
4673
4674         if (pci_read_config_word(dev, GGC, &ggc))
4675                 return;
4676
4677         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4678                 printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4679                 dmar_map_gfx = 0;
4680         } else if (dmar_map_gfx) {
4681                 /* we have to ensure the gfx device is idle before we flush */
4682                 printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
4683                 intel_iommu_strict = 1;
4684         }
4685 }
4686 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4687 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4688 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4689 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4690
4691 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4692    ISOCH DMAR unit for the Azalia sound device, but not give it any
4693    TLB entries, which causes it to deadlock. Check for that.  We do
4694    this in a function called from init_dmars(), instead of in a PCI
4695    quirk, because we don't want to print the obnoxious "BIOS broken"
4696    message if VT-d is actually disabled.
4697 */
4698 static void __init check_tylersburg_isoch(void)
4699 {
4700         struct pci_dev *pdev;
4701         uint32_t vtisochctrl;
4702
4703         /* If there's no Azalia in the system anyway, forget it. */
4704         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4705         if (!pdev)
4706                 return;
4707         pci_dev_put(pdev);
4708
4709         /* System Management Registers. Might be hidden, in which case
4710            we can't do the sanity check. But that's OK, because the
4711            known-broken BIOSes _don't_ actually hide it, so far. */
4712         pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4713         if (!pdev)
4714                 return;
4715
4716         if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4717                 pci_dev_put(pdev);
4718                 return;
4719         }
4720
4721         pci_dev_put(pdev);
4722
4723         /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4724         if (vtisochctrl & 1)
4725                 return;
4726
4727         /* Drop all bits other than the number of TLB entries */
4728         vtisochctrl &= 0x1c;
4729
4730         /* If we have the recommended number of TLB entries (16), fine. */
4731         if (vtisochctrl == 0x10)
4732                 return;
4733
4734         /* Zero TLB entries? That's broken; warn and identity-map Azalia. */
4735         if (!vtisochctrl) {
4736                 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4737                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4738                      dmi_get_system_info(DMI_BIOS_VENDOR),
4739                      dmi_get_system_info(DMI_BIOS_VERSION),
4740                      dmi_get_system_info(DMI_PRODUCT_VERSION));
4741                 iommu_identity_mapping |= IDENTMAP_AZALIA;
4742                 return;
4743         }
4744
4745         printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4746                vtisochctrl);
4747 }