Upgrade to 4.4.50-rt62
[kvmfornfv.git] / kernel / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright © 2006-2014 Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * Authors: David Woodhouse <dwmw2@infradead.org>,
14  *          Ashok Raj <ashok.raj@intel.com>,
15  *          Shaohua Li <shaohua.li@intel.com>,
16  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
17  *          Fenghua Yu <fenghua.yu@intel.com>
18  *          Joerg Roedel <jroedel@suse.de>
19  */
20
21 #define pr_fmt(fmt)     "DMAR: " fmt
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/export.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/memory.h>
36 #include <linux/timer.h>
37 #include <linux/io.h>
38 #include <linux/iova.h>
39 #include <linux/iommu.h>
40 #include <linux/intel-iommu.h>
41 #include <linux/syscore_ops.h>
42 #include <linux/tboot.h>
43 #include <linux/dmi.h>
44 #include <linux/pci-ats.h>
45 #include <linux/memblock.h>
46 #include <linux/dma-contiguous.h>
47 #include <linux/crash_dump.h>
48 #include <asm/irq_remapping.h>
49 #include <asm/cacheflush.h>
50 #include <asm/iommu.h>
51
52 #include "irq_remapping.h"
53
54 #define ROOT_SIZE               VTD_PAGE_SIZE
55 #define CONTEXT_SIZE            VTD_PAGE_SIZE
56
57 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
58 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
59 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
60 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
61
62 #define IOAPIC_RANGE_START      (0xfee00000)
63 #define IOAPIC_RANGE_END        (0xfeefffff)
64 #define IOVA_START_ADDR         (0x1000)
65
66 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
67
68 #define MAX_AGAW_WIDTH 64
69 #define MAX_AGAW_PFN_WIDTH      (MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
70
71 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
72 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
73
74 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
75    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
76 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
77                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
78 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
79
80 /* IO virtual address start page frame number */
81 #define IOVA_START_PFN          (1)
82
83 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
84 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
85 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
86
87 /* page table handling */
88 #define LEVEL_STRIDE            (9)
89 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
90
91 /*
92  * This bitmap is used to advertise the page sizes our hardware support
93  * to the IOMMU core, which will then use this information to split
94  * physically contiguous memory regions it is mapping into page sizes
95  * that we support.
96  *
97  * Traditionally the IOMMU core just handed us the mappings directly,
98  * after making sure the size is an order of a 4KiB page and that the
99  * mapping has natural alignment.
100  *
101  * To retain this behavior, we currently advertise that we support
102  * all page sizes that are an order of 4KiB.
103  *
104  * If at some point we'd like to utilize the IOMMU core's new behavior,
105  * we could change this to advertise the real page sizes we support.
106  */
107 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
108
/*
 * Convert an Adjusted Guest Address Width (AGAW) value to the number
 * of page-table levels it implies; AGAW 0 corresponds to a 2-level
 * (30-bit) page table, and each increment adds one level.
 */
static inline int agaw_to_level(int agaw)
{
	int nr_levels = agaw + 2;

	return nr_levels;
}
113
/*
 * AGAW -> guest address width in bits: the 2-level base covers 30 bits
 * and each extra level adds LEVEL_STRIDE (9) bits, capped at
 * MAX_AGAW_WIDTH.
 */
static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}

/* Inverse of agaw_to_width(): smallest AGAW covering 'width' bits. */
static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}
123
/* Bit position within a DMA pfn where the index for 'level' starts. */
static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}

/* Extract the 9-bit page-table index for 'pfn' at the given level. */
static inline int pfn_level_offset(unsigned long pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

/* Mask keeping only the pfn bits above the given level's coverage. */
static inline unsigned long level_mask(int level)
{
	return -1UL << level_to_offset_bits(level);
}

/* Number of pfns covered by a single entry at the given level. */
static inline unsigned long level_size(int level)
{
	return 1UL << level_to_offset_bits(level);
}

/* Round 'pfn' up to the next entry boundary at the given level. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}

/* Pages mapped by one (super)page pte at level 'lvl', capped at the
 * maximum AGAW pfn width. */
static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return  1 << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
153
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	/* PAGE_SHIFT >= VTD_PAGE_SHIFT, so this is a non-negative shift */
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

/* First VT-d pfn backed by the given struct page. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}

/* First VT-d pfn backing the given kernel virtual address. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
173
174 /* global iommu list, set NULL for ignored DMAR units */
175 static struct intel_iommu **g_iommus;
176
177 static void __init check_tylersburg_isoch(void);
178 static int rwbf_quirk;
179
180 /*
181  * set to 1 to panic kernel if can't successfully enable VT-d
182  * (used when kernel is launched w/ TXT)
183  */
184 static int force_on = 0;
185
/*
 * Root-table entry, one per PCI bus (128 bits):
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved (used as the upper context pointer in ECS mode,
 *         see iommu_context_addr())
 */
struct root_entry {
	u64	lo;
	u64	hi;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
197
/*
 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
 * if marked present.
 */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	/* Bit 0 is the present bit; absent entries carry no pointer. */
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}

/*
 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
 * if marked present.
 */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}
/*
 * Context-table entry, one per (bus, devfn):
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

/* Clear the PASID-enable bit (bit 11 of the low word). */
static inline void context_clear_pasid_enable(struct context_entry *context)
{
	context->lo &= ~(1ULL << 11);
}

static inline bool context_pasid_enabled(struct context_entry *context)
{
	return !!(context->lo & (1ULL << 11));
}

/* Tag the entry as copied from pre-existing tables (avail bit 3, hi word);
 * NOTE(review): presumably used when tables are inherited across
 * kexec/kdump (see translation_pre_enabled) — confirm against callers. */
static inline void context_set_copied(struct context_entry *context)
{
	context->hi |= (1ull << 3);
}

static inline bool context_copied(struct context_entry *context)
{
	return !!(context->hi & (1ULL << 3));
}

/* Raw present bit, ignoring the "copied" marker. */
static inline bool __context_present(struct context_entry *context)
{
	return (context->lo & 1);
}

/*
 * A copied entry does not count as present — unless PASID is enabled,
 * in which case only the raw present bit is consulted.
 */
static inline bool context_present(struct context_entry *context)
{
	return context_pasid_enabled(context) ?
	     __context_present(context) :
	     __context_present(context) && !context_copied(context);
}

static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

/* Clear bit 1 (fault processing disable) so faults are reported. */
static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

/* Set the translation type (bits 2-3), clearing any previous value. */
static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

/* Install the page-aligned physical address of the address-space root. */
static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}

/* OR in the 3-bit address width; assumes the field was previously zero. */
static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

/* OR in the 16-bit domain id (bits 8-23); assumes the field was zero. */
static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}

static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
315
/*
 * Second-level page-table entry:
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

/* Physical address stored in the pte (torn-read safe on 32-bit). */
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

/* Present == readable or writable (either of the two low bits set). */
static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

static inline bool dma_pte_superpage(struct dma_pte *pte)
{
	return (pte->val & DMA_PTE_LARGE_PAGE);
}

/* True for the first pte in a table: tables are page sized/aligned. */
static inline int first_pte_in_page(struct dma_pte *pte)
{
	return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
358
359 /*
360  * This domain is a statically identity mapping domain.
361  *      1. This domain creats a static 1:1 mapping to all usable memory.
362  *      2. It maps to each iommu if successful.
363  *      3. Each iommu mapps to this domain if successful.
364  */
365 static struct dmar_domain *si_domain;
366 static int hw_pass_through = 1;
367
368 /*
369  * Domain represents a virtual machine, more than one devices
370  * across iommus may be owned in one domain, e.g. kvm guest.
371  */
372 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 0)
373
374 /* si_domain contains mulitple devices */
375 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 1)
376
377 #define for_each_domain_iommu(idx, domain)                      \
378         for (idx = 0; idx < g_num_of_iommus; idx++)             \
379                 if (domain->iommu_refcnt[idx])
380
/*
 * A DMA remapping domain: one IOVA address space plus the set of
 * devices attached to it, possibly spanning several IOMMU units.
 */
struct dmar_domain {
	int	nid;			/* node id */

	unsigned	iommu_refcnt[DMAR_UNITS_SUPPORTED];
					/* Refcount of devices per iommu */


	u16		iommu_did[DMAR_UNITS_SUPPORTED];
					/* Domain ids per IOMMU. Use u16 since
					 * domain ids are 16 bit wide according
					 * to VT-d spec, section 9.3 */

	struct list_head devices;	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature*/
	int		iommu_count;	/* reference count of iommu */
	int		iommu_superpage;/* Level of superpages supported:
					   0 == 4KiB (no superpages), 1 == 2MiB,
					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
	u64		max_addr;	/* maximum mapped address */

	struct iommu_domain domain;	/* generic domain data structure for
					   iommu core */
};
415
/* PCI domain-device relationship */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	u8 pasid_supported:3;	/* PASID (process address space id) caps */
	u8 pasid_enabled:1;
	u8 pri_supported:1;	/* page request interface */
	u8 pri_enabled:1;
	u8 ats_supported:1;	/* address translation services */
	u8 ats_enabled:1;
	u8 ats_qdep;		/* ATS invalidate queue depth */
	struct device *dev; /* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};
433
/* Reserved Memory Region Reporting unit, parsed from the ACPI DMAR table */
struct dmar_rmrr_unit {
	struct list_head list;		/* list of rmrr units	*/
	struct acpi_dmar_header *hdr;	/* ACPI header		*/
	u64	base_address;		/* reserved base address*/
	u64	end_address;		/* reserved end address */
	struct dmar_dev_scope *devices;	/* target devices */
	int	devices_cnt;		/* target device count */
};

/* ATS (Address Translation Services) reporting unit from the DMAR table */
struct dmar_atsr_unit {
	struct list_head list;		/* list of ATSR units */
	struct acpi_dmar_header *hdr;	/* ACPI header */
	struct dmar_dev_scope *devices;	/* target devices */
	int devices_cnt;		/* target device count */
	u8 include_all:1;		/* include all ports */
};
450
451 static LIST_HEAD(dmar_atsr_units);
452 static LIST_HEAD(dmar_rmrr_units);
453
454 #define for_each_rmrr_units(rmrr) \
455         list_for_each_entry(rmrr, &dmar_rmrr_units, list)
456
457 static void flush_unmaps_timeout(unsigned long data);
458
459 static DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
460
#define HIGH_WATER_MARK 250
/* Batch of unmapped IOVAs whose IOTLB invalidation has been deferred;
 * NOTE(review): presumably drained by flush_unmaps_timeout() or when
 * 'next' reaches HIGH_WATER_MARK — confirm in the flush path. */
struct deferred_flush_tables {
	int next;	/* first free slot in the arrays below */
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
	struct page *freelist[HIGH_WATER_MARK];
};
468
469 static struct deferred_flush_tables *deferred_flush;
470
471 /* bitmap for indexing intel_iommus */
472 static int g_num_of_iommus;
473
474 static DEFINE_SPINLOCK(async_umap_flush_lock);
475 static LIST_HEAD(unmaps_to_do);
476
477 static int timer_on;
478 static long list_size;
479
480 static void domain_exit(struct dmar_domain *domain);
481 static void domain_remove_dev_info(struct dmar_domain *domain);
482 static void dmar_remove_one_dev_info(struct dmar_domain *domain,
483                                      struct device *dev);
484 static void __dmar_remove_one_dev_info(struct device_domain_info *info);
485 static void domain_context_clear(struct intel_iommu *iommu,
486                                  struct device *dev);
487 static int domain_detach_iommu(struct dmar_domain *domain,
488                                struct intel_iommu *iommu);
489
490 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
491 int dmar_disabled = 0;
492 #else
493 int dmar_disabled = 1;
494 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
495
496 int intel_iommu_enabled = 0;
497 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
498
499 static int dmar_map_gfx = 1;
500 static int dmar_forcedac;
501 static int intel_iommu_strict;
502 static int intel_iommu_superpage = 1;
503 static int intel_iommu_ecs = 1;
504 static int intel_iommu_pasid28;
505 static int iommu_identity_mapping;
506
507 #define IDENTMAP_ALL            1
508 #define IDENTMAP_GFX            2
509 #define IDENTMAP_AZALIA         4
510
511 /* Broadwell and Skylake have broken ECS support — normal so-called "second
512  * level" translation of DMA requests-without-PASID doesn't actually happen
513  * unless you also set the NESTE bit in an extended context-entry. Which of
514  * course means that SVM doesn't work because it's trying to do nested
515  * translation of the physical addresses it finds in the process page tables,
516  * through the IOVA->phys mapping found in the "second level" page tables.
517  *
518  * The VT-d specification was retroactively changed to change the definition
519  * of the capability bits and pretend that Broadwell/Skylake never happened...
520  * but unfortunately the wrong bit was changed. It's ECS which is broken, but
521  * for some reason it was the PASID capability bit which was redefined (from
522  * bit 28 on BDW/SKL to bit 40 in future).
523  *
524  * So our test for ECS needs to eschew those implementations which set the old
525  * PASID capabiity bit 28, since those are the ones on which ECS is broken.
526  * Unless we are working around the 'pasid28' limitations, that is, by putting
527  * the device into passthrough mode for normal DMA and thus masking the bug.
528  */
529 #define ecs_enabled(iommu) (intel_iommu_ecs && ecap_ecs(iommu->ecap) && \
530                             (intel_iommu_pasid28 || !ecap_broken_pasid(iommu->ecap)))
531 /* PASID support is thus enabled if ECS is enabled and *either* of the old
532  * or new capability bits are set. */
533 #define pasid_enabled(iommu) (ecs_enabled(iommu) &&                     \
534                               (ecap_pasid(iommu->ecap) || ecap_broken_pasid(iommu->ecap)))
535
536 int intel_iommu_gfx_mapped;
537 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
538
539 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
540 static DEFINE_SPINLOCK(device_domain_lock);
541 static LIST_HEAD(device_domain_list);
542
543 static const struct iommu_ops intel_iommu_ops;
544
/* True if DMA translation was already enabled on this unit when the
 * driver took over (flag set by init_translation_status()). */
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}

static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}

/* Record whether translation is already on (TES bit in global status). */
static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
563
/* Convert a generic 'struct iommu_domain' to the private 'struct dmar_domain' */
static struct dmar_domain *to_dmar_domain(struct iommu_domain *dom)
{
	return container_of(dom, struct dmar_domain, domain);
}
569
570 static int __init intel_iommu_setup(char *str)
571 {
572         if (!str)
573                 return -EINVAL;
574         while (*str) {
575                 if (!strncmp(str, "on", 2)) {
576                         dmar_disabled = 0;
577                         pr_info("IOMMU enabled\n");
578                 } else if (!strncmp(str, "off", 3)) {
579                         dmar_disabled = 1;
580                         pr_info("IOMMU disabled\n");
581                 } else if (!strncmp(str, "igfx_off", 8)) {
582                         dmar_map_gfx = 0;
583                         pr_info("Disable GFX device mapping\n");
584                 } else if (!strncmp(str, "forcedac", 8)) {
585                         pr_info("Forcing DAC for PCI devices\n");
586                         dmar_forcedac = 1;
587                 } else if (!strncmp(str, "strict", 6)) {
588                         pr_info("Disable batched IOTLB flush\n");
589                         intel_iommu_strict = 1;
590                 } else if (!strncmp(str, "sp_off", 6)) {
591                         pr_info("Disable supported super page\n");
592                         intel_iommu_superpage = 0;
593                 } else if (!strncmp(str, "ecs_off", 7)) {
594                         printk(KERN_INFO
595                                 "Intel-IOMMU: disable extended context table support\n");
596                         intel_iommu_ecs = 0;
597                 } else if (!strncmp(str, "pasid28", 7)) {
598                         printk(KERN_INFO
599                                 "Intel-IOMMU: enable pre-production PASID support\n");
600                         intel_iommu_pasid28 = 1;
601                         iommu_identity_mapping |= IDENTMAP_GFX;
602                 }
603
604                 str += strcspn(str, ",");
605                 while (*str == ',')
606                         str++;
607         }
608         return 0;
609 }
610 __setup("intel_iommu=", intel_iommu_setup);
611
612 static struct kmem_cache *iommu_domain_cache;
613 static struct kmem_cache *iommu_devinfo_cache;
614
615 static struct dmar_domain* get_iommu_domain(struct intel_iommu *iommu, u16 did)
616 {
617         struct dmar_domain **domains;
618         int idx = did >> 8;
619
620         domains = iommu->domains[idx];
621         if (!domains)
622                 return NULL;
623
624         return domains[did & 0xff];
625 }
626
627 static void set_iommu_domain(struct intel_iommu *iommu, u16 did,
628                              struct dmar_domain *domain)
629 {
630         struct dmar_domain **domains;
631         int idx = did >> 8;
632
633         if (!iommu->domains[idx]) {
634                 size_t size = 256 * sizeof(struct dmar_domain *);
635                 iommu->domains[idx] = kzalloc(size, GFP_ATOMIC);
636         }
637
638         domains = iommu->domains[idx];
639         if (WARN_ON(!domains))
640                 return;
641         else
642                 domains[did & 0xff] = domain;
643 }
644
/* Allocate one zeroed page on @node for use as a VT-d page table;
 * returns its virtual address or NULL (GFP_ATOMIC, may be called with
 * locks held). */
static inline void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}

/* Free a page obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
660
661 static inline void *alloc_domain_mem(void)
662 {
663         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
664 }
665
666 static void free_domain_mem(void *vaddr)
667 {
668         kmem_cache_free(iommu_domain_cache, vaddr);
669 }
670
671 static inline void * alloc_devinfo_mem(void)
672 {
673         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
674 }
675
676 static inline void free_devinfo_mem(void *vaddr)
677 {
678         kmem_cache_free(iommu_devinfo_cache, vaddr);
679 }
680
/* True for VM (virtual-machine / multi-device) domains. */
static inline int domain_type_is_vm(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE;
}

/* True for the static identity-mapping (si) domain. */
static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_STATIC_IDENTITY;
}

static inline int domain_type_is_vm_or_si(struct dmar_domain *domain)
{
	return domain->flags & (DOMAIN_FLAG_VIRTUAL_MACHINE |
				DOMAIN_FLAG_STATIC_IDENTITY);
}

/* True if 'pfn' is addressable within the domain's adjusted guest
 * address width. */
static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
704
/*
 * Find the largest AGAW supported by @iommu that does not exceed the
 * AGAW for @max_gaw, by scanning the SAGAW capability bitmap downwards.
 * Returns -1 if the hardware supports none of the candidate widths.
 */
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
{
	unsigned long sagaw;
	int agaw = -1;

	sagaw = cap_sagaw(iommu->cap);
	for (agaw = width_to_agaw(max_gaw);
	     agaw >= 0; agaw--) {
		if (test_bit(agaw, &sagaw))
			break;
	}

	return agaw;
}

/*
 * Calculate max SAGAW for each iommu.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, use a default agaw, and
 * get a supported less agaw for iommus that don't support the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
737
/* This function only returns a single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain_type_is_vm_or_si(domain));

	/* Take the first IOMMU with a non-zero refcount in this domain;
	 * if none, iommu_id ends up == g_num_of_iommus and fails below. */
	for_each_domain_iommu(iommu_id, domain)
		break;

	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
753
/*
 * Recompute domain->iommu_coherency: 1 only if every IOMMU the domain
 * is attached to (or, when it is attached to none, every active IOMMU)
 * reports coherent page-walk capability.
 */
static void domain_update_iommu_coherency(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	bool found = false;
	int i;

	domain->iommu_coherency = 1;

	for_each_domain_iommu(i, domain) {
		found = true;
		if (!ecap_coherent(g_iommus[i]->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	if (found)
		return;

	/* No hardware attached; use lowest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (!ecap_coherent(iommu->ecap)) {
			domain->iommu_coherency = 0;
			break;
		}
	}
	rcu_read_unlock();
}
783
784 static int domain_update_iommu_snooping(struct intel_iommu *skip)
785 {
786         struct dmar_drhd_unit *drhd;
787         struct intel_iommu *iommu;
788         int ret = 1;
789
790         rcu_read_lock();
791         for_each_active_iommu(iommu, drhd) {
792                 if (iommu != skip) {
793                         if (!ecap_sc_support(iommu->ecap)) {
794                                 ret = 0;
795                                 break;
796                         }
797                 }
798         }
799         rcu_read_unlock();
800
801         return ret;
802 }
803
/*
 * Compute the superpage level commonly supported by all active IOMMUs
 * except @skip: 0 == 4KiB only, 1 == 2MiB, 2 == 1GiB, and so on
 * (fls() of the intersected capability mask). Returns 0 when superpages
 * are disabled on the command line.
 */
static int domain_update_iommu_superpage(struct intel_iommu *skip)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int mask = 0xf;

	if (!intel_iommu_superpage) {
		return 0;
	}

	/* set iommu_superpage to the smallest common denominator */
	rcu_read_lock();
	for_each_active_iommu(iommu, drhd) {
		if (iommu != skip) {
			mask &= cap_super_page_val(iommu->cap);
			if (!mask)
				break;
		}
	}
	rcu_read_unlock();

	return fls(mask);
}
827
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	/* Recompute all lowest-common-denominator capability fields. */
	domain_update_iommu_coherency(domain);
	domain->iommu_snooping = domain_update_iommu_snooping(NULL);
	domain->iommu_superpage = domain_update_iommu_superpage(NULL);
}
835
/*
 * Return a pointer to the context entry for (bus, devfn), allocating
 * the context table on demand when @alloc is set.
 *
 * In ECS mode extended context entries are twice as large, so a bus's
 * 256 devfns are split across two tables: the root entry's low half
 * covers devfn 0x00-0x7f and the high half covers devfn 0x80-0xff
 * (hence the devfn rebase and doubling below).
 *
 * Returns NULL when no table exists and !alloc, or on allocation
 * failure. NOTE(review): callers appear to serialize access via
 * iommu->lock — confirm before adding new call sites.
 */
static inline struct context_entry *iommu_context_addr(struct intel_iommu *iommu,
						       u8 bus, u8 devfn, int alloc)
{
	struct root_entry *root = &iommu->root_entry[bus];
	struct context_entry *context;
	u64 *entry;

	entry = &root->lo;
	if (ecs_enabled(iommu)) {
		if (devfn >= 0x80) {
			devfn -= 0x80;
			entry = &root->hi;
		}
		devfn *= 2;
	}
	/* Bit 0 of the root entry half is its present bit. */
	if (*entry & 1)
		context = phys_to_virt(*entry & VTD_PAGE_MASK);
	else {
		unsigned long phy_addr;
		if (!alloc)
			return NULL;

		context = alloc_pgtable_page(iommu->node);
		if (!context)
			return NULL;

		/* Flush the new table before publishing it in the root entry. */
		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		*entry = phy_addr | 1;
		__iommu_flush_cache(iommu, entry, sizeof(*entry));
	}
	return &context[devfn];
}
869
870 static int iommu_dummy(struct device *dev)
871 {
872         return dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
873 }
874
875 static struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
876 {
877         struct dmar_drhd_unit *drhd = NULL;
878         struct intel_iommu *iommu;
879         struct device *tmp;
880         struct pci_dev *ptmp, *pdev = NULL;
881         u16 segment = 0;
882         int i;
883
884         if (iommu_dummy(dev))
885                 return NULL;
886
887         if (dev_is_pci(dev)) {
888                 struct pci_dev *pf_pdev;
889
890                 pdev = to_pci_dev(dev);
891                 /* VFs aren't listed in scope tables; we need to look up
892                  * the PF instead to find the IOMMU. */
893                 pf_pdev = pci_physfn(pdev);
894                 dev = &pf_pdev->dev;
895                 segment = pci_domain_nr(pdev->bus);
896         } else if (has_acpi_companion(dev))
897                 dev = &ACPI_COMPANION(dev)->dev;
898
899         rcu_read_lock();
900         for_each_active_iommu(iommu, drhd) {
901                 if (pdev && segment != drhd->segment)
902                         continue;
903
904                 for_each_active_dev_scope(drhd->devices,
905                                           drhd->devices_cnt, i, tmp) {
906                         if (tmp == dev) {
907                                 /* For a VF use its original BDF# not that of the PF
908                                  * which we used for the IOMMU lookup. Strictly speaking
909                                  * we could do this for all PCI devices; we only need to
910                                  * get the BDF# from the scope table for ACPI matches. */
911                                 if (pdev->is_virtfn)
912                                         goto got_pdev;
913
914                                 *bus = drhd->devices[i].bus;
915                                 *devfn = drhd->devices[i].devfn;
916                                 goto out;
917                         }
918
919                         if (!pdev || !dev_is_pci(tmp))
920                                 continue;
921
922                         ptmp = to_pci_dev(tmp);
923                         if (ptmp->subordinate &&
924                             ptmp->subordinate->number <= pdev->bus->number &&
925                             ptmp->subordinate->busn_res.end >= pdev->bus->number)
926                                 goto got_pdev;
927                 }
928
929                 if (pdev && drhd->include_all) {
930                 got_pdev:
931                         *bus = pdev->bus->number;
932                         *devfn = pdev->devfn;
933                         goto out;
934                 }
935         }
936         iommu = NULL;
937  out:
938         rcu_read_unlock();
939
940         return iommu;
941 }
942
943 static void domain_flush_cache(struct dmar_domain *domain,
944                                void *addr, int size)
945 {
946         if (!domain->iommu_coherency)
947                 clflush_cache_range(addr, size);
948 }
949
950 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
951 {
952         struct context_entry *context;
953         int ret = 0;
954         unsigned long flags;
955
956         spin_lock_irqsave(&iommu->lock, flags);
957         context = iommu_context_addr(iommu, bus, devfn, 0);
958         if (context)
959                 ret = context_present(context);
960         spin_unlock_irqrestore(&iommu->lock, flags);
961         return ret;
962 }
963
964 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
965 {
966         struct context_entry *context;
967         unsigned long flags;
968
969         spin_lock_irqsave(&iommu->lock, flags);
970         context = iommu_context_addr(iommu, bus, devfn, 0);
971         if (context) {
972                 context_clear_entry(context);
973                 __iommu_flush_cache(iommu, context, sizeof(*context));
974         }
975         spin_unlock_irqrestore(&iommu->lock, flags);
976 }
977
978 static void free_context_table(struct intel_iommu *iommu)
979 {
980         int i;
981         unsigned long flags;
982         struct context_entry *context;
983
984         spin_lock_irqsave(&iommu->lock, flags);
985         if (!iommu->root_entry) {
986                 goto out;
987         }
988         for (i = 0; i < ROOT_ENTRY_NR; i++) {
989                 context = iommu_context_addr(iommu, i, 0, 0);
990                 if (context)
991                         free_pgtable_page(context);
992
993                 if (!ecs_enabled(iommu))
994                         continue;
995
996                 context = iommu_context_addr(iommu, i, 0x80, 0);
997                 if (context)
998                         free_pgtable_page(context);
999
1000         }
1001         free_pgtable_page(iommu->root_entry);
1002         iommu->root_entry = NULL;
1003 out:
1004         spin_unlock_irqrestore(&iommu->lock, flags);
1005 }
1006
/*
 * Walk (and build, where missing) the domain's page table for @pfn down
 * to *target_level, allocating intermediate table pages on demand.
 *
 * On entry, *target_level is the level whose PTE the caller wants
 * (1 = 4KiB leaf); 0 means "stop wherever the walk naturally ends"
 * (first superpage or non-present entry).  For a 0 request the level
 * actually reached is written back through *target_level.
 *
 * Returns the PTE pointer, or NULL if @pfn exceeds the domain's
 * addressable range or a table-page allocation fails.  Concurrent
 * builders are tolerated via the cmpxchg below; no lock is taken.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn, int *target_level)
{
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
        int offset;

        BUG_ON(!domain->pgd);

        if (!domain_pfn_supported(domain, pfn))
                /* Address beyond IOMMU's addressing capabilities. */
                return NULL;

        parent = domain->pgd;

        while (1) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                /* Level-0 request: stop at a superpage or a hole. */
                if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
                        break;
                if (level == *target_level)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page(domain->nid);

                        if (!tmp_page)
                                return NULL;

                        /* Flush the new page before publishing it so the
                         * hardware never walks stale cache lines. */
                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        if (cmpxchg64(&pte->val, 0ULL, pteval))
                                /* Someone else set it while we were thinking; use theirs. */
                                free_pgtable_page(tmp_page);
                        else
                                domain_flush_cache(domain, pte, sizeof(*pte));
                }
                if (level == 1)
                        break;

                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }

        if (!*target_level)
                *target_level = level;

        return pte;
}
1060
1061
/*
 * Return the PTE for @pfn at exactly @level, walking from the top of
 * the page table.  If the walk ends early — at a hole or at a superpage
 * entry above the requested level — the level where it stopped is
 * reported through *large_page; a superpage entry is returned, a hole
 * yields NULL.
 */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
                                         unsigned long pfn,
                                         int level, int *large_page)
{
        struct dma_pte *parent, *pte = NULL;
        int total = agaw_to_level(domain->agaw);
        int offset;

        parent = domain->pgd;
        while (level <= total) {
                offset = pfn_level_offset(pfn, total);
                pte = &parent[offset];
                if (level == total)
                        return pte;

                /* Hole: nothing mapped below this point. */
                if (!dma_pte_present(pte)) {
                        *large_page = total;
                        break;
                }

                /* Superpage terminates the walk above the requested level. */
                if (dma_pte_superpage(pte)) {
                        *large_page = total;
                        return pte;
                }

                parent = phys_to_virt(dma_pte_addr(pte));
                total--;
        }
        return NULL;
}
1093
/* Clear the leaf (last-level) PTEs covering [start_pfn, last_pfn].
 * Only entries are cleared — table pages are not freed — and the caller
 * must follow up with an IOTLB flush. */
static void dma_pte_clear_range(struct dmar_domain *domain,
                                unsigned long start_pfn,
                                unsigned long last_pfn)
{
        unsigned int large_page = 1;
        struct dma_pte *first_pte, *pte;

        BUG_ON(!domain_pfn_supported(domain, start_pfn));
        BUG_ON(!domain_pfn_supported(domain, last_pfn));
        BUG_ON(start_pfn > last_pfn);

        /* we don't need lock here; nobody else touches the iova range */
        do {
                large_page = 1;
                first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
                if (!pte) {
                        /* Hole: skip to the next boundary of the level at
                         * which the lookup stopped. */
                        start_pfn = align_to_level(start_pfn + 1, large_page + 1);
                        continue;
                }
                /* Clear consecutive PTEs within the current table page;
                 * large_page > 1 means each entry maps a superpage. */
                do {
                        dma_clear_pte(pte);
                        start_pfn += lvl_to_nr_pages(large_page);
                        pte++;
                } while (start_pfn <= last_pfn && !first_pte_in_page(pte));

                /* One cache flush covers the whole run of cleared PTEs. */
                domain_flush_cache(domain, first_pte,
                                   (void *)pte - (void *)first_pte);

        } while (start_pfn && start_pfn <= last_pfn);
}
1125
1126 static void dma_pte_free_level(struct dmar_domain *domain, int level,
1127                                struct dma_pte *pte, unsigned long pfn,
1128                                unsigned long start_pfn, unsigned long last_pfn)
1129 {
1130         pfn = max(start_pfn, pfn);
1131         pte = &pte[pfn_level_offset(pfn, level)];
1132
1133         do {
1134                 unsigned long level_pfn;
1135                 struct dma_pte *level_pte;
1136
1137                 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1138                         goto next;
1139
1140                 level_pfn = pfn & level_mask(level - 1);
1141                 level_pte = phys_to_virt(dma_pte_addr(pte));
1142
1143                 if (level > 2)
1144                         dma_pte_free_level(domain, level - 1, level_pte,
1145                                            level_pfn, start_pfn, last_pfn);
1146
1147                 /* If range covers entire pagetable, free it */
1148                 if (!(start_pfn > level_pfn ||
1149                       last_pfn < level_pfn + level_size(level) - 1)) {
1150                         dma_clear_pte(pte);
1151                         domain_flush_cache(domain, pte, sizeof(*pte));
1152                         free_pgtable_page(level_pte);
1153                 }
1154 next:
1155                 pfn += level_size(level);
1156         } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1157 }
1158
/* Free page table pages for [start_pfn, last_pfn]: first clear the leaf
 * PTEs, then recursively free fully-covered directory pages, and drop
 * the pgd itself if the range spans the whole domain.  The caller must
 * flush the IOTLB afterwards. */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
                                   unsigned long start_pfn,
                                   unsigned long last_pfn)
{
        BUG_ON(!domain_pfn_supported(domain, start_pfn));
        BUG_ON(!domain_pfn_supported(domain, last_pfn));
        BUG_ON(start_pfn > last_pfn);

        dma_pte_clear_range(domain, start_pfn, last_pfn);

        /* We don't need lock here; nobody else touches the iova range */
        dma_pte_free_level(domain, agaw_to_level(domain->agaw),
                           domain->pgd, 0, start_pfn, last_pfn);

        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                free_pgtable_page(domain->pgd);
                domain->pgd = NULL;
        }
}
1180
/* When a page at a given level is being unlinked from its parent, we don't
   need to *modify* it at all. All we need to do is make a list of all the
   pages which can be freed just as soon as we've flushed the IOTLB and we
   know the hardware page-walk will no longer touch them.
   The 'pte' argument is the *parent* PTE, pointing to the page that is to
   be freed. */
static struct page *dma_pte_list_pagetables(struct dmar_domain *domain,
                                            int level, struct dma_pte *pte,
                                            struct page *freelist)
{
        struct page *pg;

        /* Chain the page pointed to by @pte onto the freelist via the
         * struct page's unused ->freelist field. */
        pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
        pg->freelist = freelist;
        freelist = pg;

        if (level == 1)
                return freelist;

        /* Recurse into every present, non-superpage child entry of the
         * page just listed. */
        pte = page_address(pg);
        do {
                if (dma_pte_present(pte) && !dma_pte_superpage(pte))
                        freelist = dma_pte_list_pagetables(domain, level - 1,
                                                           pte, freelist);
                pte++;
        } while (!first_pte_in_page(pte));

        return freelist;
}
1210
/*
 * Clear PTEs at @level covering [start_pfn, last_pfn], collecting the
 * page-table pages of entirely-covered subtrees onto @freelist rather
 * than freeing them immediately (they may still be cached by the
 * hardware walker until the IOTLB flush).  Returns the grown freelist.
 */
static struct page *dma_pte_clear_level(struct dmar_domain *domain, int level,
                                        struct dma_pte *pte, unsigned long pfn,
                                        unsigned long start_pfn,
                                        unsigned long last_pfn,
                                        struct page *freelist)
{
        struct dma_pte *first_pte = NULL, *last_pte = NULL;

        pfn = max(start_pfn, pfn);
        pte = &pte[pfn_level_offset(pfn, level)];

        do {
                unsigned long level_pfn;

                if (!dma_pte_present(pte))
                        goto next;

                level_pfn = pfn & level_mask(level);

                /* If range covers entire pagetable, free it */
                if (start_pfn <= level_pfn &&
                    last_pfn >= level_pfn + level_size(level) - 1) {
                        /* These subordinate page tables are going away entirely. Don't
                           bother to clear them; we're just going to *free* them. */
                        if (level > 1 && !dma_pte_superpage(pte))
                                freelist = dma_pte_list_pagetables(domain, level - 1, pte, freelist);

                        dma_clear_pte(pte);
                        /* Track the span of cleared entries for one flush. */
                        if (!first_pte)
                                first_pte = pte;
                        last_pte = pte;
                } else if (level > 1) {
                        /* Recurse down into a level that isn't *entirely* obsolete */
                        freelist = dma_pte_clear_level(domain, level - 1,
                                                       phys_to_virt(dma_pte_addr(pte)),
                                                       level_pfn, start_pfn, last_pfn,
                                                       freelist);
                }
next:
                pfn += level_size(level);
        } while (!first_pte_in_page(++pte) && pfn <= last_pfn);

        /* Flush the whole run of cleared entries in one go. */
        if (first_pte)
                domain_flush_cache(domain, first_pte,
                                   (void *)++last_pte - (void *)first_pte);

        return freelist;
}
1259
/* We can't just free the pages because the IOMMU may still be walking
   the page tables, and may have cached the intermediate levels. The
   pages can only be freed after the IOTLB flush has been done. */
static struct page *domain_unmap(struct dmar_domain *domain,
                                 unsigned long start_pfn,
                                 unsigned long last_pfn)
{
        struct page *freelist = NULL;

        BUG_ON(!domain_pfn_supported(domain, start_pfn));
        BUG_ON(!domain_pfn_supported(domain, last_pfn));
        BUG_ON(start_pfn > last_pfn);

        /* we don't need lock here; nobody else touches the iova range */
        freelist = dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
                                       domain->pgd, 0, start_pfn, last_pfn, NULL);

        /* free pgd: if the whole domain range is gone, the pgd page
         * itself joins the freelist and the domain loses its table. */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                struct page *pgd_page = virt_to_page(domain->pgd);
                pgd_page->freelist = freelist;
                freelist = pgd_page;

                domain->pgd = NULL;
        }

        return freelist;
}
1288
1289 static void dma_free_pagelist(struct page *freelist)
1290 {
1291         struct page *pg;
1292
1293         while ((pg = freelist)) {
1294                 freelist = pg->freelist;
1295                 free_pgtable_page(page_address(pg));
1296         }
1297 }
1298
1299 /* iommu handling */
1300 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1301 {
1302         struct root_entry *root;
1303         unsigned long flags;
1304
1305         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1306         if (!root) {
1307                 pr_err("Allocating root entry for %s failed\n",
1308                         iommu->name);
1309                 return -ENOMEM;
1310         }
1311
1312         __iommu_flush_cache(iommu, root, ROOT_SIZE);
1313
1314         spin_lock_irqsave(&iommu->lock, flags);
1315         iommu->root_entry = root;
1316         spin_unlock_irqrestore(&iommu->lock, flags);
1317
1318         return 0;
1319 }
1320
/*
 * Program the root-table physical address into DMAR_RTADDR_REG and
 * issue the Set Root Table Pointer command, spinning until the
 * hardware acknowledges via the RTPS status bit.  With extended
 * context support the RTT bit marks the table as an extended root
 * table.  Runs under the raw register lock with IRQs off.
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
        u64 addr;
        u32 sts;
        unsigned long flag;

        addr = virt_to_phys(iommu->root_entry);
        if (ecs_enabled(iommu))
                addr |= DMA_RTADDR_RTT;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);

        writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware complete it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (sts & DMA_GSTS_RTPS), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1342
/*
 * Flush the IOMMU's internal write buffer, waiting for the WBFS status
 * bit to clear.  A no-op unless the hardware requires it (RWBF
 * capability) or the rwbf quirk is set for this platform.
 */
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
        u32 val;
        unsigned long flag;

        if (!rwbf_quirk && !cap_rwbf(iommu->cap))
                return;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware complete it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (!(val & DMA_GSTS_WBFS)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1360
/* Invalidate context-cache entries at the requested granularity */
/*
 * Issue a context-cache invalidation through DMAR_CCMD_REG and spin
 * until the ICC bit clears.  @type selects the granularity: global,
 * per-domain (@did), or per-device (@did + @source_id masked by
 * @function_mask).  Runs under the raw register lock with IRQs off.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
                                  u16 did, u16 source_id, u8 function_mask,
                                  u64 type)
{
        u64 val = 0;
        unsigned long flag;

        switch (type) {
        case DMA_CCMD_GLOBAL_INVL:
                val = DMA_CCMD_GLOBAL_INVL;
                break;
        case DMA_CCMD_DOMAIN_INVL:
                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
                break;
        case DMA_CCMD_DEVICE_INVL:
                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
                break;
        default:
                BUG();
        }
        /* ICC kicks off the invalidation; hardware clears it when done. */
        val |= DMA_CCMD_ICC;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

        /* Make sure hardware complete it */
        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
                dmar_readq, (!(val & DMA_CCMD_ICC)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1394
/* Invalidate IOTLB entries at the requested granularity */
/*
 * Issue a register-based IOTLB invalidation of the given @type: global,
 * domain-selective (@did), or page-selective (@did, @addr and
 * @size_order, where @addr may carry the IH bit).  Spins until the
 * hardware clears the IVT bit, then sanity-checks the granularity the
 * hardware reports it actually performed.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
                                u64 addr, unsigned int size_order, u64 type)
{
        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
        u64 val = 0, val_iva = 0;
        unsigned long flag;

        switch (type) {
        case DMA_TLB_GLOBAL_FLUSH:
                /* global flush doesn't need set IVA_REG */
                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
                break;
        case DMA_TLB_DSI_FLUSH:
                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                break;
        case DMA_TLB_PSI_FLUSH:
                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                /* IH bit is passed in as part of address */
                val_iva = size_order | addr;
                break;
        default:
                BUG();
        }
        /* Note: set drain read/write */
#if 0
        /*
         * This is probably to be super secure.. Looks like we can
         * ignore it without any impact.
         */
        if (cap_read_drain(iommu->cap))
                val |= DMA_TLB_READ_DRAIN;
#endif
        if (cap_write_drain(iommu->cap))
                val |= DMA_TLB_WRITE_DRAIN;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        /* Note: Only uses first TLB reg currently */
        if (val_iva)
                dmar_writeq(iommu->reg + tlb_offset, val_iva);
        dmar_writeq(iommu->reg + tlb_offset + 8, val);

        /* Make sure hardware complete it */
        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
                dmar_readq, (!(val & DMA_TLB_IVT)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

        /* check IOTLB invalidation granularity */
        if (DMA_TLB_IAIG(val) == 0)
                pr_err("Flush IOTLB failed\n");
        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
                pr_debug("TLB flush request %Lx, actual %Lx\n",
                        (unsigned long long)DMA_TLB_IIRG(type),
                        (unsigned long long)DMA_TLB_IAIG(val));
}
1451
1452 static struct device_domain_info *
1453 iommu_support_dev_iotlb (struct dmar_domain *domain, struct intel_iommu *iommu,
1454                          u8 bus, u8 devfn)
1455 {
1456         struct device_domain_info *info;
1457
1458         assert_spin_locked(&device_domain_lock);
1459
1460         if (!iommu->qi)
1461                 return NULL;
1462
1463         list_for_each_entry(info, &domain->devices, link)
1464                 if (info->iommu == iommu && info->bus == bus &&
1465                     info->devfn == devfn) {
1466                         if (info->ats_supported && info->dev)
1467                                 return info;
1468                         break;
1469                 }
1470
1471         return NULL;
1472 }
1473
/*
 * Enable device-side DMA features (PASID, PRI, ATS) on the PCI device
 * behind @info, recording which were successfully enabled in the
 * corresponding info->*_enabled flags.  Tolerates a NULL @info and
 * non-PCI devices.
 */
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
{
        struct pci_dev *pdev;

        if (!info || !dev_is_pci(info->dev))
                return;

        pdev = to_pci_dev(info->dev);

#ifdef CONFIG_INTEL_IOMMU_SVM
        /* The PCIe spec, in its wisdom, declares that the behaviour of
           the device if you enable PASID support after ATS support is
           undefined. So always enable PASID support on devices which
           have it, even if we can't yet know if we're ever going to
           use it. */
        if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
                info->pasid_enabled = 1;

        if (info->pri_supported && !pci_reset_pri(pdev) && !pci_enable_pri(pdev, 32))
                info->pri_enabled = 1;
#endif
        /* ATS: enable the device IOTLB and remember its queue depth for
         * later invalidation requests. */
        if (info->ats_supported && !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
                info->ats_enabled = 1;
                info->ats_qdep = pci_ats_queue_depth(pdev);
        }
}
1500
1501 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1502 {
1503         struct pci_dev *pdev;
1504
1505         if (!dev_is_pci(info->dev))
1506                 return;
1507
1508         pdev = to_pci_dev(info->dev);
1509
1510         if (info->ats_enabled) {
1511                 pci_disable_ats(pdev);
1512                 info->ats_enabled = 0;
1513         }
1514 #ifdef CONFIG_INTEL_IOMMU_SVM
1515         if (info->pri_enabled) {
1516                 pci_disable_pri(pdev);
1517                 info->pri_enabled = 0;
1518         }
1519         if (info->pasid_enabled) {
1520                 pci_disable_pasid(pdev);
1521                 info->pasid_enabled = 0;
1522         }
1523 #endif
1524 }
1525
/*
 * Send a device-IOTLB (ATS) invalidation for [addr, addr + 2^mask pages)
 * to every device in @domain that has ATS enabled, using each device's
 * recorded invalidation queue depth.  Iterates the device list under
 * device_domain_lock.
 */
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
                                  u64 addr, unsigned mask)
{
        u16 sid, qdep;
        unsigned long flags;
        struct device_domain_info *info;

        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
                if (!info->ats_enabled)
                        continue;

                /* Source-id is bus:devfn packed into 16 bits. */
                sid = info->bus << 8 | info->devfn;
                qdep = info->ats_qdep;
                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);
}
1544
/*
 * Page-selective IOTLB flush for @pages pages starting at @pfn in
 * @domain on @iommu.  @ih requests invalidation-hint semantics (bit 6
 * of the address); @map indicates the flush is for a newly-created
 * mapping (relevant only in caching mode).
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
                                  struct dmar_domain *domain,
                                  unsigned long pfn, unsigned int pages,
                                  int ih, int map)
{
        /* PSI needs a power-of-two page count, expressed as an order. */
        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
        u16 did = domain->iommu_did[iommu->seq_id];

        BUG_ON(pages == 0);

        if (ih)
                ih = 1 << 6;
        /*
         * Fallback to domain selective flush if no PSI support or the size is
         * too big.
         * PSI requires page size to be 2 ^ x, and the base address is naturally
         * aligned to the size
         */
        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
                iommu->flush.flush_iotlb(iommu, did, 0, 0,
                                                DMA_TLB_DSI_FLUSH);
        else
                iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
                                                DMA_TLB_PSI_FLUSH);

        /*
         * In caching mode, changes of pages from non-present to present require
         * flush. However, device IOTLB doesn't need to be flushed in this case.
         */
        if (!cap_caching_mode(iommu->cap) || !map)
                iommu_flush_dev_iotlb(get_iommu_domain(iommu, did),
                                      addr, mask);
}
1579
1580 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1581 {
1582         u32 pmen;
1583         unsigned long flags;
1584
1585         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1586         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1587         pmen &= ~DMA_PMEN_EPM;
1588         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1589
1590         /* wait for the protected region status bit to clear */
1591         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1592                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1593
1594         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1595 }
1596
/*
 * Set the Translation Enable bit in the global command register and
 * wait for the TES status bit to confirm.  The TE bit is also cached in
 * iommu->gcmd so later command writes preserve it.
 */
static void iommu_enable_translation(struct intel_iommu *iommu)
{
        u32 sts;
        unsigned long flags;

        raw_spin_lock_irqsave(&iommu->register_lock, flags);
        iommu->gcmd |= DMA_GCMD_TE;
        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware complete it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (sts & DMA_GSTS_TES), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1612
/*
 * Clear the Translation Enable bit and wait until the hardware reports
 * translation disabled (TES status bit clear).  Mirrors
 * iommu_enable_translation().
 */
static void iommu_disable_translation(struct intel_iommu *iommu)
{
        u32 sts;
        unsigned long flag;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        iommu->gcmd &= ~DMA_GCMD_TE;
        writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

        /* Make sure hardware complete it */
        IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
                      readl, (!(sts & DMA_GSTS_TES)), sts);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1628
1629
/*
 * Allocate per-IOMMU domain bookkeeping: the domain-id bitmap and the
 * two-level domain pointer table (a top-level array of
 * (ndomains >> 8) + 1 slots, each pointing at a page of 256 domain
 * pointers; only slot 0 is populated here, the rest on demand).
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int iommu_init_domains(struct intel_iommu *iommu)
{
        u32 ndomains, nlongs;
        size_t size;

        ndomains = cap_ndoms(iommu->cap);
        pr_debug("%s: Number of Domains supported <%d>\n",
                 iommu->name, ndomains);
        nlongs = BITS_TO_LONGS(ndomains);

        spin_lock_init(&iommu->lock);

        iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
        if (!iommu->domain_ids) {
                pr_err("%s: Allocating domain id array failed\n",
                       iommu->name);
                return -ENOMEM;
        }

        size = ((ndomains >> 8) + 1) * sizeof(struct dmar_domain **);
        iommu->domains = kzalloc(size, GFP_KERNEL);

        if (iommu->domains) {
                size = 256 * sizeof(struct dmar_domain *);
                iommu->domains[0] = kzalloc(size, GFP_KERNEL);
        }

        /* Either allocation failing unwinds both. */
        if (!iommu->domains || !iommu->domains[0]) {
                pr_err("%s: Allocating domain array failed\n",
                       iommu->name);
                kfree(iommu->domain_ids);
                kfree(iommu->domains);
                iommu->domain_ids = NULL;
                iommu->domains    = NULL;
                return -ENOMEM;
        }



        /*
         * If Caching mode is set, then invalid translations are tagged
         * with domain-id 0, hence we need to pre-allocate it. We also
         * use domain-id 0 as a marker for non-allocated domain-id, so
         * make sure it is not used for a real domain.
         */
        set_bit(0, iommu->domain_ids);

        return 0;
}
1679
/*
 * Detach every device bound to @iommu and turn translation off.  For
 * non-VM/SI domains the exit must happen outside device_domain_lock
 * (domain_exit() takes it), so the lock is dropped and the list walk
 * restarted from scratch after each such teardown.
 */
static void disable_dmar_iommu(struct intel_iommu *iommu)
{
        struct device_domain_info *info, *tmp;
        unsigned long flags;

        if (!iommu->domains || !iommu->domain_ids)
                return;

again:
        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry_safe(info, tmp, &device_domain_list, global) {
                struct dmar_domain *domain;

                if (info->iommu != iommu)
                        continue;

                if (!info->dev || !info->domain)
                        continue;

                domain = info->domain;

                __dmar_remove_one_dev_info(info);

                if (!domain_type_is_vm_or_si(domain)) {
                        /*
                         * The domain_exit() function  can't be called under
                         * device_domain_lock, as it takes this lock itself.
                         * So release the lock here and re-run the loop
                         * afterwards.
                         */
                        spin_unlock_irqrestore(&device_domain_lock, flags);
                        domain_exit(domain);
                        goto again;
                }
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);

        if (iommu->gcmd & DMA_GCMD_TE)
                iommu_disable_translation(iommu);
}
1720
/*
 * Release all per-IOMMU state: the two-level domain pointer table, the
 * domain-id bitmap, the g_iommus slot, the context tables, and (with
 * SVM) the PASID tables and page-request queue.
 */
static void free_dmar_iommu(struct intel_iommu *iommu)
{
        if ((iommu->domains) && (iommu->domain_ids)) {
                /* Top-level table size mirrors iommu_init_domains(). */
                int elems = (cap_ndoms(iommu->cap) >> 8) + 1;
                int i;

                for (i = 0; i < elems; i++)
                        kfree(iommu->domains[i]);
                kfree(iommu->domains);
                kfree(iommu->domain_ids);
                iommu->domains = NULL;
                iommu->domain_ids = NULL;
        }

        g_iommus[iommu->seq_id] = NULL;

        /* free context mapping */
        free_context_table(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
        if (pasid_enabled(iommu)) {
                if (ecap_prs(iommu->ecap))
                        intel_svm_finish_prq(iommu);
                intel_svm_free_pasid_tables(iommu);
        }
#endif
}
1748
1749 static struct dmar_domain *alloc_domain(int flags)
1750 {
1751         struct dmar_domain *domain;
1752
1753         domain = alloc_domain_mem();
1754         if (!domain)
1755                 return NULL;
1756
1757         memset(domain, 0, sizeof(*domain));
1758         domain->nid = -1;
1759         domain->flags = flags;
1760         INIT_LIST_HEAD(&domain->devices);
1761
1762         return domain;
1763 }
1764
/*
 * Take one reference on @iommu for @domain.  On the first attachment to
 * this IOMMU a free domain id is claimed from iommu->domain_ids and
 * recorded in domain->iommu_did[].
 *
 * Must be called with device_domain_lock and iommu->lock held.
 * Returns 0 on success, -ENOSPC when the IOMMU has no free domain ids.
 */
static int domain_attach_iommu(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	unsigned long ndomains;
	int num;

	assert_spin_locked(&device_domain_lock);
	assert_spin_locked(&iommu->lock);

	domain->iommu_refcnt[iommu->seq_id] += 1;
	domain->iommu_count += 1;
	if (domain->iommu_refcnt[iommu->seq_id] == 1) {
		ndomains = cap_ndoms(iommu->cap);
		num      = find_first_zero_bit(iommu->domain_ids, ndomains);

		if (num >= ndomains) {
			pr_err("%s: No free domain ids\n", iommu->name);
			/* roll back the refcounts taken above */
			domain->iommu_refcnt[iommu->seq_id] -= 1;
			domain->iommu_count -= 1;
			return -ENOSPC;
		}

		set_bit(num, iommu->domain_ids);
		set_iommu_domain(iommu, num, domain);

		domain->iommu_did[iommu->seq_id] = num;
		domain->nid                      = iommu->node;

		domain_update_iommu_cap(domain);
	}

	return 0;
}
1799
1800 static int domain_detach_iommu(struct dmar_domain *domain,
1801                                struct intel_iommu *iommu)
1802 {
1803         int num, count = INT_MAX;
1804
1805         assert_spin_locked(&device_domain_lock);
1806         assert_spin_locked(&iommu->lock);
1807
1808         domain->iommu_refcnt[iommu->seq_id] -= 1;
1809         count = --domain->iommu_count;
1810         if (domain->iommu_refcnt[iommu->seq_id] == 0) {
1811                 num = domain->iommu_did[iommu->seq_id];
1812                 clear_bit(num, iommu->domain_ids);
1813                 set_iommu_domain(iommu, num, NULL);
1814
1815                 domain_update_iommu_cap(domain);
1816                 domain->iommu_did[iommu->seq_id] = 0;
1817         }
1818
1819         return count;
1820 }
1821
/* IOVA ranges (IOAPIC window, PCI MMIO) that must never be handed out
 * for DMA; copied into every new domain's allocator. */
static struct iova_domain reserved_iova_list;
/* Distinct lockdep class for reserved_iova_list's rbtree lock */
static struct lock_class_key reserved_rbtree_key;
1824
1825 static int dmar_init_reserved_ranges(void)
1826 {
1827         struct pci_dev *pdev = NULL;
1828         struct iova *iova;
1829         int i;
1830
1831         init_iova_domain(&reserved_iova_list, VTD_PAGE_SIZE, IOVA_START_PFN,
1832                         DMA_32BIT_PFN);
1833
1834         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1835                 &reserved_rbtree_key);
1836
1837         /* IOAPIC ranges shouldn't be accessed by DMA */
1838         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1839                 IOVA_PFN(IOAPIC_RANGE_END));
1840         if (!iova) {
1841                 pr_err("Reserve IOAPIC range failed\n");
1842                 return -ENODEV;
1843         }
1844
1845         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1846         for_each_pci_dev(pdev) {
1847                 struct resource *r;
1848
1849                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1850                         r = &pdev->resource[i];
1851                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1852                                 continue;
1853                         iova = reserve_iova(&reserved_iova_list,
1854                                             IOVA_PFN(r->start),
1855                                             IOVA_PFN(r->end));
1856                         if (!iova) {
1857                                 pr_err("Reserve iova failed\n");
1858                                 return -ENODEV;
1859                         }
1860                 }
1861         }
1862         return 0;
1863 }
1864
/* Seed @domain's IOVA allocator with the globally reserved ranges */
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
1869
/*
 * Round a guest address width up to the nearest width the page-table
 * hierarchy can actually express: 12 bits of page offset plus a whole
 * number of 9-bit translation levels, capped at 64 bits.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1883
/*
 * First-time initialization of @domain against @iommu for a guest
 * address width of @guest_width bits: IOVA allocator, AGAW selection,
 * coherency/snooping/superpage capability flags and the top-level page
 * directory.
 *
 * Returns 0 on success, -ENODEV when no hardware-supported AGAW can
 * cover the requested width, -ENOMEM when the top pgd allocation fails.
 */
static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu,
		       int guest_width)
{
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
			DMA_32BIT_PFN);
	/* keep the IOAPIC/PCI-MMIO ranges out of this domain's allocator */
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("Hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	/* highest superpage level supported, 0 if disabled on the cmdline */
	if (intel_iommu_superpage)
		domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	else
		domain->iommu_superpage = 0;

	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}
1934
/*
 * Free @domain: detach its devices, destroy its IOVA space, unmap the
 * whole address range and release the collected page-table pages.
 */
static void domain_exit(struct dmar_domain *domain)
{
	struct page *freelist = NULL;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	/* Flush any lazy unmaps that may reference this domain */
	if (!intel_iommu_strict)
		flush_unmaps_timeout(0);

	/* Remove associated devices and clear attached or cached domains */
	rcu_read_lock();
	domain_remove_dev_info(domain);
	rcu_read_unlock();

	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* unmap everything; the freed page tables come back as a list */
	freelist = domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	dma_free_pagelist(freelist);

	free_domain_mem(domain);
}
1961
1962 static int domain_context_mapping_one(struct dmar_domain *domain,
1963                                       struct intel_iommu *iommu,
1964                                       u8 bus, u8 devfn)
1965 {
1966         u16 did = domain->iommu_did[iommu->seq_id];
1967         int translation = CONTEXT_TT_MULTI_LEVEL;
1968         struct device_domain_info *info = NULL;
1969         struct context_entry *context;
1970         unsigned long flags;
1971         struct dma_pte *pgd;
1972         int ret, agaw;
1973
1974         WARN_ON(did == 0);
1975
1976         if (hw_pass_through && domain_type_is_si(domain))
1977                 translation = CONTEXT_TT_PASS_THROUGH;
1978
1979         pr_debug("Set context mapping for %02x:%02x.%d\n",
1980                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1981
1982         BUG_ON(!domain->pgd);
1983
1984         spin_lock_irqsave(&device_domain_lock, flags);
1985         spin_lock(&iommu->lock);
1986
1987         ret = -ENOMEM;
1988         context = iommu_context_addr(iommu, bus, devfn, 1);
1989         if (!context)
1990                 goto out_unlock;
1991
1992         ret = 0;
1993         if (context_present(context))
1994                 goto out_unlock;
1995
1996         /*
1997          * For kdump cases, old valid entries may be cached due to the
1998          * in-flight DMA and copied pgtable, but there is no unmapping
1999          * behaviour for them, thus we need an explicit cache flush for
2000          * the newly-mapped device. For kdump, at this point, the device
2001          * is supposed to finish reset at its driver probe stage, so no
2002          * in-flight DMA will exist, and we don't need to worry anymore
2003          * hereafter.
2004          */
2005         if (context_copied(context)) {
2006                 u16 did_old = context_domain_id(context);
2007
2008                 if (did_old >= 0 && did_old < cap_ndoms(iommu->cap))
2009                         iommu->flush.flush_context(iommu, did_old,
2010                                                    (((u16)bus) << 8) | devfn,
2011                                                    DMA_CCMD_MASK_NOBIT,
2012                                                    DMA_CCMD_DEVICE_INVL);
2013         }
2014
2015         pgd = domain->pgd;
2016
2017         context_clear_entry(context);
2018         context_set_domain_id(context, did);
2019
2020         /*
2021          * Skip top levels of page tables for iommu which has less agaw
2022          * than default.  Unnecessary for PT mode.
2023          */
2024         if (translation != CONTEXT_TT_PASS_THROUGH) {
2025                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
2026                         ret = -ENOMEM;
2027                         pgd = phys_to_virt(dma_pte_addr(pgd));
2028                         if (!dma_pte_present(pgd))
2029                                 goto out_unlock;
2030                 }
2031
2032                 info = iommu_support_dev_iotlb(domain, iommu, bus, devfn);
2033                 if (info && info->ats_supported)
2034                         translation = CONTEXT_TT_DEV_IOTLB;
2035                 else
2036                         translation = CONTEXT_TT_MULTI_LEVEL;
2037
2038                 context_set_address_root(context, virt_to_phys(pgd));
2039                 context_set_address_width(context, iommu->agaw);
2040         } else {
2041                 /*
2042                  * In pass through mode, AW must be programmed to
2043                  * indicate the largest AGAW value supported by
2044                  * hardware. And ASR is ignored by hardware.
2045                  */
2046                 context_set_address_width(context, iommu->msagaw);
2047         }
2048
2049         context_set_translation_type(context, translation);
2050         context_set_fault_enable(context);
2051         context_set_present(context);
2052         domain_flush_cache(domain, context, sizeof(*context));
2053
2054         /*
2055          * It's a non-present to present mapping. If hardware doesn't cache
2056          * non-present entry we only need to flush the write-buffer. If the
2057          * _does_ cache non-present entries, then it does so in the special
2058          * domain #0, which we have to flush:
2059          */
2060         if (cap_caching_mode(iommu->cap)) {
2061                 iommu->flush.flush_context(iommu, 0,
2062                                            (((u16)bus) << 8) | devfn,
2063                                            DMA_CCMD_MASK_NOBIT,
2064                                            DMA_CCMD_DEVICE_INVL);
2065                 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2066         } else {
2067                 iommu_flush_write_buffer(iommu);
2068         }
2069         iommu_enable_dev_iotlb(info);
2070
2071         ret = 0;
2072
2073 out_unlock:
2074         spin_unlock(&iommu->lock);
2075         spin_unlock_irqrestore(&device_domain_lock, flags);
2076
2077         return ret;
2078 }
2079
/* Bundled arguments carried through pci_for_each_dma_alias() to
 * domain_context_mapping_cb(). */
struct domain_context_mapping_data {
	struct dmar_domain *domain;
	struct intel_iommu *iommu;
};
2084
2085 static int domain_context_mapping_cb(struct pci_dev *pdev,
2086                                      u16 alias, void *opaque)
2087 {
2088         struct domain_context_mapping_data *data = opaque;
2089
2090         return domain_context_mapping_one(data->domain, data->iommu,
2091                                           PCI_BUS_NUM(alias), alias & 0xff);
2092 }
2093
2094 static int
2095 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2096 {
2097         struct intel_iommu *iommu;
2098         u8 bus, devfn;
2099         struct domain_context_mapping_data data;
2100
2101         iommu = device_to_iommu(dev, &bus, &devfn);
2102         if (!iommu)
2103                 return -ENODEV;
2104
2105         if (!dev_is_pci(dev))
2106                 return domain_context_mapping_one(domain, iommu, bus, devfn);
2107
2108         data.domain = domain;
2109         data.iommu = iommu;
2110
2111         return pci_for_each_dma_alias(to_pci_dev(dev),
2112                                       &domain_context_mapping_cb, &data);
2113 }
2114
2115 static int domain_context_mapped_cb(struct pci_dev *pdev,
2116                                     u16 alias, void *opaque)
2117 {
2118         struct intel_iommu *iommu = opaque;
2119
2120         return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2121 }
2122
2123 static int domain_context_mapped(struct device *dev)
2124 {
2125         struct intel_iommu *iommu;
2126         u8 bus, devfn;
2127
2128         iommu = device_to_iommu(dev, &bus, &devfn);
2129         if (!iommu)
2130                 return -ENODEV;
2131
2132         if (!dev_is_pci(dev))
2133                 return device_context_mapped(iommu, bus, devfn);
2134
2135         return !pci_for_each_dma_alias(to_pci_dev(dev),
2136                                        domain_context_mapped_cb, iommu);
2137 }
2138
2139 /* Returns a number of VTD pages, but aligned to MM page size */
2140 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2141                                             size_t size)
2142 {
2143         host_addr &= ~PAGE_MASK;
2144         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2145 }
2146
/* Return largest possible superpage level for a given mapping.
 * Level 1 means base (4KiB) pages; each extra level widens the page by
 * VTD_STRIDE_SHIFT bits, bounded by domain->iommu_superpage. */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
{
	int support, level = 1;
	unsigned long pfnmerge;

	support = domain->iommu_superpage;

	/* To use a large page, the virtual *and* physical addresses
	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
	   of them will mean we have to use smaller pages. So just
	   merge them and check both at once. */
	pfnmerge = iov_pfn | phy_pfn;

	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
		/* the mapping must also cover at least one whole superpage */
		pages >>= VTD_STRIDE_SHIFT;
		if (!pages)
			break;
		pfnmerge >>= VTD_STRIDE_SHIFT;
		level++;
		support--;
	}
	return level;
}
2174
/*
 * Fill in PTEs for @nr_pages starting at @iov_pfn.  Physical pages come
 * either from @sg (scatterlist mode) or, when @sg is NULL, from the
 * contiguous run starting at @phys_pfn.  Superpages are used whenever
 * both addresses are suitably aligned and the hardware supports them.
 *
 * Returns 0 on success, -EINVAL when @prot grants neither read nor
 * write, -ENOMEM when page-table allocation fails.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	unsigned long sg_res = 0;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	/* contiguous mode: one "segment" covering the whole request */
	if (!sg) {
		sg_res = nr_pages;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		/* advance to the next scatterlist segment */
		if (!sg_res) {
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is a large page */
			if (largepage_lvl > 1) {
				unsigned long nr_superpages, end_pfn;

				pteval |= DMA_PTE_LARGE_PAGE;
				lvl_pages = lvl_to_nr_pages(largepage_lvl);

				nr_superpages = sg_res / lvl_pages;
				end_pfn = iov_pfn + nr_superpages * lvl_pages - 1;

				/*
				 * Ensure that old small page tables are
				 * removed to make room for superpage(s).
				 */
				dma_pte_free_pagetable(domain, iov_pfn, end_pfn);
			} else {
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
			}

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was already set: double mapping, loud warning */
			static int dumps = 5;
			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
				iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
2284
/* Map a scatterlist into @domain at @iov_pfn; __domain_mapping wrapper */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}
2291
/* Map a contiguous pfn range into @domain; __domain_mapping wrapper */
static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
2298
2299 static void domain_context_clear_one(struct intel_iommu *iommu, u8 bus, u8 devfn)
2300 {
2301         if (!iommu)
2302                 return;
2303
2304         clear_context_table(iommu, bus, devfn);
2305         iommu->flush.flush_context(iommu, 0, 0, 0,
2306                                            DMA_CCMD_GLOBAL_INVL);
2307         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2308 }
2309
/*
 * Remove @info from its domain's device list and from the global list,
 * and clear the device's cached archdata pointer.  Caller must hold
 * device_domain_lock (asserted).
 */
static inline void unlink_domain_info(struct device_domain_info *info)
{
	assert_spin_locked(&device_domain_lock);
	list_del(&info->link);
	list_del(&info->global);
	if (info->dev)
		info->dev->archdata.iommu = NULL;
}
2318
/*
 * Detach every device currently attached to @domain.  Takes
 * device_domain_lock; the per-entry teardown is delegated to
 * __dmar_remove_one_dev_info().
 */
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info, *tmp;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry_safe(info, tmp, &domain->devices, link)
		__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
2329
/*
 * find_domain - look up the domain cached for @dev, if any.
 * The device_domain_info is cached in dev->archdata.iommu (set by
 * dmar_insert_one_dev_info(), cleared by unlink_domain_info()).
 */
static struct dmar_domain *find_domain(struct device *dev)
{
	struct device_domain_info *info;

	/* No lock here, assumes no domain exit in normal case */
	info = dev->archdata.iommu;
	if (info)
		return info->domain;
	return NULL;
}
2344
/*
 * Look up the device_domain_info for (segment, bus, devfn) on the
 * global list, or NULL if none.  Callers in this file hold
 * device_domain_lock around the walk.
 */
static inline struct device_domain_info *
dmar_search_domain_by_dev_info(int segment, int bus, int devfn)
{
	struct device_domain_info *info;

	list_for_each_entry(info, &device_domain_list, global)
		if (info->iommu->segment == segment && info->bus == bus &&
		    info->devfn == devfn)
			return info;

	return NULL;
}
2357
/*
 * Bind (bus, devfn)/@dev to @domain on @iommu: allocate the tracking
 * device_domain_info, probe ATS/PASID/PRI capabilities, attach the
 * domain to the IOMMU and install the context mapping.
 *
 * If the device (or an alias at the same bus/devfn) already has a
 * domain, that existing domain is returned instead and the caller must
 * free the one it passed in.  Returns NULL on failure.
 */
static struct dmar_domain *dmar_insert_one_dev_info(struct intel_iommu *iommu,
						    int bus, int devfn,
						    struct device *dev,
						    struct dmar_domain *domain)
{
	struct dmar_domain *found = NULL;
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return NULL;

	info->bus = bus;
	info->devfn = devfn;
	info->ats_supported = info->pasid_supported = info->pri_supported = 0;
	info->ats_enabled = info->pasid_enabled = info->pri_enabled = 0;
	info->ats_qdep = 0;
	info->dev = dev;
	info->domain = domain;
	info->iommu = iommu;

	/* probe ATS / PASID / PRI support before taking any locks */
	if (dev && dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(info->dev);

		if (ecap_dev_iotlb_support(iommu->ecap) &&
		    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS) &&
		    dmar_find_matched_atsr_unit(pdev))
			info->ats_supported = 1;

		if (ecs_enabled(iommu)) {
			if (pasid_enabled(iommu)) {
				int features = pci_pasid_features(pdev);
				if (features >= 0)
					info->pasid_supported = features | 1;
			}

			if (info->ats_supported && ecap_prs(iommu->ecap) &&
			    pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI))
				info->pri_supported = 1;
		}
	}

	spin_lock_irqsave(&device_domain_lock, flags);
	if (dev)
		found = find_domain(dev);

	/* also honour a domain registered for the same (bus, devfn) alias */
	if (!found) {
		struct device_domain_info *info2;
		info2 = dmar_search_domain_by_dev_info(iommu->segment, bus, devfn);
		if (info2) {
			found      = info2->domain;
			info2->dev = dev;
		}
	}

	if (found) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		/* Caller must free the original domain */
		return found;
	}

	spin_lock(&iommu->lock);
	ret = domain_attach_iommu(domain, iommu);
	spin_unlock(&iommu->lock);

	if (ret) {
		spin_unlock_irqrestore(&device_domain_lock, flags);
		free_devinfo_mem(info);
		return NULL;
	}

	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	if (dev)
		dev->archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (dev && domain_context_mapping(domain, dev)) {
		pr_err("Domain context map for %s failed\n", dev_name(dev));
		dmar_remove_one_dev_info(domain, dev);
		return NULL;
	}

	return domain;
}
2446
2447 static int get_last_alias(struct pci_dev *pdev, u16 alias, void *opaque)
2448 {
2449         *(u16 *)opaque = alias;
2450         return 0;
2451 }
2452
2453 /* domain is initialized */
2454 static struct dmar_domain *get_domain_for_dev(struct device *dev, int gaw)
2455 {
2456         struct device_domain_info *info = NULL;
2457         struct dmar_domain *domain, *tmp;
2458         struct intel_iommu *iommu;
2459         u16 req_id, dma_alias;
2460         unsigned long flags;
2461         u8 bus, devfn;
2462
2463         domain = find_domain(dev);
2464         if (domain)
2465                 return domain;
2466
2467         iommu = device_to_iommu(dev, &bus, &devfn);
2468         if (!iommu)
2469                 return NULL;
2470
2471         req_id = ((u16)bus << 8) | devfn;
2472
2473         if (dev_is_pci(dev)) {
2474                 struct pci_dev *pdev = to_pci_dev(dev);
2475
2476                 pci_for_each_dma_alias(pdev, get_last_alias, &dma_alias);
2477
2478                 spin_lock_irqsave(&device_domain_lock, flags);
2479                 info = dmar_search_domain_by_dev_info(pci_domain_nr(pdev->bus),
2480                                                       PCI_BUS_NUM(dma_alias),
2481                                                       dma_alias & 0xff);
2482                 if (info) {
2483                         iommu = info->iommu;
2484                         domain = info->domain;
2485                 }
2486                 spin_unlock_irqrestore(&device_domain_lock, flags);
2487
2488                 /* DMA alias already has a domain, uses it */
2489                 if (info)
2490                         goto found_domain;
2491         }
2492
2493         /* Allocate and initialize new domain for the device */
2494         domain = alloc_domain(0);
2495         if (!domain)
2496                 return NULL;
2497         if (domain_init(domain, iommu, gaw)) {
2498                 domain_exit(domain);
2499                 return NULL;
2500         }
2501
2502         /* register PCI DMA alias device */
2503         if (req_id != dma_alias && dev_is_pci(dev)) {
2504                 tmp = dmar_insert_one_dev_info(iommu, PCI_BUS_NUM(dma_alias),
2505                                                dma_alias & 0xff, NULL, domain);
2506
2507                 if (!tmp || tmp != domain) {
2508                         domain_exit(domain);
2509                         domain = tmp;
2510                 }
2511
2512                 if (!domain)
2513                         return NULL;
2514         }
2515
2516 found_domain:
2517         tmp = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2518
2519         if (!tmp || tmp != domain) {
2520                 domain_exit(domain);
2521                 domain = tmp;
2522         }
2523
2524         return domain;
2525 }
2526
/*
 * Identity-map the physical range [start, end] into @domain (DMA
 * address == physical address), after reserving the matching IOVA range
 * so the allocator never hands it out.  Returns 0 or a negative errno.
 */
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		pr_err("Reserving iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx\n", start, end);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
				  last_vpfn - first_vpfn + 1,
				  DMA_PTE_READ|DMA_PTE_WRITE);
}
2551
2552 static int domain_prepare_identity_map(struct device *dev,
2553                                        struct dmar_domain *domain,
2554                                        unsigned long long start,
2555                                        unsigned long long end)
2556 {
2557         /* For _hardware_ passthrough, don't bother. But for software
2558            passthrough, we do it anyway -- it may indicate a memory
2559            range which is reserved in E820, so which didn't get set
2560            up to start with in si_domain */
2561         if (domain == si_domain && hw_pass_through) {
2562                 pr_warn("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2563                         dev_name(dev), start, end);
2564                 return 0;
2565         }
2566
2567         pr_info("Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2568                 dev_name(dev), start, end);
2569
2570         if (end < start) {
2571                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2572                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2573                         dmi_get_system_info(DMI_BIOS_VENDOR),
2574                         dmi_get_system_info(DMI_BIOS_VERSION),
2575                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2576                 return -EIO;
2577         }
2578
2579         if (end >> agaw_to_width(domain->agaw)) {
2580                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2581                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2582                      agaw_to_width(domain->agaw),
2583                      dmi_get_system_info(DMI_BIOS_VENDOR),
2584                      dmi_get_system_info(DMI_BIOS_VERSION),
2585                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2586                 return -EIO;
2587         }
2588
2589         return iommu_domain_identity_map(domain, start, end);
2590 }
2591
2592 static int iommu_prepare_identity_map(struct device *dev,
2593                                       unsigned long long start,
2594                                       unsigned long long end)
2595 {
2596         struct dmar_domain *domain;
2597         int ret;
2598
2599         domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2600         if (!domain)
2601                 return -ENOMEM;
2602
2603         ret = domain_prepare_identity_map(dev, domain, start, end);
2604         if (ret)
2605                 domain_exit(domain);
2606
2607         return ret;
2608 }
2609
2610 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2611                                          struct device *dev)
2612 {
2613         if (dev->archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2614                 return 0;
2615         return iommu_prepare_identity_map(dev, rmrr->base_address,
2616                                           rmrr->end_address);
2617 }
2618
2619 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2620 static inline void iommu_prepare_isa(void)
2621 {
2622         struct pci_dev *pdev;
2623         int ret;
2624
2625         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2626         if (!pdev)
2627                 return;
2628
2629         pr_info("Prepare 0-16MiB unity mapping for LPC\n");
2630         ret = iommu_prepare_identity_map(&pdev->dev, 0, 16*1024*1024 - 1);
2631
2632         if (ret)
2633                 pr_err("Failed to create 0-16MiB identity map - floppy might not work\n");
2634
2635         pci_dev_put(pdev);
2636 }
2637 #else
/* No-op stub when the floppy workaround is configured out. */
static inline void iommu_prepare_isa(void)
{
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2643
2644 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2645
/*
 * Allocate and initialize si_domain, the static identity (1:1) domain.
 *
 * @hw: non-zero when hardware pass-through is in use; the IOMMU then
 *      translates nothing for these devices, so no page tables are built.
 *
 * Returns 0 on success, -EFAULT on allocation/initialization failure, or
 * the error from iommu_domain_identity_map().
 */
static int __init si_domain_init(int hw)
{
	int nid, ret = 0;

	si_domain = alloc_domain(DOMAIN_FLAG_STATIC_IDENTITY);
	if (!si_domain)
		return -EFAULT;

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	pr_debug("Identity mapping domain allocated\n");

	/* Hardware pass-through: no page tables needed at all. */
	if (hw)
		return 0;

	/* Software identity mapping: 1:1 map every present RAM range. */
	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
			if (ret)
				return ret;
		}
	}

	return 0;
}
2678
2679 static int identity_mapping(struct device *dev)
2680 {
2681         struct device_domain_info *info;
2682
2683         if (likely(!iommu_identity_mapping))
2684                 return 0;
2685
2686         info = dev->archdata.iommu;
2687         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2688                 return (info->domain == si_domain);
2689
2690         return 0;
2691 }
2692
2693 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2694 {
2695         struct dmar_domain *ndomain;
2696         struct intel_iommu *iommu;
2697         u8 bus, devfn;
2698
2699         iommu = device_to_iommu(dev, &bus, &devfn);
2700         if (!iommu)
2701                 return -ENODEV;
2702
2703         ndomain = dmar_insert_one_dev_info(iommu, bus, devfn, dev, domain);
2704         if (ndomain != domain)
2705                 return -EBUSY;
2706
2707         return 0;
2708 }
2709
2710 static bool device_has_rmrr(struct device *dev)
2711 {
2712         struct dmar_rmrr_unit *rmrr;
2713         struct device *tmp;
2714         int i;
2715
2716         rcu_read_lock();
2717         for_each_rmrr_units(rmrr) {
2718                 /*
2719                  * Return TRUE if this RMRR contains the device that
2720                  * is passed in.
2721                  */
2722                 for_each_active_dev_scope(rmrr->devices,
2723                                           rmrr->devices_cnt, i, tmp)
2724                         if (tmp == dev) {
2725                                 rcu_read_unlock();
2726                                 return true;
2727                         }
2728         }
2729         rcu_read_unlock();
2730         return false;
2731 }
2732
2733 /*
2734  * There are a couple cases where we need to restrict the functionality of
2735  * devices associated with RMRRs.  The first is when evaluating a device for
2736  * identity mapping because problems exist when devices are moved in and out
2737  * of domains and their respective RMRR information is lost.  This means that
2738  * a device with associated RMRRs will never be in a "passthrough" domain.
2739  * The second is use of the device through the IOMMU API.  This interface
2740  * expects to have full control of the IOVA space for the device.  We cannot
2741  * satisfy both the requirement that RMRR access is maintained and have an
2742  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2743  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2744  * We therefore prevent devices associated with an RMRR from participating in
2745  * the IOMMU API, which eliminates them from device assignment.
2746  *
2747  * In both cases we assume that PCI USB devices with RMRRs have them largely
2748  * for historical reasons and that the RMRR space is not actively used post
2749  * boot.  This exclusion may change if vendors begin to abuse it.
2750  *
2751  * The same exception is made for graphics devices, with the requirement that
2752  * any use of the RMRR regions will be torn down before assigning the device
2753  * to a guest.
2754  */
2755 static bool device_is_rmrr_locked(struct device *dev)
2756 {
2757         if (!device_has_rmrr(dev))
2758                 return false;
2759
2760         if (dev_is_pci(dev)) {
2761                 struct pci_dev *pdev = to_pci_dev(dev);
2762
2763                 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2764                         return false;
2765         }
2766
2767         return true;
2768 }
2769
/*
 * Decide whether @dev should live in the static 1:1 identity domain.
 *
 * @startup: non-zero during boot-time setup.  After boot (startup == 0)
 *           we additionally require the device's DMA mask to cover all
 *           memory the platform may hand it, because a 1:1 mapping only
 *           works if the device can address that memory directly.
 *
 * Returns 1 to identity-map the device, 0 to give it a private domain.
 */
static int iommu_should_identity_map(struct device *dev, int startup)
{

	if (dev_is_pci(dev)) {
		struct pci_dev *pdev = to_pci_dev(dev);

		/* RMRR-locked devices must never be in a passthrough domain. */
		if (device_is_rmrr_locked(dev))
			return 0;

		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
			return 1;

		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
			return 1;

		if (!(iommu_identity_mapping & IDENTMAP_ALL))
			return 0;

		/*
		 * We want to start off with all devices in the 1:1 domain, and
		 * take them out later if we find they can't access all of memory.
		 *
		 * However, we can't do this for PCI devices behind bridges,
		 * because all PCI devices behind the same bridge will end up
		 * with the same source-id on their transactions.
		 *
		 * Practically speaking, we can't change things around for these
		 * devices at run-time, because we can't be sure there'll be no
		 * DMA transactions in flight for any of their siblings.
		 *
		 * So PCI devices (unless they're on the root bus) as well as
		 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
		 * the 1:1 domain, just in _case_ one of their siblings turns out
		 * not to be able to map all of memory.
		 */
		if (!pci_is_pcie(pdev)) {
			if (!pci_is_root_bus(pdev->bus))
				return 0;
			if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
				return 0;
		} else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
	} else {
		if (device_has_rmrr(dev))
			return 0;
	}

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will -- if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = *dev->dma_mask;

		/* The tighter of the streaming and coherent masks governs. */
		if (dev->coherent_dma_mask &&
		    dev->coherent_dma_mask < dma_mask)
			dma_mask = dev->coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(dev);
	}

	return 1;
}
2838
2839 static int __init dev_prepare_static_identity_mapping(struct device *dev, int hw)
2840 {
2841         int ret;
2842
2843         if (!iommu_should_identity_map(dev, 1))
2844                 return 0;
2845
2846         ret = domain_add_dev_info(si_domain, dev);
2847         if (!ret)
2848                 pr_info("%s identity mapping for device %s\n",
2849                         hw ? "Hardware" : "Software", dev_name(dev));
2850         else if (ret == -ENODEV)
2851                 /* device not associated with an iommu */
2852                 ret = 0;
2853
2854         return ret;
2855 }
2856
2857
/*
 * Walk every candidate device and attach the qualifying ones to the
 * static identity domain: all PCI devices, plus the physical companion
 * devices of ACPI namespace devices listed in the DRHD device scopes.
 * Returns 0 on success or the first error encountered.
 */
static int __init iommu_prepare_static_identity_mapping(int hw)
{
	struct pci_dev *pdev = NULL;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	struct device *dev;
	int i;
	int ret = 0;

	for_each_pci_dev(pdev) {
		ret = dev_prepare_static_identity_mapping(&pdev->dev, hw);
		if (ret)
			return ret;
	}

	/* ACPI namespace devices: map each of their physical nodes. */
	for_each_active_iommu(iommu, drhd)
		for_each_active_dev_scope(drhd->devices, drhd->devices_cnt, i, dev) {
			struct acpi_device_physical_node *pn;
			struct acpi_device *adev;

			if (dev->bus != &acpi_bus_type)
				continue;

			adev= to_acpi_device(dev);
			/* physical_node_list is mutated elsewhere; lock it. */
			mutex_lock(&adev->physical_node_lock);
			list_for_each_entry(pn, &adev->physical_node_list, node) {
				ret = dev_prepare_static_identity_mapping(pn->dev, hw);
				if (ret)
					break;
			}
			mutex_unlock(&adev->physical_node_lock);
			if (ret)
				return ret;
		}

	return 0;
}
2895
/*
 * Select the invalidation method for @iommu: Queued Invalidation (QI)
 * when it can be enabled, otherwise fall back to register-based
 * invalidation.  Installs the chosen callbacks in iommu->flush.
 */
static void intel_iommu_init_qi(struct intel_iommu *iommu)
{
	/*
	 * Start from the sane iommu hardware state.
	 * If the queued invalidation is already initialized by us
	 * (for example, while enabling interrupt-remapping) then
	 * we got the things already rolling from a sane state.
	 */
	if (!iommu->qi) {
		/*
		 * Clear any previous faults.
		 */
		dmar_fault(-1, iommu);
		/*
		 * Disable queued invalidation if supported and already enabled
		 * before OS handover.
		 */
		dmar_disable_qi(iommu);
	}

	if (dmar_enable_qi(iommu)) {
		/*
		 * Queued Invalidate not enabled, use Register Based Invalidate
		 */
		iommu->flush.flush_context = __iommu_flush_context;
		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
		pr_info("%s: Using Register based invalidation\n",
			iommu->name);
	} else {
		iommu->flush.flush_context = qi_flush_context;
		iommu->flush.flush_iotlb = qi_flush_iotlb;
		pr_info("%s: Using Queued invalidation\n", iommu->name);
	}
}
2930
2931 static int copy_context_table(struct intel_iommu *iommu,
2932                               struct root_entry *old_re,
2933                               struct context_entry **tbl,
2934                               int bus, bool ext)
2935 {
2936         int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2937         struct context_entry *new_ce = NULL, ce;
2938         struct context_entry *old_ce = NULL;
2939         struct root_entry re;
2940         phys_addr_t old_ce_phys;
2941
2942         tbl_idx = ext ? bus * 2 : bus;
2943         memcpy(&re, old_re, sizeof(re));
2944
2945         for (devfn = 0; devfn < 256; devfn++) {
2946                 /* First calculate the correct index */
2947                 idx = (ext ? devfn * 2 : devfn) % 256;
2948
2949                 if (idx == 0) {
2950                         /* First save what we may have and clean up */
2951                         if (new_ce) {
2952                                 tbl[tbl_idx] = new_ce;
2953                                 __iommu_flush_cache(iommu, new_ce,
2954                                                     VTD_PAGE_SIZE);
2955                                 pos = 1;
2956                         }
2957
2958                         if (old_ce)
2959                                 iounmap(old_ce);
2960
2961                         ret = 0;
2962                         if (devfn < 0x80)
2963                                 old_ce_phys = root_entry_lctp(&re);
2964                         else
2965                                 old_ce_phys = root_entry_uctp(&re);
2966
2967                         if (!old_ce_phys) {
2968                                 if (ext && devfn == 0) {
2969                                         /* No LCTP, try UCTP */
2970                                         devfn = 0x7f;
2971                                         continue;
2972                                 } else {
2973                                         goto out;
2974                                 }
2975                         }
2976
2977                         ret = -ENOMEM;
2978                         old_ce = memremap(old_ce_phys, PAGE_SIZE,
2979                                         MEMREMAP_WB);
2980                         if (!old_ce)
2981                                 goto out;
2982
2983                         new_ce = alloc_pgtable_page(iommu->node);
2984                         if (!new_ce)
2985                                 goto out_unmap;
2986
2987                         ret = 0;
2988                 }
2989
2990                 /* Now copy the context entry */
2991                 memcpy(&ce, old_ce + idx, sizeof(ce));
2992
2993                 if (!__context_present(&ce))
2994                         continue;
2995
2996                 did = context_domain_id(&ce);
2997                 if (did >= 0 && did < cap_ndoms(iommu->cap))
2998                         set_bit(did, iommu->domain_ids);
2999
3000                 /*
3001                  * We need a marker for copied context entries. This
3002                  * marker needs to work for the old format as well as
3003                  * for extended context entries.
3004                  *
3005                  * Bit 67 of the context entry is used. In the old
3006                  * format this bit is available to software, in the
3007                  * extended format it is the PGE bit, but PGE is ignored
3008                  * by HW if PASIDs are disabled (and thus still
3009                  * available).
3010                  *
3011                  * So disable PASIDs first and then mark the entry
3012                  * copied. This means that we don't copy PASID
3013                  * translations from the old kernel, but this is fine as
3014                  * faults there are not fatal.
3015                  */
3016                 context_clear_pasid_enable(&ce);
3017                 context_set_copied(&ce);
3018
3019                 new_ce[idx] = ce;
3020         }
3021
3022         tbl[tbl_idx + pos] = new_ce;
3023
3024         __iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
3025
3026 out_unmap:
3027         memunmap(old_ce);
3028
3029 out:
3030         return ret;
3031 }
3032
3033 static int copy_translation_tables(struct intel_iommu *iommu)
3034 {
3035         struct context_entry **ctxt_tbls;
3036         struct root_entry *old_rt;
3037         phys_addr_t old_rt_phys;
3038         int ctxt_table_entries;
3039         unsigned long flags;
3040         u64 rtaddr_reg;
3041         int bus, ret;
3042         bool new_ext, ext;
3043
3044         rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
3045         ext        = !!(rtaddr_reg & DMA_RTADDR_RTT);
3046         new_ext    = !!ecap_ecs(iommu->ecap);
3047
3048         /*
3049          * The RTT bit can only be changed when translation is disabled,
3050          * but disabling translation means to open a window for data
3051          * corruption. So bail out and don't copy anything if we would
3052          * have to change the bit.
3053          */
3054         if (new_ext != ext)
3055                 return -EINVAL;
3056
3057         old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
3058         if (!old_rt_phys)
3059                 return -EINVAL;
3060
3061         old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
3062         if (!old_rt)
3063                 return -ENOMEM;
3064
3065         /* This is too big for the stack - allocate it from slab */
3066         ctxt_table_entries = ext ? 512 : 256;
3067         ret = -ENOMEM;
3068         ctxt_tbls = kzalloc(ctxt_table_entries * sizeof(void *), GFP_KERNEL);
3069         if (!ctxt_tbls)
3070                 goto out_unmap;
3071
3072         for (bus = 0; bus < 256; bus++) {
3073                 ret = copy_context_table(iommu, &old_rt[bus],
3074                                          ctxt_tbls, bus, ext);
3075                 if (ret) {
3076                         pr_err("%s: Failed to copy context table for bus %d\n",
3077                                 iommu->name, bus);
3078                         continue;
3079                 }
3080         }
3081
3082         spin_lock_irqsave(&iommu->lock, flags);
3083
3084         /* Context tables are copied, now write them to the root_entry table */
3085         for (bus = 0; bus < 256; bus++) {
3086                 int idx = ext ? bus * 2 : bus;
3087                 u64 val;
3088
3089                 if (ctxt_tbls[idx]) {
3090                         val = virt_to_phys(ctxt_tbls[idx]) | 1;
3091                         iommu->root_entry[bus].lo = val;
3092                 }
3093
3094                 if (!ext || !ctxt_tbls[idx + 1])
3095                         continue;
3096
3097                 val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
3098                 iommu->root_entry[bus].hi = val;
3099         }
3100
3101         spin_unlock_irqrestore(&iommu->lock, flags);
3102
3103         kfree(ctxt_tbls);
3104
3105         __iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
3106
3107         ret = 0;
3108
3109 out_unmap:
3110         memunmap(old_rt);
3111
3112         return ret;
3113 }
3114
/*
 * One-time boot initialization of all DMAR units: allocate the global
 * iommu array and deferred-flush tables, initialize each IOMMU (QI,
 * domain IDs, root entry, optional kdump table copy), set up identity
 * mappings (static 1:1 domain, RMRRs, ISA workaround), then enable
 * fault reporting and translation.  Returns 0 or a negative errno.
 */
static int __init init_dmars(void)
{
	struct dmar_drhd_unit *drhd;
	struct dmar_rmrr_unit *rmrr;
	bool copied_tables = false;
	struct device *dev;
	struct intel_iommu *iommu;
	int i, ret;

	/*
	 * for each drhd
	 *    allocate root
	 *    initialize and program root entry to not present
	 * endfor
	 */
	for_each_drhd_unit(drhd) {
		/*
		 * lock not needed as this is only incremented in the single
		 * threaded kernel __init code path all other access are read
		 * only
		 */
		if (g_num_of_iommus < DMAR_UNITS_SUPPORTED) {
			g_num_of_iommus++;
			continue;
		}
		pr_err_once("Exceeded %d IOMMUs\n", DMAR_UNITS_SUPPORTED);
	}

	/* Preallocate enough resources for IOMMU hot-addition */
	if (g_num_of_iommus < DMAR_UNITS_SUPPORTED)
		g_num_of_iommus = DMAR_UNITS_SUPPORTED;

	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
			GFP_KERNEL);
	if (!g_iommus) {
		pr_err("Allocating global iommu array failed\n");
		ret = -ENOMEM;
		goto error;
	}

	deferred_flush = kzalloc(g_num_of_iommus *
		sizeof(struct deferred_flush_tables), GFP_KERNEL);
	if (!deferred_flush) {
		ret = -ENOMEM;
		goto free_g_iommus;
	}

	for_each_active_iommu(iommu, drhd) {
		g_iommus[iommu->seq_id] = iommu;

		intel_iommu_init_qi(iommu);

		ret = iommu_init_domains(iommu);
		if (ret)
			goto free_iommu;

		init_translation_status(iommu);

		/*
		 * Translation left enabled by firmware/bootloader outside
		 * of kdump is unexpected; start from a clean state.
		 */
		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
			iommu_disable_translation(iommu);
			clear_translation_pre_enabled(iommu);
			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
				iommu->name);
		}

		/*
		 * TBD:
		 * we could share the same root & context tables
		 * among all IOMMU's. Need to Split it later.
		 */
		ret = iommu_alloc_root_entry(iommu);
		if (ret)
			goto free_iommu;

		if (translation_pre_enabled(iommu)) {
			pr_info("Translation already enabled - trying to copy translation structures\n");

			ret = copy_translation_tables(iommu);
			if (ret) {
				/*
				 * We found the IOMMU with translation
				 * enabled - but failed to copy over the
				 * old root-entry table. Try to proceed
				 * by disabling translation now and
				 * allocating a clean root-entry table.
				 * This might cause DMAR faults, but
				 * probably the dump will still succeed.
				 */
				pr_err("Failed to copy translation tables from previous kernel for %s\n",
				       iommu->name);
				iommu_disable_translation(iommu);
				clear_translation_pre_enabled(iommu);
			} else {
				pr_info("Copied translation tables from previous kernel for %s\n",
					iommu->name);
				copied_tables = true;
			}
		}

		/* Pass-through is only usable if every IOMMU supports it. */
		if (!ecap_pass_through(iommu->ecap))
			hw_pass_through = 0;
#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_enabled(iommu))
			intel_svm_alloc_pasid_tables(iommu);
#endif
	}

	/*
	 * Now that qi is enabled on all iommus, set the root entry and flush
	 * caches. This is required on some Intel X58 chipsets, otherwise the
	 * flush_context function will loop forever and the boot hangs.
	 */
	for_each_active_iommu(iommu, drhd) {
		iommu_flush_write_buffer(iommu);
		iommu_set_root_entry(iommu);
		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	}

	if (iommu_pass_through)
		iommu_identity_mapping |= IDENTMAP_ALL;

#ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
	iommu_identity_mapping |= IDENTMAP_GFX;
#endif

	if (iommu_identity_mapping) {
		ret = si_domain_init(hw_pass_through);
		if (ret)
			goto free_iommu;
	}

	check_tylersburg_isoch();

	/*
	 * If we copied translations from a previous kernel in the kdump
	 * case, we can not assign the devices to domains now, as that
	 * would eliminate the old mappings. So skip this part and defer
	 * the assignment to device driver initialization time.
	 */
	if (copied_tables)
		goto domains_done;

	/*
	 * If pass through is not set or not enabled, setup context entries for
	 * identity mappings for rmrr, gfx, and isa and may fall back to static
	 * identity mapping if iommu_identity_mapping is set.
	 */
	if (iommu_identity_mapping) {
		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
		if (ret) {
			pr_crit("Failed to setup IOMMU pass-through\n");
			goto free_iommu;
		}
	}
	/*
	 * For each rmrr
	 *   for each dev attached to rmrr
	 *   do
	 *     locate drhd for dev, alloc domain for dev
	 *     allocate free domain
	 *     allocate page table entries for rmrr
	 *     if context not allocated for bus
	 *           allocate and init context
	 *           set present in root table for this bus
	 *     init context with domain, translation etc
	 *    endfor
	 * endfor
	 */
	pr_info("Setting RMRR:\n");
	for_each_rmrr_units(rmrr) {
		/* some BIOS lists non-exist devices in DMAR table. */
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, dev) {
			ret = iommu_prepare_rmrr_dev(rmrr, dev);
			if (ret)
				pr_err("Mapping reserved region failed\n");
		}
	}

	iommu_prepare_isa();

domains_done:

	/*
	 * for each drhd
	 *   enable fault log
	 *   global invalidate context cache
	 *   global invalidate iotlb
	 *   enable translation
	 */
	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
		if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
			ret = intel_svm_enable_prq(iommu);
			if (ret)
				goto free_iommu;
		}
#endif
		ret = dmar_set_interrupt(iommu);
		if (ret)
			goto free_iommu;

		if (!translation_pre_enabled(iommu))
			iommu_enable_translation(iommu);

		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;

free_iommu:
	for_each_active_iommu(iommu, drhd) {
		disable_dmar_iommu(iommu);
		free_dmar_iommu(iommu);
	}
	kfree(deferred_flush);
free_g_iommus:
	kfree(g_iommus);
error:
	return ret;
}
3349
3350 /* This takes a number of _MM_ pages, not VTD pages */
3351 static struct iova *intel_alloc_iova(struct device *dev,
3352                                      struct dmar_domain *domain,
3353                                      unsigned long nrpages, uint64_t dma_mask)
3354 {
3355         struct iova *iova = NULL;
3356
3357         /* Restrict dma_mask to the width that the iommu can handle */
3358         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
3359         /* Ensure we reserve the whole size-aligned region */
3360         nrpages = __roundup_pow_of_two(nrpages);
3361
3362         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
3363                 /*
3364                  * First try to allocate an io virtual address in
3365                  * DMA_BIT_MASK(32) and if that fails then try allocating
3366                  * from higher range
3367                  */
3368                 iova = alloc_iova(&domain->iovad, nrpages,
3369                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
3370                 if (iova)
3371                         return iova;
3372         }
3373         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
3374         if (unlikely(!iova)) {
3375                 pr_err("Allocating %ld-page iova for %s failed",
3376                        nrpages, dev_name(dev));
3377                 return NULL;
3378         }
3379
3380         return iova;
3381 }
3382
/*
 * Slow path of get_valid_domain_for_dev(): find or allocate a DMA
 * domain for @dev, then identity-map any RMRR regions that target the
 * device so that in-flight firmware DMA keeps working once translation
 * is on.  Returns the domain, or NULL on allocation failure.
 */
static struct dmar_domain *__get_valid_domain_for_dev(struct device *dev)
{
	struct dmar_rmrr_unit *rmrr;
	struct dmar_domain *domain;
	struct device *i_dev;
	int i, ret;

	domain = get_domain_for_dev(dev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain) {
		pr_err("Allocating domain for %s failed\n",
		       dev_name(dev));
		return NULL;
	}

	/* We have a new domain - setup possible RMRRs for the device */
	rcu_read_lock();
	for_each_rmrr_units(rmrr) {
		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
					  i, i_dev) {
			if (i_dev != dev)
				continue;

			/* RMRR applies to this device: map it 1:1 */
			ret = domain_prepare_identity_map(dev, domain,
							  rmrr->base_address,
							  rmrr->end_address);
			if (ret)
				dev_err(dev, "Mapping reserved region failed\n");
		}
	}
	rcu_read_unlock();

	return domain;
}
3416
3417 static inline struct dmar_domain *get_valid_domain_for_dev(struct device *dev)
3418 {
3419         struct device_domain_info *info;
3420
3421         /* No lock here, assumes no domain exit in normal case */
3422         info = dev->archdata.iommu;
3423         if (likely(info))
3424                 return info->domain;
3425
3426         return __get_valid_domain_for_dev(dev);
3427 }
3428
3429 /* Check if the dev needs to go through non-identity map and unmap process.*/
3430 static int iommu_no_mapping(struct device *dev)
3431 {
3432         int found;
3433
3434         if (iommu_dummy(dev))
3435                 return 1;
3436
3437         if (!iommu_identity_mapping)
3438                 return 0;
3439
3440         found = identity_mapping(dev);
3441         if (found) {
3442                 if (iommu_should_identity_map(dev, 0))
3443                         return 1;
3444                 else {
3445                         /*
3446                          * 32 bit DMA is removed from si_domain and fall back
3447                          * to non-identity mapping.
3448                          */
3449                         dmar_remove_one_dev_info(si_domain, dev);
3450                         pr_info("32bit %s uses non-identity mapping\n",
3451                                 dev_name(dev));
3452                         return 0;
3453                 }
3454         } else {
3455                 /*
3456                  * In case of a detached 64 bit DMA device from vm, the device
3457                  * is put into si_domain for identity mapping.
3458                  */
3459                 if (iommu_should_identity_map(dev, 0)) {
3460                         int ret;
3461                         ret = domain_add_dev_info(si_domain, dev);
3462                         if (!ret) {
3463                                 pr_info("64bit %s uses identity mapping\n",
3464                                         dev_name(dev));
3465                                 return 1;
3466                         }
3467                 }
3468         }
3469
3470         return 0;
3471 }
3472
/*
 * Map the physically contiguous range [paddr, paddr + size) into
 * @dev's DMA domain and return the resulting bus address, or 0 on
 * failure.  Devices that bypass the IOMMU get @paddr back unchanged.
 */
static dma_addr_t __intel_map_single(struct device *dev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	if (iommu_no_mapping(dev))
		return paddr;

	domain = get_valid_domain_for_dev(dev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	/* From here on, size counts VT-d pages, not bytes */
	size = aligned_nrpages(paddr, size);

	iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be partial page, we should map the whole
	 * page.  Note: if two part of one page are separately mapped, we
	 * might have two guest_addr mapping to the same host paddr, but this
	 * is not a big problem
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain,
				      mm_to_dma_pfn(iova->pfn_lo),
				      size, 0, 1);
	else
		iommu_flush_write_buffer(iommu);

	/* Keep the sub-page offset of the original physical address */
	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	pr_err("Device %s request: %zx@%llx dir %d --- failed\n",
		dev_name(dev), size, (unsigned long long)paddr, dir);
	return 0;
}
3539
3540 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
3541                                  unsigned long offset, size_t size,
3542                                  enum dma_data_direction dir,
3543                                  struct dma_attrs *attrs)
3544 {
3545         return __intel_map_single(dev, page_to_phys(page) + offset, size,
3546                                   dir, *dev->dma_mask);
3547 }
3548
/*
 * Drain every per-IOMMU deferred-unmap queue: invalidate the IOTLB
 * (or per-entry ranges in caching mode), then free the queued iovas
 * and page-table page lists.  Callers hold async_umap_flush_lock
 * (see flush_unmaps_timeout() and add_unmap()).
 */
static void flush_unmaps(void)
{
	int i, j;

	/* Queues are being emptied; the flush timer is no longer needed */
	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		if (!iommu)
			continue;

		if (!deferred_flush[i].next)
			continue;

		/* In caching mode, global flushes turn emulation expensive */
		if (!cap_caching_mode(iommu->cap))
			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		for (j = 0; j < deferred_flush[i].next; j++) {
			unsigned long mask;
			struct iova *iova = deferred_flush[i].iova[j];
			struct dmar_domain *domain = deferred_flush[i].domain[j];

			/* On real hardware multiple invalidations are expensive */
			if (cap_caching_mode(iommu->cap))
				iommu_flush_iotlb_psi(iommu, domain,
					iova->pfn_lo, iova_size(iova),
					!deferred_flush[i].freelist[j], 0);
			else {
				mask = ilog2(mm_to_dma_pfn(iova_size(iova)));
				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
			}
			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
			if (deferred_flush[i].freelist[j])
				dma_free_pagelist(deferred_flush[i].freelist[j]);
		}
		deferred_flush[i].next = 0;
	}

	list_size = 0;
}
3592
3593 static void flush_unmaps_timeout(unsigned long data)
3594 {
3595         unsigned long flags;
3596
3597         spin_lock_irqsave(&async_umap_flush_lock, flags);
3598         flush_unmaps();
3599         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
3600 }
3601
/*
 * Queue a lazy unmap: keep the iova and the freed page-table pages on
 * the owning IOMMU's deferred_flush list, to be released in bulk by
 * flush_unmaps() - either from the 10ms timer or synchronously when
 * the queue hits HIGH_WATER_MARK.
 */
static void add_unmap(struct dmar_domain *dom, struct iova *iova, struct page *freelist)
{
	unsigned long flags;
	int next, iommu_id;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	/* Queue full: drain now so the new entry always fits */
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu = domain_get_iommu(dom);
	iommu_id = iommu->seq_id;

	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].freelist[next] = freelist;
	deferred_flush[iommu_id].next++;

	/* Arm the flush timer so queued entries are freed even when idle */
	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
3628
/*
 * Tear down the mapping covering @dev_addr: clear the page tables,
 * then either flush and free immediately (intel_iommu_strict) or
 * defer the flush/free through add_unmap().
 */
static void intel_unmap(struct device *dev, dma_addr_t dev_addr)
{
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;
	struct page *freelist;

	/* Identity-mapped devices never went through __intel_map_single() */
	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(dev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	/* The iova lookup recovers the full extent of the mapping */
	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
		      (unsigned long long)dev_addr))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 dev_name(dev), start_pfn, last_pfn);

	/* freelist collects page-table pages to free after the flush */
	freelist = domain_unmap(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain, start_pfn,
				      last_pfn - start_pfn + 1, !freelist, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
		dma_free_pagelist(freelist);
	} else {
		add_unmap(domain, iova, freelist);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
3672
/*
 * dma_map_ops::unmap_page callback.  intel_unmap() recovers the mapped
 * extent from the iova tree, so size/dir/attrs are intentionally unused.
 */
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     struct dma_attrs *attrs)
{
	intel_unmap(dev, dev_addr);
}
3679
/*
 * dma_map_ops::alloc callback: allocate @size bytes of zeroed memory
 * (from CMA when blocking is allowed, else the page allocator) and map
 * it DMA_BIDIRECTIONAL.  Returns the kernel virtual address and fills
 * *@dma_handle, or returns NULL on failure.
 */
static void *intel_alloc_coherent(struct device *dev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags,
				  struct dma_attrs *attrs)
{
	struct page *page = NULL;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	if (!iommu_no_mapping(dev))
		/* IOMMU will remap: no need for a low-zone allocation */
		flags &= ~(GFP_DMA | GFP_DMA32);
	else if (dev->coherent_dma_mask < dma_get_required_mask(dev)) {
		/* No translation: the allocation zone must satisfy the mask */
		if (dev->coherent_dma_mask < DMA_BIT_MASK(32))
			flags |= GFP_DMA;
		else
			flags |= GFP_DMA32;
	}

	if (gfpflags_allow_blocking(flags)) {
		unsigned int count = size >> PAGE_SHIFT;

		page = dma_alloc_from_contiguous(dev, count, order);
		/* CMA pages above the mask are unusable without translation */
		if (page && iommu_no_mapping(dev) &&
		    page_to_phys(page) + size > dev->coherent_dma_mask) {
			dma_release_from_contiguous(dev, page, count);
			page = NULL;
		}
	}

	if (!page)
		page = alloc_pages(flags, order);
	if (!page)
		return NULL;
	memset(page_address(page), 0, size);

	*dma_handle = __intel_map_single(dev, page_to_phys(page), size,
					 DMA_BIDIRECTIONAL,
					 dev->coherent_dma_mask);
	if (*dma_handle)
		return page_address(page);
	/* Mapping failed: return pages to CMA or the page allocator */
	if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
		__free_pages(page, order);

	return NULL;
}
3726
3727 static void intel_free_coherent(struct device *dev, size_t size, void *vaddr,
3728                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3729 {
3730         int order;
3731         struct page *page = virt_to_page(vaddr);
3732
3733         size = PAGE_ALIGN(size);
3734         order = get_order(size);
3735
3736         intel_unmap(dev, dma_handle);
3737         if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT))
3738                 __free_pages(page, order);
3739 }
3740
/*
 * dma_map_ops::unmap_sg callback.  intel_map_sg() placed the whole
 * scatterlist in a single iova allocation, so unmapping the first
 * segment's DMA address releases the entire list.
 */
static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	intel_unmap(dev, sglist[0].dma_address);
}
3747
3748 static int intel_nontranslate_map_sg(struct device *hddev,
3749         struct scatterlist *sglist, int nelems, int dir)
3750 {
3751         int i;
3752         struct scatterlist *sg;
3753
3754         for_each_sg(sglist, sg, nelems, i) {
3755                 BUG_ON(!sg_page(sg));
3756                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3757                 sg->dma_length = sg->length;
3758         }
3759         return nelems;
3760 }
3761
/*
 * dma_map_ops::map_sg callback: reserve one iova region large enough
 * for all segments, map the whole scatterlist into it, and let
 * domain_sg_mapping() fill in each entry's DMA address.  Returns the
 * number of mapped entries, or 0 on failure.
 */
static int intel_map_sg(struct device *dev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, struct dma_attrs *attrs)
{
	int i;
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (iommu_no_mapping(dev))
		return intel_nontranslate_map_sg(dev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(dev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	/* Total size in VT-d pages across all segments */
	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova = intel_alloc_iova(dev, domain, dma_to_mm_pfn(size),
				*dev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		/* Roll back any partially built page tables */
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1);
		__free_iova(&domain->iovad, iova);
		return 0;
	}

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain, start_vpfn, size, 0, 1);
	else
		iommu_flush_write_buffer(iommu);

	return nelems;
}
3823
3824 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3825 {
3826         return !dma_addr;
3827 }
3828
/* DMA API operations installed when Intel IOMMU translation is in use */
struct dma_map_ops intel_dma_ops = {
	.alloc = intel_alloc_coherent,
	.free = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.mapping_error = intel_mapping_error,
};
3838
3839 static inline int iommu_domain_cache_init(void)
3840 {
3841         int ret = 0;
3842
3843         iommu_domain_cache = kmem_cache_create("iommu_domain",
3844                                          sizeof(struct dmar_domain),
3845                                          0,
3846                                          SLAB_HWCACHE_ALIGN,
3847
3848                                          NULL);
3849         if (!iommu_domain_cache) {
3850                 pr_err("Couldn't create iommu_domain cache\n");
3851                 ret = -ENOMEM;
3852         }
3853
3854         return ret;
3855 }
3856
3857 static inline int iommu_devinfo_cache_init(void)
3858 {
3859         int ret = 0;
3860
3861         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3862                                          sizeof(struct device_domain_info),
3863                                          0,
3864                                          SLAB_HWCACHE_ALIGN,
3865                                          NULL);
3866         if (!iommu_devinfo_cache) {
3867                 pr_err("Couldn't create devinfo cache\n");
3868                 ret = -ENOMEM;
3869         }
3870
3871         return ret;
3872 }
3873
3874 static int __init iommu_init_mempool(void)
3875 {
3876         int ret;
3877         ret = iova_cache_get();
3878         if (ret)
3879                 return ret;
3880
3881         ret = iommu_domain_cache_init();
3882         if (ret)
3883                 goto domain_error;
3884
3885         ret = iommu_devinfo_cache_init();
3886         if (!ret)
3887                 return ret;
3888
3889         kmem_cache_destroy(iommu_domain_cache);
3890 domain_error:
3891         iova_cache_put();
3892
3893         return -ENOMEM;
3894 }
3895
/* Undo iommu_init_mempool(): destroy both slabs and drop the iova cache. */
static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	iova_cache_put();
}
3902
/*
 * Quirk for the Intel QuickData (IOAT) device on SandyBridge: check
 * that the DRHD unit the BIOS assigned to the device is really the
 * expected local one (at vtbar + 0xa000).  If not, the DMAR tables are
 * wrong and the device is marked to bypass the IOMMU entirely.
 */
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
			    TAINT_FIRMWARE_WORKAROUND,
			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3930
/*
 * Decide, before translation is enabled, which DRHD units can be
 * ignored: units whose device scope is empty, and (when dmar_map_gfx
 * is clear) units covering only graphics devices.  Devices under an
 * ignored unit get DUMMY_DEVICE_DOMAIN_INFO so they bypass the IOMMU.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;
	struct device *dev;
	int i;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				break;
			/* ignore DMAR unit if no devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	for_each_active_drhd_unit(drhd) {
		if (drhd->include_all)
			continue;

		/* Look for any non-graphics device behind this unit */
		for_each_active_dev_scope(drhd->devices,
					  drhd->devices_cnt, i, dev)
			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
				break;
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		if (dmar_map_gfx) {
			intel_iommu_gfx_mapped = 1;
		} else {
			drhd->ignored = 1;
			for_each_active_dev_scope(drhd->devices,
						  drhd->devices_cnt, i, dev)
				dev->archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
3971
3972 #ifdef CONFIG_SUSPEND
/*
 * Re-program every IOMMU after resume: re-enable queued invalidation,
 * reload root entries, globally invalidate caches and switch
 * translation back on.  Always returns 0.
 */
static int init_iommu_hw(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;

	for_each_active_iommu(iommu, drhd)
		if (iommu->qi)
			dmar_reenable_qi(iommu);

	for_each_iommu(iommu, drhd) {
		if (drhd->ignored) {
			/*
			 * we always have to disable PMRs or DMA may fail on
			 * this device
			 */
			if (force_on)
				iommu_disable_protect_mem_regions(iommu);
			continue;
		}

		iommu_flush_write_buffer(iommu);

		iommu_set_root_entry(iommu);

		/* Drop any stale translations before re-enabling */
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
		iommu_enable_translation(iommu);
		iommu_disable_protect_mem_regions(iommu);
	}

	return 0;
}
4006
/* Globally invalidate the context cache and IOTLB of every active IOMMU. */
static void iommu_flush_all(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	for_each_active_iommu(iommu, drhd) {
		iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
	}
}
4019
/*
 * syscore suspend hook: flush all IOMMU caches, disable translation,
 * and save each unit's fault-event registers so iommu_resume() can
 * restore them.  Returns 0, or -ENOMEM if the save buffers cannot be
 * allocated.
 */
static int iommu_suspend(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	for_each_active_iommu(iommu, drhd) {
		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
						 GFP_ATOMIC);
		if (!iommu->iommu_state)
			goto nomem;
	}

	iommu_flush_all();

	for_each_active_iommu(iommu, drhd) {
		iommu_disable_translation(iommu);

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		/* Save the fault-event control/data/address registers */
		iommu->iommu_state[SR_DMAR_FECTL_REG] =
			readl(iommu->reg + DMAR_FECTL_REG);
		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
			readl(iommu->reg + DMAR_FEDATA_REG);
		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
			readl(iommu->reg + DMAR_FEADDR_REG);
		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
			readl(iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}
	return 0;

nomem:
	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);

	return -ENOMEM;
}
4059
/*
 * syscore resume hook: re-initialise the IOMMU hardware, restore the
 * fault-event registers saved by iommu_suspend(), then free the saved
 * state buffers.
 */
static void iommu_resume(void)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	unsigned long flag;

	if (init_iommu_hw()) {
		if (force_on)
			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
		else
			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
		return;
	}

	for_each_active_iommu(iommu, drhd) {

		raw_spin_lock_irqsave(&iommu->register_lock, flag);

		/* Restore the fault-event control/data/address registers */
		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
			iommu->reg + DMAR_FECTL_REG);
		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
			iommu->reg + DMAR_FEDATA_REG);
		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
			iommu->reg + DMAR_FEADDR_REG);
		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
			iommu->reg + DMAR_FEUADDR_REG);

		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	}

	for_each_active_iommu(iommu, drhd)
		kfree(iommu->iommu_state);
}
4093
/* Suspend/resume callbacks registered with the syscore framework */
static struct syscore_ops iommu_syscore_ops = {
	.resume		= iommu_resume,
	.suspend	= iommu_suspend,
};
4098
/* Hook IOMMU register save/restore into the system suspend/resume path */
static void __init init_iommu_pm_ops(void)
{
	register_syscore_ops(&iommu_syscore_ops);
}
4103
#else
static inline void init_iommu_pm_ops(void) {}
#endif	/* CONFIG_SUSPEND */
4107
4108
4109 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
4110 {
4111         struct acpi_dmar_reserved_memory *rmrr;
4112         struct dmar_rmrr_unit *rmrru;
4113
4114         rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
4115         if (!rmrru)
4116                 return -ENOMEM;
4117
4118         rmrru->hdr = header;
4119         rmrr = (struct acpi_dmar_reserved_memory *)header;
4120         rmrru->base_address = rmrr->base_address;
4121         rmrru->end_address = rmrr->end_address;
4122         rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
4123                                 ((void *)rmrr) + rmrr->header.length,
4124                                 &rmrru->devices_cnt);
4125         if (rmrru->devices_cnt && rmrru->devices == NULL) {
4126                 kfree(rmrru);
4127                 return -ENOMEM;
4128         }
4129
4130         list_add(&rmrru->list, &dmar_rmrr_units);
4131
4132         return 0;
4133 }
4134
4135 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
4136 {
4137         struct dmar_atsr_unit *atsru;
4138         struct acpi_dmar_atsr *tmp;
4139
4140         list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
4141                 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
4142                 if (atsr->segment != tmp->segment)
4143                         continue;
4144                 if (atsr->header.length != tmp->header.length)
4145                         continue;
4146                 if (memcmp(atsr, tmp, atsr->header.length) == 0)
4147                         return atsru;
4148         }
4149
4150         return NULL;
4151 }
4152
/*
 * Parse one ACPI ATSR (Address Translation Services Reporting)
 * structure and register it on dmar_atsr_units, unless an identical
 * unit is already present.  Returns 0 on success or -ENOMEM.
 */
int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	/* After boot, new tables only matter if the IOMMU is enabled */
	if (system_state != SYSTEM_BOOTING && !intel_iommu_enabled)
		return 0;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (atsru)
		return 0;

	/* Allocate the unit and a trailing copy of the table in one go */
	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
	if (!atsru)
		return -ENOMEM;

	/*
	 * If memory is allocated from slab by ACPI _DSM method, we need to
	 * copy the memory content because the memory buffer will be freed
	 * on return.
	 */
	atsru->hdr = (void *)(atsru + 1);
	memcpy(atsru->hdr, hdr, hdr->length);
	atsru->include_all = atsr->flags & 0x1;
	if (!atsru->include_all) {
		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
				(void *)atsr + atsr->header.length,
				&atsru->devices_cnt);
		if (atsru->devices_cnt && atsru->devices == NULL) {
			kfree(atsru);
			return -ENOMEM;
		}
	}

	list_add_rcu(&atsru->list, &dmar_atsr_units);

	return 0;
}
4192
/* Free an ATSR unit: its device-scope array first, then the unit itself. */
static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
{
	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
	kfree(atsru);
}
4198
4199 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
4200 {
4201         struct acpi_dmar_atsr *atsr;
4202         struct dmar_atsr_unit *atsru;
4203
4204         atsr = container_of(hdr, struct acpi_dmar_atsr, header);
4205         atsru = dmar_find_atsr(atsr);
4206         if (atsru) {
4207                 list_del_rcu(&atsru->list);
4208                 synchronize_rcu();
4209                 intel_iommu_free_atsr(atsru);
4210         }
4211
4212         return 0;
4213 }
4214
/*
 * Check whether the ATSR described by @hdr may be removed.  Returns
 * -EBUSY if any device in its scope is still present, 0 otherwise
 * (including when no matching unit is registered).
 */
int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
{
	int i;
	struct device *dev;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
	atsru = dmar_find_atsr(atsr);
	if (!atsru)
		return 0;

	/* A single active device in the scope blocks removal. */
	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
					  i, dev)
			return -EBUSY;
	}

	return 0;
}
4235
/*
 * Bring a hot-added DMAR unit's IOMMU online: verify its capabilities
 * against what the running system already relies on, set up domain
 * bookkeeping and the root table, then enable translation.
 * Returns 0 on success or a negative errno.
 */
static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
{
	int sp, ret = 0;
	struct intel_iommu *iommu = dmaru->iommu;

	/* Already initialized by an earlier hotplug event. */
	if (g_iommus[iommu->seq_id])
		return 0;

	/* New unit must match features the system is already using. */
	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
		pr_warn("%s: Doesn't support hardware pass through.\n",
			iommu->name);
		return -ENXIO;
	}
	if (!ecap_sc_support(iommu->ecap) &&
	    domain_update_iommu_snooping(iommu)) {
		pr_warn("%s: Doesn't support snooping.\n",
			iommu->name);
		return -ENXIO;
	}
	sp = domain_update_iommu_superpage(iommu) - 1;
	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
		pr_warn("%s: Doesn't support large page.\n",
			iommu->name);
		return -ENXIO;
	}

	/*
	 * Disable translation if already enabled prior to OS handover.
	 */
	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	g_iommus[iommu->seq_id] = iommu;
	ret = iommu_init_domains(iommu);
	if (ret == 0)
		ret = iommu_alloc_root_entry(iommu);
	if (ret)
		goto out;

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_enabled(iommu))
		intel_svm_alloc_pasid_tables(iommu);
#endif

	if (dmaru->ignored) {
		/*
		 * we always have to disable PMRs or DMA may fail on this device
		 */
		if (force_on)
			iommu_disable_protect_mem_regions(iommu);
		return 0;
	}

	intel_iommu_init_qi(iommu);
	iommu_flush_write_buffer(iommu);

#ifdef CONFIG_INTEL_IOMMU_SVM
	if (pasid_enabled(iommu) && ecap_prs(iommu->ecap)) {
		ret = intel_svm_enable_prq(iommu);
		if (ret)
			goto disable_iommu;
	}
#endif
	ret = dmar_set_interrupt(iommu);
	if (ret)
		goto disable_iommu;

	/* Install the root table and globally flush before enabling. */
	iommu_set_root_entry(iommu);
	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
	iommu_enable_translation(iommu);

	iommu_disable_protect_mem_regions(iommu);
	return 0;

disable_iommu:
	disable_dmar_iommu(iommu);
out:
	free_dmar_iommu(iommu);
	return ret;
}
4317
4318 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
4319 {
4320         int ret = 0;
4321         struct intel_iommu *iommu = dmaru->iommu;
4322
4323         if (!intel_iommu_enabled)
4324                 return 0;
4325         if (iommu == NULL)
4326                 return -EINVAL;
4327
4328         if (insert) {
4329                 ret = intel_iommu_add(dmaru);
4330         } else {
4331                 disable_dmar_iommu(iommu);
4332                 free_dmar_iommu(iommu);
4333         }
4334
4335         return ret;
4336 }
4337
4338 static void intel_iommu_free_dmars(void)
4339 {
4340         struct dmar_rmrr_unit *rmrru, *rmrr_n;
4341         struct dmar_atsr_unit *atsru, *atsr_n;
4342
4343         list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
4344                 list_del(&rmrru->list);
4345                 dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
4346                 kfree(rmrru);
4347         }
4348
4349         list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
4350                 list_del(&atsru->list);
4351                 intel_iommu_free_atsr(atsru);
4352         }
4353 }
4354
/*
 * Decide whether @dev may use ATS.  Walk up the bus hierarchy to the
 * PCIe root port and check whether that port appears in an ATSR for
 * the device's segment (or the segment has an INCLUDE_ALL ATSR).
 * Returns 1 if ATS is allowed, 0 otherwise.
 */
int dmar_find_matched_atsr_unit(struct pci_dev *dev)
{
	int i, ret = 1;
	struct pci_bus *bus;
	struct pci_dev *bridge = NULL;
	struct device *tmp;
	struct acpi_dmar_atsr *atsr;
	struct dmar_atsr_unit *atsru;

	/* SR-IOV virtual functions inherit their physical function's bus. */
	dev = pci_physfn(dev);
	for (bus = dev->bus; bus; bus = bus->parent) {
		bridge = bus->self;
		/* If it's an integrated device, allow ATS */
		if (!bridge)
			return 1;
		/* Connected via non-PCIe: no ATS */
		if (!pci_is_pcie(bridge) ||
		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
			return 0;
		/* If we found the root port, look it up in the ATSR */
		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
			break;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (atsr->segment != pci_domain_nr(dev->bus))
			continue;

		/* ret is still 1 here; a scope match means ATS is allowed. */
		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
			if (tmp == &bridge->dev)
				goto out;

		if (atsru->include_all)
			goto out;
	}
	ret = 0;
out:
	rcu_read_unlock();

	return ret;
}
4398
/*
 * PCI bus notifier helper: on device add/remove, update the cached
 * device-scope arrays of every RMRR and every non-INCLUDE_ALL ATSR so
 * they track the devices actually present.  Returns 0 or a negative
 * errno from scope insertion.
 */
int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
{
	int ret = 0;
	struct dmar_rmrr_unit *rmrru;
	struct dmar_atsr_unit *atsru;
	struct acpi_dmar_atsr *atsr;
	struct acpi_dmar_reserved_memory *rmrr;

	if (!intel_iommu_enabled && system_state != SYSTEM_BOOTING)
		return 0;

	/* RMRRs: a device may match several, so keep scanning the list. */
	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
		rmrr = container_of(rmrru->hdr,
				    struct acpi_dmar_reserved_memory, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
				((void *)rmrr) + rmrr->header.length,
				rmrr->segment, rmrru->devices,
				rmrru->devices_cnt);
			if(ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			dmar_remove_dev_scope(info, rmrr->segment,
				rmrru->devices, rmrru->devices_cnt);
		}
	}

	/* ATSRs: a device belongs to at most one, so stop at first match. */
	list_for_each_entry(atsru, &dmar_atsr_units, list) {
		if (atsru->include_all)
			continue;

		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
					(void *)atsr + atsr->header.length,
					atsr->segment, atsru->devices,
					atsru->devices_cnt);
			if (ret > 0)
				break;
			else if(ret < 0)
				return ret;
		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
			if (dmar_remove_dev_scope(info, atsr->segment,
					atsru->devices, atsru->devices_cnt))
				break;
		}
	}

	return 0;
}
4449
4450 /*
4451  * Here we only respond to action of unbound device from driver.
4452  *
4453  * Added device is not attached to its DMAR domain here yet. That will happen
4454  * when mapping the device to iova.
4455  */
4456 static int device_notifier(struct notifier_block *nb,
4457                                   unsigned long action, void *data)
4458 {
4459         struct device *dev = data;
4460         struct dmar_domain *domain;
4461
4462         if (iommu_dummy(dev))
4463                 return 0;
4464
4465         if (action != BUS_NOTIFY_REMOVED_DEVICE)
4466                 return 0;
4467
4468         domain = find_domain(dev);
4469         if (!domain)
4470                 return 0;
4471
4472         dmar_remove_one_dev_info(domain, dev);
4473         if (!domain_type_is_vm_or_si(domain) && list_empty(&domain->devices))
4474                 domain_exit(domain);
4475
4476         return 0;
4477 }
4478
/* PCI bus notifier: tears down per-device DMAR state on removal. */
static struct notifier_block device_nb = {
	.notifier_call = device_notifier,
};
4482
/*
 * Memory hotplug notifier: keep the static identity (si) domain's
 * mappings in sync with the memory actually online.  New memory is
 * identity-mapped before it goes online; removed memory has its
 * mappings and IOVA reservations torn down and its IOTLB entries
 * flushed on every active IOMMU.
 */
static int intel_iommu_memory_notifier(struct notifier_block *nb,
				       unsigned long val, void *v)
{
	struct memory_notify *mhp = v;
	unsigned long long start, end;
	unsigned long start_vpfn, last_vpfn;

	switch (val) {
	case MEM_GOING_ONLINE:
		/* Identity-map the new range in the si domain up front. */
		start = mhp->start_pfn << PAGE_SHIFT;
		end = ((mhp->start_pfn + mhp->nr_pages) << PAGE_SHIFT) - 1;
		if (iommu_domain_identity_map(si_domain, start, end)) {
			pr_warn("Failed to build identity map for [%llx-%llx]\n",
				start, end);
			return NOTIFY_BAD;
		}
		break;

	case MEM_OFFLINE:
	case MEM_CANCEL_ONLINE:
		start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
		last_vpfn = mm_to_dma_pfn(mhp->start_pfn + mhp->nr_pages - 1);
		/* Walk IOVA by IOVA; each may cover only part of the range. */
		while (start_vpfn <= last_vpfn) {
			struct iova *iova;
			struct dmar_drhd_unit *drhd;
			struct intel_iommu *iommu;
			struct page *freelist;

			iova = find_iova(&si_domain->iovad, start_vpfn);
			if (iova == NULL) {
				pr_debug("Failed get IOVA for PFN %lx\n",
					 start_vpfn);
				break;
			}

			/* Trim the IOVA to the portion inside [start, last]. */
			iova = split_and_remove_iova(&si_domain->iovad, iova,
						     start_vpfn, last_vpfn);
			if (iova == NULL) {
				pr_warn("Failed to split IOVA PFN [%lx-%lx]\n",
					start_vpfn, last_vpfn);
				return NOTIFY_BAD;
			}

			/* Unmap; page tables are freed only after the flush. */
			freelist = domain_unmap(si_domain, iova->pfn_lo,
					       iova->pfn_hi);

			rcu_read_lock();
			for_each_active_iommu(iommu, drhd)
				iommu_flush_iotlb_psi(iommu, si_domain,
					iova->pfn_lo, iova_size(iova),
					!freelist, 0);
			rcu_read_unlock();
			dma_free_pagelist(freelist);

			start_vpfn = iova->pfn_hi + 1;
			free_iova_mem(iova);
		}
		break;
	}

	return NOTIFY_OK;
}
4545
/* Memory hotplug notifier block; registered only for the si domain. */
static struct notifier_block intel_iommu_memory_nb = {
	.notifier_call = intel_iommu_memory_notifier,
	.priority = 0
};
4550
4551
4552 static ssize_t intel_iommu_show_version(struct device *dev,
4553                                         struct device_attribute *attr,
4554                                         char *buf)
4555 {
4556         struct intel_iommu *iommu = dev_get_drvdata(dev);
4557         u32 ver = readl(iommu->reg + DMAR_VER_REG);
4558         return sprintf(buf, "%d:%d\n",
4559                        DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
4560 }
4561 static DEVICE_ATTR(version, S_IRUGO, intel_iommu_show_version, NULL);
4562
4563 static ssize_t intel_iommu_show_address(struct device *dev,
4564                                         struct device_attribute *attr,
4565                                         char *buf)
4566 {
4567         struct intel_iommu *iommu = dev_get_drvdata(dev);
4568         return sprintf(buf, "%llx\n", iommu->reg_phys);
4569 }
4570 static DEVICE_ATTR(address, S_IRUGO, intel_iommu_show_address, NULL);
4571
4572 static ssize_t intel_iommu_show_cap(struct device *dev,
4573                                     struct device_attribute *attr,
4574                                     char *buf)
4575 {
4576         struct intel_iommu *iommu = dev_get_drvdata(dev);
4577         return sprintf(buf, "%llx\n", iommu->cap);
4578 }
4579 static DEVICE_ATTR(cap, S_IRUGO, intel_iommu_show_cap, NULL);
4580
4581 static ssize_t intel_iommu_show_ecap(struct device *dev,
4582                                     struct device_attribute *attr,
4583                                     char *buf)
4584 {
4585         struct intel_iommu *iommu = dev_get_drvdata(dev);
4586         return sprintf(buf, "%llx\n", iommu->ecap);
4587 }
4588 static DEVICE_ATTR(ecap, S_IRUGO, intel_iommu_show_ecap, NULL);
4589
4590 static ssize_t intel_iommu_show_ndoms(struct device *dev,
4591                                       struct device_attribute *attr,
4592                                       char *buf)
4593 {
4594         struct intel_iommu *iommu = dev_get_drvdata(dev);
4595         return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
4596 }
4597 static DEVICE_ATTR(domains_supported, S_IRUGO, intel_iommu_show_ndoms, NULL);
4598
4599 static ssize_t intel_iommu_show_ndoms_used(struct device *dev,
4600                                            struct device_attribute *attr,
4601                                            char *buf)
4602 {
4603         struct intel_iommu *iommu = dev_get_drvdata(dev);
4604         return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
4605                                                   cap_ndoms(iommu->cap)));
4606 }
4607 static DEVICE_ATTR(domains_used, S_IRUGO, intel_iommu_show_ndoms_used, NULL);
4608
/* sysfs attributes exposed under each IOMMU's "intel-iommu" group. */
static struct attribute *intel_iommu_attrs[] = {
	&dev_attr_version.attr,
	&dev_attr_address.attr,
	&dev_attr_cap.attr,
	&dev_attr_ecap.attr,
	&dev_attr_domains_supported.attr,
	&dev_attr_domains_used.attr,
	NULL,
};
4618
/* Attribute group shown as the "intel-iommu" sysfs directory. */
static struct attribute_group intel_iommu_group = {
	.name = "intel-iommu",
	.attrs = intel_iommu_attrs,
};
4623
/* NULL-terminated group list passed to iommu_device_create(). */
const struct attribute_group *intel_iommu_groups[] = {
	&intel_iommu_group,
	NULL,
};
4628
/*
 * Boot-time entry point for VT-d: parse the DMAR table, initialize
 * every DMAR unit, install the DMA ops and register notifiers.
 * Returns 0 on success; on failure everything is unwound (and the
 * machine panics instead if a tboot launch requires the IOMMU).
 */
int __init intel_iommu_init(void)
{
	int ret = -ENODEV;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/* VT-d is required for a TXT/tboot launch, so enforce that */
	force_on = tboot_force_iommu();

	if (iommu_init_mempool()) {
		if (force_on)
			panic("tboot: Failed to initialize iommu memory\n");
		return -ENOMEM;
	}

	/* Hold the global lock across table parsing and DMAR init. */
	down_write(&dmar_global_lock);
	if (dmar_table_init()) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR table\n");
		goto out_free_dmar;
	}

	if (dmar_dev_scope_init() < 0) {
		if (force_on)
			panic("tboot: Failed to initialize DMAR device scope\n");
		goto out_free_dmar;
	}

	if (no_iommu || dmar_disabled)
		goto out_free_dmar;

	if (list_empty(&dmar_rmrr_units))
		pr_info("No RMRR found\n");

	if (list_empty(&dmar_atsr_units))
		pr_info("No ATSR found\n");

	if (dmar_init_reserved_ranges()) {
		if (force_on)
			panic("tboot: Failed to reserve iommu ranges\n");
		goto out_free_reserved_range;
	}

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		if (force_on)
			panic("tboot: Failed to initialize DMARs\n");
		pr_err("Initialization failed\n");
		goto out_free_reserved_range;
	}
	up_write(&dmar_global_lock);
	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");

	init_timer(&unmap_timer);
#ifdef CONFIG_SWIOTLB
	/* VT-d takes over DMA translation; bounce buffering not needed. */
	swiotlb = 0;
#endif
	dma_ops = &intel_dma_ops;

	init_iommu_pm_ops();

	/* Expose each active IOMMU through the sysfs iommu class. */
	for_each_active_iommu(iommu, drhd)
		iommu->iommu_dev = iommu_device_create(NULL, iommu,
						       intel_iommu_groups,
						       "%s", iommu->name);

	bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
	bus_register_notifier(&pci_bus_type, &device_nb);
	if (si_domain && !hw_pass_through)
		register_memory_notifier(&intel_iommu_memory_nb);

	intel_iommu_enabled = 1;

	return 0;

out_free_reserved_range:
	put_iova_domain(&reserved_iova_list);
out_free_dmar:
	intel_iommu_free_dmars();
	up_write(&dmar_global_lock);
	iommu_exit_mempool();
	return ret;
}
4714
/* pci_for_each_dma_alias() callback: clear the context entry for one alias. */
static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
{
	struct intel_iommu *iommu = opaque;

	domain_context_clear_one(iommu, PCI_BUS_NUM(alias), alias & 0xff);
	return 0;
}
4722
4723 /*
4724  * NB - intel-iommu lacks any sort of reference counting for the users of
4725  * dependent devices.  If multiple endpoints have intersecting dependent
4726  * devices, unbinding the driver from any one of them will possibly leave
4727  * the others unable to operate.
4728  */
4729 static void domain_context_clear(struct intel_iommu *iommu, struct device *dev)
4730 {
4731         if (!iommu || !dev || !dev_is_pci(dev))
4732                 return;
4733
4734         pci_for_each_dma_alias(to_pci_dev(dev), &domain_context_clear_one_cb, iommu);
4735 }
4736
/*
 * Detach the device described by @info from its domain: tear down its
 * context entries and dev-IOTLB, unlink the bookkeeping, drop the
 * domain's reference on the IOMMU, and free @info.
 * Caller must hold device_domain_lock.
 */
static void __dmar_remove_one_dev_info(struct device_domain_info *info)
{
	struct intel_iommu *iommu;
	unsigned long flags;

	assert_spin_locked(&device_domain_lock);

	if (WARN_ON(!info))
		return;

	iommu = info->iommu;

	if (info->dev) {
		/* Disable ATS before tearing down the context mapping. */
		iommu_disable_dev_iotlb(info);
		domain_context_clear(iommu, info->dev);
	}

	unlink_domain_info(info);

	spin_lock_irqsave(&iommu->lock, flags);
	domain_detach_iommu(info->domain, iommu);
	spin_unlock_irqrestore(&iommu->lock, flags);

	free_devinfo_mem(info);
}
4762
/*
 * Locked wrapper around __dmar_remove_one_dev_info(): look up the
 * device's info under device_domain_lock and detach it.
 */
static void dmar_remove_one_dev_info(struct dmar_domain *domain,
				     struct device *dev)
{
	struct device_domain_info *info;
	unsigned long flags;

	spin_lock_irqsave(&device_domain_lock, flags);
	info = dev->archdata.iommu;
	__dmar_remove_one_dev_info(info);
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
4774
4775 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4776 {
4777         int adjust_width;
4778
4779         init_iova_domain(&domain->iovad, VTD_PAGE_SIZE, IOVA_START_PFN,
4780                         DMA_32BIT_PFN);
4781         domain_reserve_special_ranges(domain);
4782
4783         /* calculate AGAW */
4784         domain->gaw = guest_width;
4785         adjust_width = guestwidth_to_adjustwidth(guest_width);
4786         domain->agaw = width_to_agaw(adjust_width);
4787
4788         domain->iommu_coherency = 0;
4789         domain->iommu_snooping = 0;
4790         domain->iommu_superpage = 0;
4791         domain->max_addr = 0;
4792
4793         /* always allocate the top pgd */
4794         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
4795         if (!domain->pgd)
4796                 return -ENOMEM;
4797         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4798         return 0;
4799 }
4800
/*
 * iommu_ops hook: allocate an unmanaged (VM) domain with the default
 * address width and report its aperture geometry to the IOMMU core.
 * Returns NULL on unsupported type or any allocation failure.
 */
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;
	struct iommu_domain *domain;

	if (type != IOMMU_DOMAIN_UNMANAGED)
		return NULL;

	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
	if (!dmar_domain) {
		pr_err("Can't allocate dmar_domain\n");
		return NULL;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		pr_err("Domain initialization failed\n");
		domain_exit(dmar_domain);
		return NULL;
	}
	domain_update_iommu_cap(dmar_domain);

	domain = &dmar_domain->domain;
	domain->geometry.aperture_start = 0;
	domain->geometry.aperture_end   = __DOMAIN_MAX_ADDR(dmar_domain->gaw);
	domain->geometry.force_aperture = true;

	return domain;
}
4828
/* iommu_ops hook: tear down a domain allocated by intel_iommu_domain_alloc(). */
static void intel_iommu_domain_free(struct iommu_domain *domain)
{
	domain_exit(to_dmar_domain(domain));
}
4833
/*
 * iommu_ops hook: attach @dev to @domain.  Detaches the device from
 * any previous domain, validates that the device's IOMMU can address
 * everything already mapped in the domain, and trims spare page-table
 * levels before installing the device.  Returns 0 or a negative errno.
 */
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	struct intel_iommu *iommu;
	int addr_width;
	u8 bus, devfn;

	/* RMRR-locked devices must keep their firmware-required mappings. */
	if (device_is_rmrr_locked(dev)) {
		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
		return -EPERM;
	}

	/* normally dev is not mapped */
	if (unlikely(domain_context_mapped(dev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(dev);
		if (old_domain) {
			rcu_read_lock();
			dmar_remove_one_dev_info(old_domain, dev);
			rcu_read_unlock();

			/* Reclaim the old domain if it is now empty. */
			if (!domain_type_is_vm_or_si(old_domain) &&
			     list_empty(&old_domain->devices))
				domain_exit(old_domain);
		}
	}

	iommu = device_to_iommu(dev, &bus, &devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		pr_err("%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			/* Promote the first (only) lower-level table to root. */
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, dev);
}
4897
/* iommu_ops hook: detach @dev from @domain. */
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	dmar_remove_one_dev_info(to_dmar_domain(domain), dev);
}
4903
/*
 * iommu_ops hook: map [iova, iova+size) to physical @hpa with the
 * requested protection.  Grows the domain's max_addr bookkeeping and
 * rejects mappings beyond the domain's address width.
 * Returns 0 or a negative errno.
 */
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	/* SNP only honored when every IOMMU in the domain supports it. */
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			pr_err("%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}
4941
4942 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4943                                 unsigned long iova, size_t size)
4944 {
4945         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4946         struct page *freelist = NULL;
4947         struct intel_iommu *iommu;
4948         unsigned long start_pfn, last_pfn;
4949         unsigned int npages;
4950         int iommu_id, level = 0;
4951
4952         /* Cope with horrid API which requires us to unmap more than the
4953            size argument if it happens to be a large-page mapping. */
4954         BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4955
4956         if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4957                 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4958
4959         start_pfn = iova >> VTD_PAGE_SHIFT;
4960         last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4961
4962         freelist = domain_unmap(dmar_domain, start_pfn, last_pfn);
4963
4964         npages = last_pfn - start_pfn + 1;
4965
4966         for_each_domain_iommu(iommu_id, dmar_domain) {
4967                 iommu = g_iommus[iommu_id];
4968
4969                 iommu_flush_iotlb_psi(g_iommus[iommu_id], dmar_domain,
4970                                       start_pfn, npages, !freelist, 0);
4971         }
4972
4973         dma_free_pagelist(freelist);
4974
4975         if (dmar_domain->max_addr == iova + size)
4976                 dmar_domain->max_addr = iova;
4977
4978         return size;
4979 }
4980
4981 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4982                                             dma_addr_t iova)
4983 {
4984         struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4985         struct dma_pte *pte;
4986         int level = 0;
4987         u64 phys = 0;
4988
4989         pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4990         if (pte)
4991                 phys = dma_pte_addr(pte);
4992
4993         return phys;
4994 }
4995
4996 static bool intel_iommu_capable(enum iommu_cap cap)
4997 {
4998         if (cap == IOMMU_CAP_CACHE_COHERENCY)
4999                 return domain_update_iommu_snooping(NULL) == 1;
5000         if (cap == IOMMU_CAP_INTR_REMAP)
5001                 return irq_remapping_enabled == 1;
5002
5003         return false;
5004 }
5005
5006 static int intel_iommu_add_device(struct device *dev)
5007 {
5008         struct intel_iommu *iommu;
5009         struct iommu_group *group;
5010         u8 bus, devfn;
5011
5012         iommu = device_to_iommu(dev, &bus, &devfn);
5013         if (!iommu)
5014                 return -ENODEV;
5015
5016         iommu_device_link(iommu->iommu_dev, dev);
5017
5018         group = iommu_group_get_for_dev(dev);
5019
5020         if (IS_ERR(group))
5021                 return PTR_ERR(group);
5022
5023         iommu_group_put(group);
5024         return 0;
5025 }
5026
5027 static void intel_iommu_remove_device(struct device *dev)
5028 {
5029         struct intel_iommu *iommu;
5030         u8 bus, devfn;
5031
5032         iommu = device_to_iommu(dev, &bus, &devfn);
5033         if (!iommu)
5034                 return;
5035
5036         iommu_group_remove_device(dev);
5037
5038         iommu_device_unlink(iommu->iommu_dev, dev);
5039 }
5040
5041 #ifdef CONFIG_INTEL_IOMMU_SVM
5042 #define MAX_NR_PASID_BITS (20)
/*
 * Compute the PTS (PASID Table Size) encoding for the extended
 * context entry.
 *
 * Convert ecap_pss to extend context entry pts encoding, also
 * respect the soft pasid_max value set by the iommu.
 * - number of PASID bits = ecap_pss + 1
 * - number of PASID table entries = 2^(pts + 5)
 * Therefore, pts = ecap_pss - 4
 * e.g. KBL ecap_pss = 0x13, PASID has 20 bits, pts = 15
 */
static inline unsigned long intel_iommu_get_pts(struct intel_iommu *iommu)
{
	/* pss < 5 would make the pts computation underflow; use the
	 * smallest encodable table size (pts == 0) instead. */
	if (ecap_pss(iommu->ecap) < 5)
		return 0;

	/* pasid_max is encoded as actual number of entries not the bits;
	 * the lowest set bit gives log2 of the table size.
	 * NOTE(review): assumes pasid_max is a nonzero power of two —
	 * find_first_bit() on zero would return MAX_NR_PASID_BITS. */
	return find_first_bit((unsigned long *)&iommu->pasid_max,
			MAX_NR_PASID_BITS) - 5;
}
5060
/*
 * Enable PASID (requests-with-PASID) support for @sdev's device in its
 * extended context entry on @iommu, filling in sdev->did/sid (and the
 * device-IOTLB fields when ATS is enabled).
 *
 * Takes device_domain_lock and iommu->lock (in that order).  Returns 0
 * on success, or -EINVAL when the device has no valid domain, no PASID
 * support, or no context entry.
 */
int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev)
{
	struct device_domain_info *info;
	struct context_entry *context;
	struct dmar_domain *domain;
	unsigned long flags;
	u64 ctx_lo;
	int ret;

	domain = get_valid_domain_for_dev(sdev->dev);
	if (!domain)
		return -EINVAL;

	spin_lock_irqsave(&device_domain_lock, flags);
	spin_lock(&iommu->lock);

	ret = -EINVAL;
	info = sdev->dev->archdata.iommu;
	if (!info || !info->pasid_supported)
		goto out;

	context = iommu_context_addr(iommu, info->bus, info->devfn, 0);
	if (WARN_ON(!context))
		goto out;

	ctx_lo = context[0].lo;

	sdev->did = domain->iommu_did[iommu->seq_id];
	sdev->sid = PCI_DEVID(info->bus, info->devfn);

	if (!(ctx_lo & CONTEXT_PASIDE)) {
		/* Install the PASID state / PASID table pointers in the
		 * upper half of the extended context entry first... */
		context[1].hi = (u64)virt_to_phys(iommu->pasid_state_table);
		context[1].lo = (u64)virt_to_phys(iommu->pasid_table) |
			intel_iommu_get_pts(iommu);

		/* ...and make them visible before PASIDE is set below. */
		wmb();
		/* CONTEXT_TT_MULTI_LEVEL and CONTEXT_TT_DEV_IOTLB are both
		 * extended to permit requests-with-PASID if the PASIDE bit
		 * is set. which makes sense. For CONTEXT_TT_PASS_THROUGH,
		 * however, the PASIDE bit is ignored and requests-with-PASID
		 * are unconditionally blocked. Which makes less sense.
		 * So convert from CONTEXT_TT_PASS_THROUGH to one of the new
		 * "guest mode" translation types depending on whether ATS
		 * is available or not. Annoyingly, we can't use the new
		 * modes *unless* PASIDE is set. */
		if ((ctx_lo & CONTEXT_TT_MASK) == (CONTEXT_TT_PASS_THROUGH << 2)) {
			ctx_lo &= ~CONTEXT_TT_MASK;
			if (info->ats_supported)
				ctx_lo |= CONTEXT_TT_PT_PASID_DEV_IOTLB << 2;
			else
				ctx_lo |= CONTEXT_TT_PT_PASID << 2;
		}
		ctx_lo |= CONTEXT_PASIDE;
		if (iommu->pasid_state_table)
			ctx_lo |= CONTEXT_DINVE;
		if (info->pri_supported)
			ctx_lo |= CONTEXT_PRS;
		context[0].lo = ctx_lo;
		/* Order the context-entry update before telling hardware
		 * to re-fetch it via the context-cache flush. */
		wmb();
		iommu->flush.flush_context(iommu, sdev->did, sdev->sid,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
	}

	/* Enable PASID support in the device, if it wasn't already */
	if (!info->pasid_enabled)
		iommu_enable_dev_iotlb(info);

	if (info->ats_enabled) {
		sdev->dev_iotlb = 1;
		sdev->qdep = info->ats_qdep;
		/* Treat out-of-range invalidation queue depths as "no limit". */
		if (sdev->qdep >= QI_DEV_EIOTLB_MAX_INVS)
			sdev->qdep = 0;
	}
	ret = 0;

 out:
	spin_unlock(&iommu->lock);
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return ret;
}
5143
5144 struct intel_iommu *intel_svm_device_to_iommu(struct device *dev)
5145 {
5146         struct intel_iommu *iommu;
5147         u8 bus, devfn;
5148
5149         if (iommu_dummy(dev)) {
5150                 dev_warn(dev,
5151                          "No IOMMU translation for device; cannot enable SVM\n");
5152                 return NULL;
5153         }
5154
5155         iommu = device_to_iommu(dev, &bus, &devfn);
5156         if ((!iommu)) {
5157                 dev_err(dev, "No IOMMU for device; cannot enable SVM\n");
5158                 return NULL;
5159         }
5160
5161         if (!iommu->pasid_table) {
5162                 dev_err(dev, "PASID not enabled on IOMMU; cannot enable SVM\n");
5163                 return NULL;
5164         }
5165
5166         return iommu;
5167 }
5168 #endif /* CONFIG_INTEL_IOMMU_SVM */
5169
/* Generic IOMMU-core operations implemented by the Intel VT-d driver. */
static const struct iommu_ops intel_iommu_ops = {
	.capable	= intel_iommu_capable,
	.domain_alloc	= intel_iommu_domain_alloc,
	.domain_free	= intel_iommu_domain_free,
	.attach_dev	= intel_iommu_attach_device,
	.detach_dev	= intel_iommu_detach_device,
	.map		= intel_iommu_map,
	.unmap		= intel_iommu_unmap,
	.map_sg		= default_iommu_map_sg,
	.iova_to_phys	= intel_iommu_iova_to_phys,
	.add_device	= intel_iommu_add_device,
	.remove_device	= intel_iommu_remove_device,
	.device_group   = pci_device_group,
	.pgsize_bitmap	= INTEL_IOMMU_PGSIZES,
};
5185
/* PCI fixup: disable DMA remapping for integrated graphics entirely. */
static void quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	pr_info("Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

/* Known affected G4x/GM45-family device IDs. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);
5200
/* PCI fixup: force write-buffer flushing even though the hardware does
 * not advertise the RWBF capability. */
static void quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	pr_info("Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

/* Same device-ID list as the g4x graphics quirk above. */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
5218
/* Intel graphics GGC config-space register (offset 0x52) and the
 * encodings of its memory-size field in bits 11:8 (mask 0xf << 8). */
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
5228
5229 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
5230 {
5231         unsigned short ggc;
5232
5233         if (pci_read_config_word(dev, GGC, &ggc))
5234                 return;
5235
5236         if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
5237                 pr_info("BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
5238                 dmar_map_gfx = 0;
5239         } else if (dmar_map_gfx) {
5240                 /* we have to ensure the gfx device is idle before we flush */
5241                 pr_info("Disabling batched IOTLB flush on Ironlake\n");
5242                 intel_iommu_strict = 1;
5243        }
5244 }
5245 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
5246 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
5247 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
5248 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
5249
5250 /* On Tylersburg chipsets, some BIOSes have been known to enable the
5251    ISOCH DMAR unit for the Azalia sound device, but not give it any
5252    TLB entries, which causes it to deadlock. Check for that.  We do
5253    this in a function called from init_dmars(), instead of in a PCI
5254    quirk, because we don't want to print the obnoxious "BIOS broken"
5255    message if VT-d is actually disabled.
5256 */
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it.
	 * (0x3a3e is the Tylersburg Azalia HD-audio function.) */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	/* Config offset 0x188 holds the isoch DMAR control word
	 * (vtisochctrl); bail out quietly if it can't be read. */
	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		/* Work around it by identity-mapping Azalia. */
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}