/*
 * Merge branches 'iommu/page-sizes' and 'iommu/group-id' into next
 * [~shefty/rdma-dev.git] / drivers / iommu / intel-iommu.c
 */
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #define ROOT_SIZE               VTD_PAGE_SIZE
48 #define CONTEXT_SIZE            VTD_PAGE_SIZE
49
50 #define IS_BRIDGE_HOST_DEVICE(pdev) \
51                             ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 /*
82  * This bitmap is used to advertise the page sizes our hardware support
83  * to the IOMMU core, which will then use this information to split
84  * physically contiguous memory regions it is mapping into page sizes
85  * that we support.
86  *
87  * Traditionally the IOMMU core just handed us the mappings directly,
88  * after making sure the size is an order of a 4KiB page and that the
89  * mapping has natural alignment.
90  *
91  * To retain this behavior, we currently advertise that we support
92  * all page sizes that are an order of 4KiB.
93  *
94  * If at some point we'd like to utilize the IOMMU core's new behavior,
95  * we could change this to advertise the real page sizes we support.
96  */
97 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
98
/* Convert an adjusted guest address width (AGAW) to the number of
 * page-table levels needed to cover it (AGAW 0 == 2-level, 30-bit). */
static inline int agaw_to_level(int agaw)
{
        int nr_levels = agaw + 2;

        return nr_levels;
}
103
/* AGAW n covers 30 + 9n bits of guest address space (VT-d SAGAW encoding). */
static inline int agaw_to_width(int agaw)
{
        return 30 + agaw * LEVEL_STRIDE;
}

/* Inverse of agaw_to_width(): largest AGAW whose width fits in 'width' bits. */
static inline int width_to_agaw(int width)
{
        return (width - 30) / LEVEL_STRIDE;
}

/* Number of low address bits translated below page-table 'level'
 * (level 1, the leaf, translates bits 0..8 of the pfn, so 0 bits below). */
static inline unsigned int level_to_offset_bits(int level)
{
        return (level - 1) * LEVEL_STRIDE;
}

/* Index of 'pfn' within the page-table page at 'level' (a 9-bit slice). */
static inline int pfn_level_offset(unsigned long pfn, int level)
{
        return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}

/* Mask keeping only the pfn bits at or above 'level' granularity. */
static inline unsigned long level_mask(int level)
{
        return -1UL << level_to_offset_bits(level);
}

/* Number of base pages covered by one entry at 'level'. */
static inline unsigned long level_size(int level)
{
        return 1UL << level_to_offset_bits(level);
}

/* Round 'pfn' up to the next 'level'-sized boundary. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
        return (pfn + level_size(level) - 1) & level_mask(level);
}
138
139 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 {
141         return  1 << ((lvl - 1) * LEVEL_STRIDE);
142 }
143
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
        /* VTD_PAGE_SHIFT <= PAGE_SHIFT, so this is a right shift by >= 0 */
        return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

/* Convert an MM pfn to the (equal or finer-grained) VT-d pfn. */
static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
        return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

/* VT-d pfn of the first VT-d page within 'pg'. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
        return mm_to_dma_pfn(page_to_pfn(pg));
}

/* VT-d pfn backing kernel virtual address 'p'.
 * NOTE(review): virt_to_page() implies 'p' must be a direct-mapped
 * (lowmem) address — callers should not pass vmalloc addresses. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
        return page_to_dma_pfn(virt_to_page(p));
}
163
164 /* global iommu list, set NULL for ignored DMAR units */
165 static struct intel_iommu **g_iommus;
166
167 static void __init check_tylersburg_isoch(void);
168 static int rwbf_quirk;
169
170 /*
171  * set to 1 to panic kernel if can't successfully enable VT-d
172  * (used when kernel is launched w/ TXT)
173  */
174 static int force_on = 0;
175
176 /*
177  * 0: Present
178  * 1-11: Reserved
179  * 12-63: Context Ptr (12 - (haw-1))
180  * 64-127: Reserved
181  */
struct root_entry {
        u64     val;    /* bit 0: present; bits 12-63: context-table pointer */
        u64     rsvd1;  /* reserved */
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/* True if this root entry points to a valid context table. */
static inline bool root_present(struct root_entry *root)
{
        return (root->val & 1);
}
/* Mark the entry present; the context-table pointer must already be set. */
static inline void set_root_present(struct root_entry *root)
{
        root->val |= 1;
}
/* Install the physical address of the context table.
 * NOTE: ORs the address in, so it assumes the address bits of 'val'
 * are currently clear (freshly zeroed entry). */
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
        root->val |= value & VTD_PAGE_MASK;
}

/* Virtual address of the context table referenced by 'root', or NULL
 * if the entry is not present. */
static inline struct context_entry *
get_context_addr_from_root(struct root_entry *root)
{
        return (struct context_entry *)
                (root_present(root)?phys_to_virt(
                root->val & VTD_PAGE_MASK) :
                NULL);
}
208
/*
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
        u64 lo;
        u64 hi;
};

/* True if the entry is marked present (bit 0 of 'lo'). */
static inline bool context_present(struct context_entry *context)
{
        return (context->lo & 1);
}
/* Set the present bit; all other fields should already be programmed. */
static inline void context_set_present(struct context_entry *context)
{
        context->lo |= 1;
}

/* Clear the fault-processing-disable bit (bit 1), i.e. enable fault
 * reporting, preserving the present bit and the upper fields. */
static inline void context_set_fault_enable(struct context_entry *context)
{
        context->lo &= (((u64)-1) << 2) | 1;
}

/* Program the 2-bit translation type (bits 2-3 of 'lo'):
 * first clear bits 2-3, then OR in the new value. */
static inline void context_set_translation_type(struct context_entry *context,
                                                unsigned long value)
{
        context->lo &= (((u64)-1) << 4) | 3;
        context->lo |= (value & 3) << 2;
}

/* Install the page-aligned physical address of the address-space root.
 * Assumes the address bits are currently clear (freshly zeroed entry). */
static inline void context_set_address_root(struct context_entry *context,
                                            unsigned long value)
{
        context->lo |= value & VTD_PAGE_MASK;
}

/* Program the 3-bit address width (AGAW) field, bits 0-2 of 'hi'. */
static inline void context_set_address_width(struct context_entry *context,
                                             unsigned long value)
{
        context->hi |= value & 7;
}

/* Program the 16-bit domain id, bits 8-23 of 'hi'. */
static inline void context_set_domain_id(struct context_entry *context,
                                         unsigned long value)
{
        context->hi |= (value & ((1 << 16) - 1)) << 8;
}

/* Wipe the whole entry; caller must do any required cache/IOTLB flush. */
static inline void context_clear_entry(struct context_entry *context)
{
        context->lo = 0;
        context->hi = 0;
}
269
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
        u64 val;
};

/* Zero the PTE; caller is responsible for any cache/IOTLB flush. */
static inline void dma_clear_pte(struct dma_pte *pte)
{
        pte->val = 0;
}

static inline void dma_set_pte_readable(struct dma_pte *pte)
{
        pte->val |= DMA_PTE_READ;
}

static inline void dma_set_pte_writable(struct dma_pte *pte)
{
        pte->val |= DMA_PTE_WRITE;
}

/* Mark the mapping snooped; only meaningful when the IOMMU reports
 * snoop control (ecap_sc_support). */
static inline void dma_set_pte_snp(struct dma_pte *pte)
{
        pte->val |= DMA_PTE_SNP;
}

/* Replace the R/W permission bits (bits 0-1) with 'prot'. */
static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
{
        pte->val = (pte->val & ~3) | (prot & 3);
}

/* Read the physical address out of a PTE.  On 32-bit a plain u64 load
 * could tear against a concurrent cmpxchg64 writer, so use a no-op
 * cmpxchg64 to get an atomic 64-bit read. */
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
        return pte->val & VTD_PAGE_MASK;
#else
        /* Must have a full atomic 64-bit read */
        return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

/* OR the page frame into the PTE; assumes the address bits were clear. */
static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
{
        pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
}

/* A PTE is present if either the read or the write bit is set. */
static inline bool dma_pte_present(struct dma_pte *pte)
{
        return (pte->val & 3) != 0;
}
327
328 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 {
330         return (pte->val & (1 << 7));
331 }
332
333 static inline int first_pte_in_page(struct dma_pte *pte)
334 {
335         return !((unsigned long)pte & ~VTD_PAGE_MASK);
336 }
337
338 /*
339  * This domain is a statically identity mapping domain.
340  *      1. This domain creats a static 1:1 mapping to all usable memory.
341  *      2. It maps to each iommu if successful.
342  *      3. Each iommu mapps to this domain if successful.
343  */
344 static struct dmar_domain *si_domain;
345 static int hw_pass_through = 1;
346
347 /* devices under the same p2p bridge are owned in one domain */
348 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349
350 /* domain represents a virtual machine, more than one devices
351  * across iommus may be owned in one domain, e.g. kvm guest.
352  */
353 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
354
355 /* si_domain contains mulitple devices */
356 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
357
/* A DMA remapping domain: one page-table hierarchy plus the devices
 * and IOMMUs attached to it.  The iommu_* capability fields are the
 * least common denominator of all attached IOMMUs (see
 * domain_update_iommu_cap()). */
struct dmar_domain {
        int     id;                     /* domain id */
        int     nid;                    /* node id */
        unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/

        struct list_head devices;       /* all devices' list */
        struct iova_domain iovad;       /* iova's that belong to this domain */

        struct dma_pte  *pgd;           /* virtual address */
        int             gaw;            /* max guest address width */

        /* adjusted guest address width, 0 is level 2 30-bit */
        int             agaw;

        int             flags;          /* flags to find out type of domain */

        int             iommu_coherency;/* indicate coherency of iommu access */
        int             iommu_snooping; /* indicate snooping control feature*/
        int             iommu_count;    /* reference count of iommu */
        int             iommu_superpage;/* Level of superpages supported:
                                           0 == 4KiB (no superpages), 1 == 2MiB,
                                           2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
        spinlock_t      iommu_lock;     /* protect iommu set in domain */
        u64             max_addr;       /* maximum mapped address */
};
383
/* PCI domain-device relationship: one entry per (segment, bus, devfn)
 * attached to a dmar_domain. */
struct device_domain_info {
        struct list_head link;  /* link to domain siblings */
        struct list_head global; /* link to global list */
        int segment;            /* PCI domain */
        u8 bus;                 /* PCI bus number */
        u8 devfn;               /* PCI devfn number */
        struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
        struct intel_iommu *iommu; /* IOMMU used by this device */
        struct dmar_domain *domain; /* pointer to domain */
};
395
396 static void flush_unmaps_timeout(unsigned long data);
397
398 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
399
#define HIGH_WATER_MARK 250
/* A batch of IOVAs whose IOTLB flush has been deferred, paired with the
 * domain each belongs to.  Presumably drained by flush_unmaps_timeout()
 * (declared above) — the drain path is outside this chunk. */
struct deferred_flush_tables {
        int next;       /* first free slot in the arrays below */
        struct iova *iova[HIGH_WATER_MARK];
        struct dmar_domain *domain[HIGH_WATER_MARK];
};
406
407 static struct deferred_flush_tables *deferred_flush;
408
409 /* bitmap for indexing intel_iommus */
410 static int g_num_of_iommus;
411
412 static DEFINE_SPINLOCK(async_umap_flush_lock);
413 static LIST_HEAD(unmaps_to_do);
414
415 static int timer_on;
416 static long list_size;
417
418 static void domain_remove_dev_info(struct dmar_domain *domain);
419
420 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
421 int dmar_disabled = 0;
422 #else
423 int dmar_disabled = 1;
424 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
425
426 int intel_iommu_enabled = 0;
427 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
428
429 static int dmar_map_gfx = 1;
430 static int dmar_forcedac;
431 static int intel_iommu_strict;
432 static int intel_iommu_superpage = 1;
433
434 int intel_iommu_gfx_mapped;
435 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
436
437 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
438 static DEFINE_SPINLOCK(device_domain_lock);
439 static LIST_HEAD(device_domain_list);
440
441 static struct iommu_ops intel_iommu_ops;
442
/*
 * Parse the "intel_iommu=" kernel command-line option: a comma-separated
 * list of flags (on, off, igfx_off, forcedac, strict, sp_off).
 * Matching is by prefix (strncmp), and unrecognised tokens are silently
 * skipped at the next comma.
 */
static int __init intel_iommu_setup(char *str)
{
        if (!str)
                return -EINVAL;
        while (*str) {
                if (!strncmp(str, "on", 2)) {
                        dmar_disabled = 0;
                        printk(KERN_INFO "Intel-IOMMU: enabled\n");
                } else if (!strncmp(str, "off", 3)) {
                        dmar_disabled = 1;
                        printk(KERN_INFO "Intel-IOMMU: disabled\n");
                } else if (!strncmp(str, "igfx_off", 8)) {
                        dmar_map_gfx = 0;
                        printk(KERN_INFO
                                "Intel-IOMMU: disable GFX device mapping\n");
                } else if (!strncmp(str, "forcedac", 8)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: Forcing DAC for PCI devices\n");
                        dmar_forcedac = 1;
                } else if (!strncmp(str, "strict", 6)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: disable batched IOTLB flush\n");
                        intel_iommu_strict = 1;
                } else if (!strncmp(str, "sp_off", 6)) {
                        printk(KERN_INFO
                                "Intel-IOMMU: disable supported super page\n");
                        intel_iommu_superpage = 0;
                }

                /* advance past this token and any run of commas */
                str += strcspn(str, ",");
                while (*str == ',')
                        str++;
        }
        return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
479
480 static struct kmem_cache *iommu_domain_cache;
481 static struct kmem_cache *iommu_devinfo_cache;
482 static struct kmem_cache *iommu_iova_cache;
483
484 static inline void *alloc_pgtable_page(int node)
485 {
486         struct page *page;
487         void *vaddr = NULL;
488
489         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
490         if (page)
491                 vaddr = page_address(page);
492         return vaddr;
493 }
494
/* Free a page obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
        free_page((unsigned long)vaddr);
}

/* Slab wrappers; GFP_ATOMIC because these can run under spinlocks. */
static inline void *alloc_domain_mem(void)
{
        return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
        kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void * alloc_devinfo_mem(void)
{
        return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
        kmem_cache_free(iommu_devinfo_cache, vaddr);
}

/* Non-static: presumably hooks consumed by the external iova allocator
 * — no caller is visible in this file. */
struct iova *alloc_iova_mem(void)
{
        return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
}

void free_iova_mem(struct iova *iova)
{
        kmem_cache_free(iommu_iova_cache, iova);
}
529
530
531 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
532 {
533         unsigned long sagaw;
534         int agaw = -1;
535
536         sagaw = cap_sagaw(iommu->cap);
537         for (agaw = width_to_agaw(max_gaw);
538              agaw >= 0; agaw--) {
539                 if (test_bit(agaw, &sagaw))
540                         break;
541         }
542
543         return agaw;
544 }
545
/*
 * Calculate the maximum supported AGAW for this iommu, considering the
 * full 64-bit hardware limit.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * Calculate the AGAW to use for this iommu.
 * "SAGAW" may differ across iommus: start from a default AGAW, and fall
 * back to a smaller supported AGAW on iommus that don't support the
 * default.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
        return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
563
/* This function only returns a single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
        int iommu_id;

        /* si_domain and vm domain should not get here. */
        BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
        BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);

        /* find_first_bit() returns >= g_num_of_iommus when no bit is
         * set, so the upper bound below catches the empty-bitmap case;
         * the < 0 check is belt-and-braces. */
        iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
        if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
                return NULL;

        return g_iommus[iommu_id];
}
579
580 static void domain_update_iommu_coherency(struct dmar_domain *domain)
581 {
582         int i;
583
584         domain->iommu_coherency = 1;
585
586         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
587                 if (!ecap_coherent(g_iommus[i]->ecap)) {
588                         domain->iommu_coherency = 0;
589                         break;
590                 }
591         }
592 }
593
594 static void domain_update_iommu_snooping(struct dmar_domain *domain)
595 {
596         int i;
597
598         domain->iommu_snooping = 1;
599
600         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
601                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
602                         domain->iommu_snooping = 0;
603                         break;
604                 }
605         }
606 }
607
/* Recompute domain->iommu_superpage: the largest superpage level that
 * every active IOMMU supports (0 == none, 1 == 2MiB, ...). */
static void domain_update_iommu_superpage(struct dmar_domain *domain)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu = NULL;
        int mask = 0xf;

        if (!intel_iommu_superpage) {
                /* superpages disabled on the command line (sp_off) */
                domain->iommu_superpage = 0;
                return;
        }

        /* set iommu_superpage to the smallest common denominator */
        for_each_active_iommu(iommu, drhd) {
                mask &= cap_super_page_val(iommu->cap);
                if (!mask) {
                        break;
                }
        }
        /* fls() turns the surviving capability bits into a level count */
        domain->iommu_superpage = fls(mask);
}
628
/* Some capabilities may be different across iommus: recompute the
 * domain's least-common-denominator capability flags. */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
        domain_update_iommu_coherency(domain);
        domain_update_iommu_snooping(domain);
        domain_update_iommu_superpage(domain);
}
636
/* Find the IOMMU whose DRHD scope covers the PCI device at
 * segment/bus/devfn; NULL if no unit claims it. */
static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
{
        struct dmar_drhd_unit *drhd = NULL;
        int i;

        for_each_drhd_unit(drhd) {
                if (drhd->ignored)
                        continue;
                if (segment != drhd->segment)
                        continue;

                for (i = 0; i < drhd->devices_cnt; i++) {
                        /* exact match on a listed device */
                        if (drhd->devices[i] &&
                            drhd->devices[i]->bus->number == bus &&
                            drhd->devices[i]->devfn == devfn)
                                return drhd->iommu;
                        /* or the device sits behind a listed bridge:
                         * bus falls within the bridge's secondary..
                         * subordinate bus-number range */
                        if (drhd->devices[i] &&
                            drhd->devices[i]->subordinate &&
                            drhd->devices[i]->subordinate->number <= bus &&
                            drhd->devices[i]->subordinate->subordinate >= bus)
                                return drhd->iommu;
                }

                /* catch-all unit for this segment */
                if (drhd->include_all)
                        return drhd->iommu;
        }

        return NULL;
}
666
667 static void domain_flush_cache(struct dmar_domain *domain,
668                                void *addr, int size)
669 {
670         if (!domain->iommu_coherency)
671                 clflush_cache_range(addr, size);
672 }
673
/* Gets context entry for a given bus and devfn, allocating and
 * installing the per-bus context table on first use.  Returns NULL only
 * if that allocation fails.  Runs under iommu->lock. */
static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
                u8 bus, u8 devfn)
{
        struct root_entry *root;
        struct context_entry *context;
        unsigned long phy_addr;
        unsigned long flags;

        spin_lock_irqsave(&iommu->lock, flags);
        root = &iommu->root_entry[bus];
        context = get_context_addr_from_root(root);
        if (!context) {
                context = (struct context_entry *)
                                alloc_pgtable_page(iommu->node);
                if (!context) {
                        spin_unlock_irqrestore(&iommu->lock, flags);
                        return NULL;
                }
                /* flush the zeroed table to memory BEFORE making it
                 * reachable via the root entry */
                __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
                phy_addr = virt_to_phys((void *)context);
                set_root_value(root, phy_addr);
                set_root_present(root);
                __iommu_flush_cache(iommu, root, sizeof(*root));
        }
        spin_unlock_irqrestore(&iommu->lock, flags);
        return &context[devfn];
}
702
703 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
704 {
705         struct root_entry *root;
706         struct context_entry *context;
707         int ret;
708         unsigned long flags;
709
710         spin_lock_irqsave(&iommu->lock, flags);
711         root = &iommu->root_entry[bus];
712         context = get_context_addr_from_root(root);
713         if (!context) {
714                 ret = 0;
715                 goto out;
716         }
717         ret = context_present(&context[devfn]);
718 out:
719         spin_unlock_irqrestore(&iommu->lock, flags);
720         return ret;
721 }
722
723 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
724 {
725         struct root_entry *root;
726         struct context_entry *context;
727         unsigned long flags;
728
729         spin_lock_irqsave(&iommu->lock, flags);
730         root = &iommu->root_entry[bus];
731         context = get_context_addr_from_root(root);
732         if (context) {
733                 context_clear_entry(&context[devfn]);
734                 __iommu_flush_cache(iommu, &context[devfn], \
735                         sizeof(*context));
736         }
737         spin_unlock_irqrestore(&iommu->lock, flags);
738 }
739
740 static void free_context_table(struct intel_iommu *iommu)
741 {
742         struct root_entry *root;
743         int i;
744         unsigned long flags;
745         struct context_entry *context;
746
747         spin_lock_irqsave(&iommu->lock, flags);
748         if (!iommu->root_entry) {
749                 goto out;
750         }
751         for (i = 0; i < ROOT_ENTRY_NR; i++) {
752                 root = &iommu->root_entry[i];
753                 context = get_context_addr_from_root(root);
754                 if (context)
755                         free_pgtable_page(context);
756         }
757         free_pgtable_page(iommu->root_entry);
758         iommu->root_entry = NULL;
759 out:
760         spin_unlock_irqrestore(&iommu->lock, flags);
761 }
762
/*
 * Walk — and, where missing, build — the page table down to
 * 'target_level' and return the PTE covering 'pfn'.
 *
 * target_level == 0 means "wherever the mapping ends": stop at the
 * first superpage or non-present entry.  Returns NULL only when a new
 * page-table page cannot be allocated.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
                                      unsigned long pfn, int target_level)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *parent, *pte = NULL;
        int level = agaw_to_level(domain->agaw);
        int offset;

        BUG_ON(!domain->pgd);
        /* pfn must fit within the domain's address width */
        BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
        parent = domain->pgd;

        while (level > 0) {
                void *tmp_page;

                offset = pfn_level_offset(pfn, level);
                pte = &parent[offset];
                if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
                        break;
                if (level == target_level)
                        break;

                if (!dma_pte_present(pte)) {
                        uint64_t pteval;

                        tmp_page = alloc_pgtable_page(domain->nid);

                        if (!tmp_page)
                                return NULL;

                        domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
                        pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
                        /* lock-free install: cmpxchg succeeds only if the
                         * PTE is still empty */
                        if (cmpxchg64(&pte->val, 0ULL, pteval)) {
                                /* Someone else set it while we were thinking; use theirs. */
                                free_pgtable_page(tmp_page);
                        } else {
                                /* NOTE(review): result discarded — presumably
                                 * an atomic read-back for 32-bit before the
                                 * flush; confirm against upstream history. */
                                dma_pte_addr(pte);
                                domain_flush_cache(domain, pte, sizeof(*pte));
                        }
                }
                parent = phys_to_virt(dma_pte_addr(pte));
                level--;
        }

        return pte;
}
809
810
/* return address's pte at specific level, or NULL if the walk hits a
 * non-present entry first.  If a large (superpage) PTE is found above
 * the requested level, return it and report its level in *large_page. */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
                                         unsigned long pfn,
                                         int level, int *large_page)
{
        struct dma_pte *parent, *pte = NULL;
        int total = agaw_to_level(domain->agaw);
        int offset;

        parent = domain->pgd;
        while (level <= total) {
                offset = pfn_level_offset(pfn, total);
                pte = &parent[offset];
                if (level == total)
                        return pte;

                if (!dma_pte_present(pte)) {
                        /* nothing mapped here; tell the caller how big a
                         * hole it may skip */
                        *large_page = total;
                        break;
                }

                if (pte->val & DMA_PTE_LARGE_PAGE) {
                        *large_page = total;
                        return pte;
                }

                parent = phys_to_virt(dma_pte_addr(pte));
                total--;
        }
        return NULL;
}
842
843 /* clear last level pte, a tlb flush should be followed */
844 static int dma_pte_clear_range(struct dmar_domain *domain,
845                                 unsigned long start_pfn,
846                                 unsigned long last_pfn)
847 {
848         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
849         unsigned int large_page = 1;
850         struct dma_pte *first_pte, *pte;
851         int order;
852
853         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
854         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
855         BUG_ON(start_pfn > last_pfn);
856
857         /* we don't need lock here; nobody else touches the iova range */
858         do {
859                 large_page = 1;
860                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
861                 if (!pte) {
862                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
863                         continue;
864                 }
865                 do {
866                         dma_clear_pte(pte);
867                         start_pfn += lvl_to_nr_pages(large_page);
868                         pte++;
869                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
870
871                 domain_flush_cache(domain, first_pte,
872                                    (void *)pte - (void *)first_pte);
873
874         } while (start_pfn && start_pfn <= last_pfn);
875
876         order = (large_page - 1) * 9;
877         return order;
878 }
879
/* free page table pages. last level pte should already be cleared.
 * Walks the levels bottom-up (2..total) and frees every page-table page
 * whose span lies entirely within [start_pfn, last_pfn]; finally frees
 * the pgd itself when the whole domain range is being torn down. */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
                                   unsigned long start_pfn,
                                   unsigned long last_pfn)
{
        int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
        struct dma_pte *first_pte, *pte;
        int total = agaw_to_level(domain->agaw);
        int level;
        unsigned long tmp;
        int large_page = 2;

        BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
        BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
        BUG_ON(start_pfn > last_pfn);

        /* We don't need lock here; nobody else touches the iova range */
        level = 2;
        while (level <= total) {
                tmp = align_to_level(start_pfn, level);

                /* If we can't even clear one PTE at this level, we're done */
                if (tmp + level_size(level) - 1 > last_pfn)
                        return;

                do {
                        large_page = level;
                        first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
                        /* a superpage above us covers this range; resume
                         * the scan above it */
                        if (large_page > level)
                                level = large_page + 1;
                        if (!pte) {
                                tmp = align_to_level(tmp + 1, level + 1);
                                continue;
                        }
                        do {
                                if (dma_pte_present(pte)) {
                                        /* free the lower-level table this
                                         * entry points to */
                                        free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
                                        dma_clear_pte(pte);
                                }
                                pte++;
                                tmp += level_size(level);
                        } while (!first_pte_in_page(pte) &&
                                 tmp + level_size(level) - 1 <= last_pfn);

                        domain_flush_cache(domain, first_pte,
                                           (void *)pte - (void *)first_pte);

                } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
                level++;
        }
        /* free pgd */
        if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
                free_pgtable_page(domain->pgd);
                domain->pgd = NULL;
        }
}
936
937 /* iommu handling */
938 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
939 {
940         struct root_entry *root;
941         unsigned long flags;
942
943         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
944         if (!root)
945                 return -ENOMEM;
946
947         __iommu_flush_cache(iommu, root, ROOT_SIZE);
948
949         spin_lock_irqsave(&iommu->lock, flags);
950         iommu->root_entry = root;
951         spin_unlock_irqrestore(&iommu->lock, flags);
952
953         return 0;
954 }
955
956 static void iommu_set_root_entry(struct intel_iommu *iommu)
957 {
958         void *addr;
959         u32 sts;
960         unsigned long flag;
961
962         addr = iommu->root_entry;
963
964         raw_spin_lock_irqsave(&iommu->register_lock, flag);
965         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
966
967         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
968
969         /* Make sure hardware complete it */
970         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
971                       readl, (sts & DMA_GSTS_RTPS), sts);
972
973         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
974 }
975
976 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
977 {
978         u32 val;
979         unsigned long flag;
980
981         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
982                 return;
983
984         raw_spin_lock_irqsave(&iommu->register_lock, flag);
985         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
986
987         /* Make sure hardware complete it */
988         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
989                       readl, (!(val & DMA_GSTS_WBFS)), val);
990
991         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
992 }
993
/*
 * Invalidate context-cache entries on @iommu.  @type selects global,
 * domain-selective or device-selective invalidation; @did, @source_id
 * and @function_mask qualify the domain/device variants.  Spins until
 * hardware clears ICC, i.e. until the invalidation has completed.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;	/* ICC set = start the invalidation */

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1027
/*
 * Invalidate IOTLB entries on @iommu: global, domain-selective (DSI)
 * or page-selective (PSI, described by @addr and @size_order).  Spins
 * until hardware clears IVT, then reports if the hardware performed a
 * coarser invalidation than the one requested.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	/* The IOTLB registers live at an ecap-described offset */
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1084
1085 static struct device_domain_info *iommu_support_dev_iotlb(
1086         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1087 {
1088         int found = 0;
1089         unsigned long flags;
1090         struct device_domain_info *info;
1091         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1092
1093         if (!ecap_dev_iotlb_support(iommu->ecap))
1094                 return NULL;
1095
1096         if (!iommu->qi)
1097                 return NULL;
1098
1099         spin_lock_irqsave(&device_domain_lock, flags);
1100         list_for_each_entry(info, &domain->devices, link)
1101                 if (info->bus == bus && info->devfn == devfn) {
1102                         found = 1;
1103                         break;
1104                 }
1105         spin_unlock_irqrestore(&device_domain_lock, flags);
1106
1107         if (!found || !info->dev)
1108                 return NULL;
1109
1110         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1111                 return NULL;
1112
1113         if (!dmar_find_matched_atsr_unit(info->dev))
1114                 return NULL;
1115
1116         info->iommu = iommu;
1117
1118         return info;
1119 }
1120
1121 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1122 {
1123         if (!info)
1124                 return;
1125
1126         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1127 }
1128
1129 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1130 {
1131         if (!info->dev || !pci_ats_enabled(info->dev))
1132                 return;
1133
1134         pci_disable_ats(info->dev);
1135 }
1136
1137 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1138                                   u64 addr, unsigned mask)
1139 {
1140         u16 sid, qdep;
1141         unsigned long flags;
1142         struct device_domain_info *info;
1143
1144         spin_lock_irqsave(&device_domain_lock, flags);
1145         list_for_each_entry(info, &domain->devices, link) {
1146                 if (!info->dev || !pci_ats_enabled(info->dev))
1147                         continue;
1148
1149                 sid = info->bus << 8 | info->devfn;
1150                 qdep = pci_ats_queue_depth(info->dev);
1151                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1152         }
1153         spin_unlock_irqrestore(&device_domain_lock, flags);
1154 }
1155
/*
 * Page-selective IOTLB flush of @pages pages starting at @pfn in domain
 * @did, falling back to a domain-selective flush when PSI can't be used.
 * @map is non-zero when the flush follows a non-present -> present
 * mapping change.
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				  unsigned long pfn, unsigned int pages, int map)
{
	/* PSI takes a power-of-two page count, expressed as a mask order */
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;

	BUG_ON(pages == 0);

	/*
	 * Fallback to domain selective flush if no PSI support or the size is
	 * too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr, mask,
						DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
}
1184
1185 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1186 {
1187         u32 pmen;
1188         unsigned long flags;
1189
1190         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1191         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1192         pmen &= ~DMA_PMEN_EPM;
1193         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1194
1195         /* wait for the protected region status bit to clear */
1196         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1197                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1198
1199         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1200 }
1201
/* Set GCMD.TE and wait for hardware to confirm. Always returns 0. */
static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}
1218
/* Clear GCMD.TE and wait for hardware to confirm. Always returns 0. */
static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}
1235
1236
1237 static int iommu_init_domains(struct intel_iommu *iommu)
1238 {
1239         unsigned long ndomains;
1240         unsigned long nlongs;
1241
1242         ndomains = cap_ndoms(iommu->cap);
1243         pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1244                         ndomains);
1245         nlongs = BITS_TO_LONGS(ndomains);
1246
1247         spin_lock_init(&iommu->lock);
1248
1249         /* TBD: there might be 64K domains,
1250          * consider other allocation for future chip
1251          */
1252         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1253         if (!iommu->domain_ids) {
1254                 printk(KERN_ERR "Allocating domain id array failed\n");
1255                 return -ENOMEM;
1256         }
1257         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1258                         GFP_KERNEL);
1259         if (!iommu->domains) {
1260                 printk(KERN_ERR "Allocating domain array failed\n");
1261                 return -ENOMEM;
1262         }
1263
1264         /*
1265          * if Caching mode is set, then invalid translations are tagged
1266          * with domainid 0. Hence we need to pre-allocate it.
1267          */
1268         if (cap_caching_mode(iommu->cap))
1269                 set_bit(0, iommu->domain_ids);
1270         return 0;
1271 }
1272
1273
static void domain_exit(struct dmar_domain *domain);
static void vm_domain_exit(struct dmar_domain *domain);

/*
 * Release everything this iommu contributes to: detach it from every
 * domain it serves (destroying domains it was the last iommu for),
 * disable translation, free its irq, its domain bookkeeping arrays and
 * its context tables.  Frees g_iommus once the last iommu is gone.
 */
void free_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;
	unsigned long flags;

	if ((iommu->domains) && (iommu->domain_ids)) {
		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
			domain = iommu->domains[i];
			clear_bit(i, iommu->domain_ids);

			spin_lock_irqsave(&domain->iommu_lock, flags);
			if (--domain->iommu_count == 0) {
				/* Last iommu reference: tear down via the
				 * path matching the domain type */
				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
					vm_domain_exit(domain);
				else
					domain_exit(domain);
			}
			spin_unlock_irqrestore(&domain->iommu_lock, flags);
		}
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	if (iommu->irq) {
		irq_set_handler_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	}

	kfree(iommu->domains);
	kfree(iommu->domain_ids);

	g_iommus[iommu->seq_id] = NULL;

	/* if all iommus are freed, free g_iommus */
	for (i = 0; i < g_num_of_iommus; i++) {
		if (g_iommus[i])
			break;
	}

	if (i == g_num_of_iommus)
		kfree(g_iommus);

	/* free context mapping */
	free_context_table(iommu);
}
1326
1327 static struct dmar_domain *alloc_domain(void)
1328 {
1329         struct dmar_domain *domain;
1330
1331         domain = alloc_domain_mem();
1332         if (!domain)
1333                 return NULL;
1334
1335         domain->nid = -1;
1336         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1337         domain->flags = 0;
1338
1339         return domain;
1340 }
1341
1342 static int iommu_attach_domain(struct dmar_domain *domain,
1343                                struct intel_iommu *iommu)
1344 {
1345         int num;
1346         unsigned long ndomains;
1347         unsigned long flags;
1348
1349         ndomains = cap_ndoms(iommu->cap);
1350
1351         spin_lock_irqsave(&iommu->lock, flags);
1352
1353         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1354         if (num >= ndomains) {
1355                 spin_unlock_irqrestore(&iommu->lock, flags);
1356                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1357                 return -ENOMEM;
1358         }
1359
1360         domain->id = num;
1361         set_bit(num, iommu->domain_ids);
1362         set_bit(iommu->seq_id, &domain->iommu_bmp);
1363         iommu->domains[num] = domain;
1364         spin_unlock_irqrestore(&iommu->lock, flags);
1365
1366         return 0;
1367 }
1368
1369 static void iommu_detach_domain(struct dmar_domain *domain,
1370                                 struct intel_iommu *iommu)
1371 {
1372         unsigned long flags;
1373         int num, ndomains;
1374         int found = 0;
1375
1376         spin_lock_irqsave(&iommu->lock, flags);
1377         ndomains = cap_ndoms(iommu->cap);
1378         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1379                 if (iommu->domains[num] == domain) {
1380                         found = 1;
1381                         break;
1382                 }
1383         }
1384
1385         if (found) {
1386                 clear_bit(num, iommu->domain_ids);
1387                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1388                 iommu->domains[num] = NULL;
1389         }
1390         spin_unlock_irqrestore(&iommu->lock, flags);
1391 }
1392
1393 static struct iova_domain reserved_iova_list;
1394 static struct lock_class_key reserved_rbtree_key;
1395
/*
 * Build the global list of iova ranges that no domain may ever hand
 * out for DMA: the IOAPIC MMIO window, and every PCI device's MMIO
 * resources (so DMA can't alias peer device BARs).
 */
static int dmar_init_reserved_ranges(void)
{
	struct pci_dev *pdev = NULL;
	struct iova *iova;
	int i;

	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);

	/* Separate lockdep class: this iova domain nests differently */
	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
		&reserved_rbtree_key);

	/* IOAPIC ranges shouldn't be accessed by DMA */
	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
		IOVA_PFN(IOAPIC_RANGE_END));
	if (!iova) {
		printk(KERN_ERR "Reserve IOAPIC range failed\n");
		return -ENODEV;
	}

	/* Reserve all PCI MMIO to avoid peer-to-peer access */
	for_each_pci_dev(pdev) {
		struct resource *r;

		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
			r = &pdev->resource[i];
			if (!r->flags || !(r->flags & IORESOURCE_MEM))
				continue;
			iova = reserve_iova(&reserved_iova_list,
					    IOVA_PFN(r->start),
					    IOVA_PFN(r->end));
			if (!iova) {
				printk(KERN_ERR "Reserve iova failed\n");
				/* NOTE(review): returning from inside
				 * for_each_pci_dev() leaves the reference
				 * held on pdev — confirm whether a
				 * pci_dev_put() is needed here. */
				return -ENODEV;
			}
		}
	}
	return 0;
}
1434
/* Copy the globally reserved iova ranges (IOAPIC window, PCI MMIO)
 * into @domain's allocator so they're never handed out for DMA. */
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
1439
/*
 * Round a guest address width up to the nearest width the page table
 * format can express: 12 offset bits plus a whole number of 9-bit
 * table levels, capped at 64 bits.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = (rem == 0) ? gaw : gaw + 9 - rem;

	return (agaw > 64) ? 64 : agaw;
}
1453
/*
 * Initialise a freshly attached hardware domain: set up its iova
 * allocator, choose an adjusted guest address width the iommu
 * supports, record coherency/snooping/superpage capabilities and
 * allocate the top-level page directory.  Returns 0 or -errno.
 */
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->iommu_lock);

	/* Keep the IOAPIC window and PCI MMIO out of this domain's iovas */
	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain_get_iommu(domain);
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	/* Highest supported superpage level (0 = 4KiB pages only) */
	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	domain->iommu_count = 1;
	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}
1504
/*
 * Fully tear down a hardware domain: remove attached devices, destroy
 * its iova allocator, clear then free its page tables, detach it from
 * every iommu it is bound to and release its memory.
 */
static void domain_exit(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/* Domain 0 is reserved, so dont process it */
	if (!domain)
		return;

	/* Flush any lazy unmaps that may reference this domain */
	if (!intel_iommu_strict)
		flush_unmaps_timeout(0);

	domain_remove_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* clear ptes (must happen before freeing the tables below) */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	for_each_active_iommu(iommu, drhd)
		if (test_bit(iommu->seq_id, &domain->iommu_bmp))
			iommu_detach_domain(domain, iommu);

	free_domain_mem(domain);
}
1534
/*
 * Program the context entry for one device (@segment/@bus/@devfn) to
 * point at @domain's page tables, using @translation (multi-level or
 * pass-through; upgraded to device-IOTLB when ATS is usable).  For
 * VM/identity domains a per-iommu domain id is (re)used.  Flushes the
 * context/IOTLB caches as required by caching mode.  Returns 0 or
 * -errno.
 */
static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
				 u8 bus, u8 devfn, int translation)
{
	struct context_entry *context;
	unsigned long flags;
	struct intel_iommu *iommu;
	struct dma_pte *pgd;
	unsigned long num;
	unsigned long ndomains;
	int id;
	int agaw;
	struct device_domain_info *info = NULL;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);
	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
	       translation != CONTEXT_TT_MULTI_LEVEL);

	iommu = device_to_iommu(segment, bus, devfn);
	if (!iommu)
		return -ENODEV;

	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(context)) {
		/* Already mapped by an earlier call; nothing to do */
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	id = domain->id;
	pgd = domain->pgd;

	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
		int found = 0;

		/* find an available domain id for this device in iommu */
		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(num, iommu->domain_ids, ndomains) {
			if (iommu->domains[num] == domain) {
				id = num;
				found = 1;
				break;
			}
		}

		if (found == 0) {
			num = find_first_zero_bit(iommu->domain_ids, ndomains);
			if (num >= ndomains) {
				spin_unlock_irqrestore(&iommu->lock, flags);
				printk(KERN_ERR "IOMMU: no free domain ids\n");
				return -EFAULT;
			}

			set_bit(num, iommu->domain_ids);
			iommu->domains[num] = domain;
			id = num;
		}

		/* Skip top levels of page tables for
		 * iommu which has less agaw than default.
		 * Unnecessary for PT mode.
		 */
		if (translation != CONTEXT_TT_PASS_THROUGH) {
			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd)) {
					spin_unlock_irqrestore(&iommu->lock, flags);
					return -ENOMEM;
				}
			}
		}
	}

	context_set_domain_id(context, id);

	if (translation != CONTEXT_TT_PASS_THROUGH) {
		/* Upgrade to device-IOTLB translation when ATS is usable */
		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
		translation = info ? CONTEXT_TT_DEV_IOTLB :
				     CONTEXT_TT_MULTI_LEVEL;
	}
	/*
	 * In pass through mode, AW must be programmed to indicate the largest
	 * AGAW value supported by hardware. And ASR is ignored by hardware.
	 */
	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
		context_set_address_width(context, iommu->msagaw);
	else {
		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, iommu->agaw);
	}

	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);
	spin_unlock_irqrestore(&iommu->lock, flags);

	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
		/* First time this domain sees this iommu: account for it
		 * and recompute the cached capability bits */
		domain->iommu_count++;
		if (domain->iommu_count == 1)
			domain->nid = iommu->node;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);
	return 0;
}
1664
1665 static int
1666 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1667                         int translation)
1668 {
1669         int ret;
1670         struct pci_dev *tmp, *parent;
1671
1672         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1673                                          pdev->bus->number, pdev->devfn,
1674                                          translation);
1675         if (ret)
1676                 return ret;
1677
1678         /* dependent device mapping */
1679         tmp = pci_find_upstream_pcie_bridge(pdev);
1680         if (!tmp)
1681                 return 0;
1682         /* Secondary interface's bus number and devfn 0 */
1683         parent = pdev->bus->self;
1684         while (parent != tmp) {
1685                 ret = domain_context_mapping_one(domain,
1686                                                  pci_domain_nr(parent->bus),
1687                                                  parent->bus->number,
1688                                                  parent->devfn, translation);
1689                 if (ret)
1690                         return ret;
1691                 parent = parent->bus->self;
1692         }
1693         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1694                 return domain_context_mapping_one(domain,
1695                                         pci_domain_nr(tmp->subordinate),
1696                                         tmp->subordinate->number, 0,
1697                                         translation);
1698         else /* this is a legacy PCI bridge */
1699                 return domain_context_mapping_one(domain,
1700                                                   pci_domain_nr(tmp->bus),
1701                                                   tmp->bus->number,
1702                                                   tmp->devfn,
1703                                                   translation);
1704 }
1705
/*
 * Check whether context-table entries already exist for @pdev and for
 * every bridge on the path up to its upstream PCIe-to-PCI bridge, if any.
 *
 * Returns non-zero when the whole path is mapped, 0 as soon as one step
 * is found unmapped, or -ENODEV when no IOMMU covers the device.
 */
static int domain_context_mapped(struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
	if (!ret)
		return ret;
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return ret;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	/* walk every intermediate bridge between the device and 'tmp' */
	while (parent != tmp) {
		ret = device_context_mapped(iommu, parent->bus->number,
					    parent->devfn);
		if (!ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp))
		/* PCIe-to-PCI bridge: keyed by its secondary bus, devfn 0 */
		return device_context_mapped(iommu, tmp->subordinate->number,
					     0);
	else
		/* legacy PCI bridge: keyed by the bridge's own bus/devfn */
		return device_context_mapped(iommu, tmp->bus->number,
					     tmp->devfn);
}
1740
1741 /* Returns a number of VTD pages, but aligned to MM page size */
1742 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1743                                             size_t size)
1744 {
1745         host_addr &= ~PAGE_MASK;
1746         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1747 }
1748
1749 /* Return largest possible superpage level for a given mapping */
1750 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1751                                           unsigned long iov_pfn,
1752                                           unsigned long phy_pfn,
1753                                           unsigned long pages)
1754 {
1755         int support, level = 1;
1756         unsigned long pfnmerge;
1757
1758         support = domain->iommu_superpage;
1759
1760         /* To use a large page, the virtual *and* physical addresses
1761            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1762            of them will mean we have to use smaller pages. So just
1763            merge them and check both at once. */
1764         pfnmerge = iov_pfn | phy_pfn;
1765
1766         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1767                 pages >>= VTD_STRIDE_SHIFT;
1768                 if (!pages)
1769                         break;
1770                 pfnmerge >>= VTD_STRIDE_SHIFT;
1771                 level++;
1772                 support--;
1773         }
1774         return level;
1775 }
1776
/*
 * Fill in DMA page-table entries for @nr_pages IOVA pages starting at
 * @iov_pfn, taking the physical pages either from scatterlist @sg (in
 * which case @phys_pfn is ignored) or from the contiguous range starting
 * at @phys_pfn (when @sg is NULL).
 *
 * Superpage PTEs are used whenever alignment and remaining length allow
 * (see hardware_largepage_caps()).  Returns 0 on success, -EINVAL for a
 * protection mask without read or write, -ENOMEM if a page table could
 * not be allocated.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned long sg_res;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	if (sg)
		sg_res = 0;
	else {
		/* "+ 1" keeps sg_res from ever reaching zero, so the
		   scatterlist-advance branch below is never taken. */
		sg_res = nr_pages + 1;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			/* Current sg entry exhausted: move to the next one
			   and record its DMA address/length. */
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page*/
			if (largepage_lvl > 1)
				pteval |= DMA_PTE_LARGE_PAGE;
			else
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was already set: complain, with at most five
			   full mapping dumps over the system's lifetime. */
			static int dumps = 5;
			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
			       iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
1876
/* Map a scatterlist into @nr_pages of IOVA space starting at @iov_pfn. */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	/* phys_pfn (0) is ignored when a scatterlist is supplied */
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}
1883
1884 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1885                                      unsigned long phys_pfn, unsigned long nr_pages,
1886                                      int prot)
1887 {
1888         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1889 }
1890
1891 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1892 {
1893         if (!iommu)
1894                 return;
1895
1896         clear_context_table(iommu, bus, devfn);
1897         iommu->flush.flush_context(iommu, 0, 0, 0,
1898                                            DMA_CCMD_GLOBAL_INVL);
1899         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1900 }
1901
/*
 * Detach every device from @domain: unlink each device_domain_info from
 * both the domain's list and the global list, tear down its dev-IOTLB
 * and context entry, and free the info structure.
 *
 * device_domain_lock is dropped around the per-device hardware teardown
 * and re-acquired before examining the list again.
 */
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);

		iommu_disable_dev_iotlb(info);
		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1927
1928 /*
1929  * find_domain
1930  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1931  */
1932 static struct dmar_domain *
1933 find_domain(struct pci_dev *pdev)
1934 {
1935         struct device_domain_info *info;
1936
1937         /* No lock here, assumes no domain exit in normal case */
1938         info = pdev->dev.archdata.iommu;
1939         if (info)
1940                 return info->domain;
1941         return NULL;
1942 }
1943
1944 /* domain is initialized */
1945 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1946 {
1947         struct dmar_domain *domain, *found = NULL;
1948         struct intel_iommu *iommu;
1949         struct dmar_drhd_unit *drhd;
1950         struct device_domain_info *info, *tmp;
1951         struct pci_dev *dev_tmp;
1952         unsigned long flags;
1953         int bus = 0, devfn = 0;
1954         int segment;
1955         int ret;
1956
1957         domain = find_domain(pdev);
1958         if (domain)
1959                 return domain;
1960
1961         segment = pci_domain_nr(pdev->bus);
1962
1963         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1964         if (dev_tmp) {
1965                 if (pci_is_pcie(dev_tmp)) {
1966                         bus = dev_tmp->subordinate->number;
1967                         devfn = 0;
1968                 } else {
1969                         bus = dev_tmp->bus->number;
1970                         devfn = dev_tmp->devfn;
1971                 }
1972                 spin_lock_irqsave(&device_domain_lock, flags);
1973                 list_for_each_entry(info, &device_domain_list, global) {
1974                         if (info->segment == segment &&
1975                             info->bus == bus && info->devfn == devfn) {
1976                                 found = info->domain;
1977                                 break;
1978                         }
1979                 }
1980                 spin_unlock_irqrestore(&device_domain_lock, flags);
1981                 /* pcie-pci bridge already has a domain, uses it */
1982                 if (found) {
1983                         domain = found;
1984                         goto found_domain;
1985                 }
1986         }
1987
1988         domain = alloc_domain();
1989         if (!domain)
1990                 goto error;
1991
1992         /* Allocate new domain for the device */
1993         drhd = dmar_find_matched_drhd_unit(pdev);
1994         if (!drhd) {
1995                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1996                         pci_name(pdev));
1997                 return NULL;
1998         }
1999         iommu = drhd->iommu;
2000
2001         ret = iommu_attach_domain(domain, iommu);
2002         if (ret) {
2003                 free_domain_mem(domain);
2004                 goto error;
2005         }
2006
2007         if (domain_init(domain, gaw)) {
2008                 domain_exit(domain);
2009                 goto error;
2010         }
2011
2012         /* register pcie-to-pci device */
2013         if (dev_tmp) {
2014                 info = alloc_devinfo_mem();
2015                 if (!info) {
2016                         domain_exit(domain);
2017                         goto error;
2018                 }
2019                 info->segment = segment;
2020                 info->bus = bus;
2021                 info->devfn = devfn;
2022                 info->dev = NULL;
2023                 info->domain = domain;
2024                 /* This domain is shared by devices under p2p bridge */
2025                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2026
2027                 /* pcie-to-pci bridge already has a domain, uses it */
2028                 found = NULL;
2029                 spin_lock_irqsave(&device_domain_lock, flags);
2030                 list_for_each_entry(tmp, &device_domain_list, global) {
2031                         if (tmp->segment == segment &&
2032                             tmp->bus == bus && tmp->devfn == devfn) {
2033                                 found = tmp->domain;
2034                                 break;
2035                         }
2036                 }
2037                 if (found) {
2038                         spin_unlock_irqrestore(&device_domain_lock, flags);
2039                         free_devinfo_mem(info);
2040                         domain_exit(domain);
2041                         domain = found;
2042                 } else {
2043                         list_add(&info->link, &domain->devices);
2044                         list_add(&info->global, &device_domain_list);
2045                         spin_unlock_irqrestore(&device_domain_lock, flags);
2046                 }
2047         }
2048
2049 found_domain:
2050         info = alloc_devinfo_mem();
2051         if (!info)
2052                 goto error;
2053         info->segment = segment;
2054         info->bus = pdev->bus->number;
2055         info->devfn = pdev->devfn;
2056         info->dev = pdev;
2057         info->domain = domain;
2058         spin_lock_irqsave(&device_domain_lock, flags);
2059         /* somebody is fast */
2060         found = find_domain(pdev);
2061         if (found != NULL) {
2062                 spin_unlock_irqrestore(&device_domain_lock, flags);
2063                 if (found != domain) {
2064                         domain_exit(domain);
2065                         domain = found;
2066                 }
2067                 free_devinfo_mem(info);
2068                 return domain;
2069         }
2070         list_add(&info->link, &domain->devices);
2071         list_add(&info->global, &device_domain_list);
2072         pdev->dev.archdata.iommu = info;
2073         spin_unlock_irqrestore(&device_domain_lock, flags);
2074         return domain;
2075 error:
2076         /* recheck it here, maybe others set it */
2077         return find_domain(pdev);
2078 }
2079
/* Identity-mapping policy: which device classes go into the static 1:1
   domain.  Bits are combined in init_dmars(). */
static int iommu_identity_mapping;
#define IDENTMAP_ALL		1	/* all suitable devices (pass-through mode) */
#define IDENTMAP_GFX		2	/* graphics devices (broken-GFX workaround) */
#define IDENTMAP_AZALIA		4	/* Azalia HD-audio quirk devices */
2084
/*
 * Establish a 1:1 (virtual == physical) mapping for [start, end] in
 * @domain: reserve the IOVA range, clear any stale PTEs there, then map
 * the pages read/write.  Returns 0 on success or a negative errno.
 */
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
		 start, end, domain->id);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
				  last_vpfn - first_vpfn + 1,
				  DMA_PTE_READ|DMA_PTE_WRITE);
}
2110
/*
 * Give @pdev an identity mapping for [start, end] (used for RMRR and ISA
 * ranges): get (or create) its domain, sanity-check the BIOS-provided
 * range, install the 1:1 mapping and program the context entry.
 *
 * Returns 0 on success or a negative errno; on failure the domain is
 * torn down via domain_exit().
 */
static int iommu_prepare_identity_map(struct pci_dev *pdev,
				      unsigned long long start,
				      unsigned long long end)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	/* For _hardware_ passthrough, don't bother. But for software
	   passthrough, we do it anyway -- it may indicate a memory
	   range which is reserved in E820, so which didn't get set
	   up to start with in si_domain */
	if (domain == si_domain && hw_pass_through) {
		printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
		       pci_name(pdev), start, end);
		return 0;
	}

	printk(KERN_INFO
	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
	       pci_name(pdev), start, end);

	/* Reject an inverted range: the BIOS RMRR entry is nonsense. */
	if (end < start) {
		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
			dmi_get_system_info(DMI_BIOS_VENDOR),
			dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		ret = -EIO;
		goto error;
	}

	/* Reject a range beyond what the domain's AGAW can address. */
	if (end >> agaw_to_width(domain->agaw)) {
		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     agaw_to_width(domain->agaw),
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		ret = -EIO;
		goto error;
	}

	ret = iommu_domain_identity_map(domain, start, end);
	if (ret)
		goto error;

	/* context entry init */
	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
	if (ret)
		goto error;

	return 0;

 error:
	domain_exit(domain);
	return ret;
}
2172
2173 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2174         struct pci_dev *pdev)
2175 {
2176         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2177                 return 0;
2178         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2179                 rmrr->end_address);
2180 }
2181
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Legacy floppy workaround: identity-map the first 16MiB for the ISA/LPC
 * bridge so legacy DMA below 16MiB keeps working under the IOMMU.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");

}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2206
2207 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2208
2209 static int __init si_domain_work_fn(unsigned long start_pfn,
2210                                     unsigned long end_pfn, void *datax)
2211 {
2212         int *ret = datax;
2213
2214         *ret = iommu_domain_identity_map(si_domain,
2215                                          (uint64_t)start_pfn << PAGE_SHIFT,
2216                                          (uint64_t)end_pfn << PAGE_SHIFT);
2217         return *ret;
2218
2219 }
2220
/*
 * Create and initialise si_domain, the static identity (1:1) domain
 * shared by all pass-through devices, attaching it to every active
 * IOMMU.  When @hw is set the hardware performs pass-through itself, so
 * no page tables are populated; otherwise every online node's memory
 * regions are identity-mapped.
 *
 * Returns 0 on success, -EFAULT (or the mapping error) on failure; the
 * partially built domain is torn down on the early failure paths.
 */
static int __init si_domain_init(int hw)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int nid, ret = 0;

	si_domain = alloc_domain();
	if (!si_domain)
		return -EFAULT;

	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);

	for_each_active_iommu(iommu, drhd) {
		ret = iommu_attach_domain(si_domain, iommu);
		if (ret) {
			domain_exit(si_domain);
			return -EFAULT;
		}
	}

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;

	/* hardware pass-through: no need to populate page tables */
	if (hw)
		return 0;

	for_each_online_node(nid) {
		work_with_active_regions(nid, si_domain_work_fn, &ret);
		if (ret)
			return ret;
	}

	return 0;
}
2259
2260 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2261                                           struct pci_dev *pdev);
2262 static int identity_mapping(struct pci_dev *pdev)
2263 {
2264         struct device_domain_info *info;
2265
2266         if (likely(!iommu_identity_mapping))
2267                 return 0;
2268
2269         info = pdev->dev.archdata.iommu;
2270         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2271                 return (info->domain == si_domain);
2272
2273         return 0;
2274 }
2275
/*
 * Attach @pdev to @domain: program its context entry with @translation
 * (multi-level, pass-through, ...), then link a new device_domain_info
 * into the domain's device list and the global list and stash it in
 * pdev->dev.archdata.iommu.
 *
 * Returns 0 on success, -ENOMEM if the info cannot be allocated, or the
 * context-mapping error.
 */
static int domain_add_dev_info(struct dmar_domain *domain,
			       struct pci_dev *pdev,
			       int translation)
{
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return -ENOMEM;

	ret = domain_context_mapping(domain, pdev, translation);
	if (ret) {
		free_devinfo_mem(info);
		return ret;
	}

	info->segment = pci_domain_nr(pdev->bus);
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
2308
/*
 * Decide whether @pdev should be placed in the static identity domain.
 * @startup is set during boot-time setup, when DMA masks are not yet
 * known and 64-bit DMA capability is assumed.  Returns non-zero to
 * identity-map the device.
 */
static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
{
	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
		return 1;

	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
		return 1;

	if (!(iommu_identity_mapping & IDENTMAP_ALL))
		return 0;

	/*
	 * We want to start off with all devices in the 1:1 domain, and
	 * take them out later if we find they can't access all of memory.
	 *
	 * However, we can't do this for PCI devices behind bridges,
	 * because all PCI devices behind the same bridge will end up
	 * with the same source-id on their transactions.
	 *
	 * Practically speaking, we can't change things around for these
	 * devices at run-time, because we can't be sure there'll be no
	 * DMA transactions in flight for any of their siblings.
	 *
	 * So PCI devices (unless they're on the root bus) as well as
	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
	 * the 1:1 domain, just in _case_ one of their siblings turns out
	 * not to be able to map all of memory.
	 */
	if (!pci_is_pcie(pdev)) {
		if (!pci_is_root_bus(pdev->bus))
			return 0;
		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
			return 0;
	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
		return 0;

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will -- if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = pdev->dma_mask;

		if (pdev->dev.coherent_dma_mask &&
		    pdev->dev.coherent_dma_mask < dma_mask)
			dma_mask = pdev->dev.coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(&pdev->dev);
	}

	return 1;
}
2366
/*
 * Boot-time setup of the static identity domain: create si_domain, then
 * add every eligible PCI device to it (hardware pass-through context
 * entries when @hw is set, software 1:1 page tables otherwise).
 *
 * Returns 0 on success or a negative errno.
 */
static int __init iommu_prepare_static_identity_mapping(int hw)
{
	struct pci_dev *pdev = NULL;
	int ret;

	ret = si_domain_init(hw);
	if (ret)
		return -EFAULT;

	for_each_pci_dev(pdev) {
		/* Skip Host/PCI Bridge devices */
		if (IS_BRIDGE_HOST_DEVICE(pdev))
			continue;
		if (iommu_should_identity_map(pdev, 1)) {
			printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
			       hw ? "hardware" : "software", pci_name(pdev));

			ret = domain_add_dev_info(si_domain, pdev,
						     hw ? CONTEXT_TT_PASS_THROUGH :
						     CONTEXT_TT_MULTI_LEVEL);
			if (ret)
				return ret;
		}
	}

	return 0;
}
2394
2395 static int __init init_dmars(void)
2396 {
2397         struct dmar_drhd_unit *drhd;
2398         struct dmar_rmrr_unit *rmrr;
2399         struct pci_dev *pdev;
2400         struct intel_iommu *iommu;
2401         int i, ret;
2402
2403         /*
2404          * for each drhd
2405          *    allocate root
2406          *    initialize and program root entry to not present
2407          * endfor
2408          */
2409         for_each_drhd_unit(drhd) {
2410                 g_num_of_iommus++;
2411                 /*
2412                  * lock not needed as this is only incremented in the single
2413                  * threaded kernel __init code path all other access are read
2414                  * only
2415                  */
2416         }
2417
2418         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2419                         GFP_KERNEL);
2420         if (!g_iommus) {
2421                 printk(KERN_ERR "Allocating global iommu array failed\n");
2422                 ret = -ENOMEM;
2423                 goto error;
2424         }
2425
2426         deferred_flush = kzalloc(g_num_of_iommus *
2427                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2428         if (!deferred_flush) {
2429                 ret = -ENOMEM;
2430                 goto error;
2431         }
2432
2433         for_each_drhd_unit(drhd) {
2434                 if (drhd->ignored)
2435                         continue;
2436
2437                 iommu = drhd->iommu;
2438                 g_iommus[iommu->seq_id] = iommu;
2439
2440                 ret = iommu_init_domains(iommu);
2441                 if (ret)
2442                         goto error;
2443
2444                 /*
2445                  * TBD:
2446                  * we could share the same root & context tables
2447                  * among all IOMMU's. Need to Split it later.
2448                  */
2449                 ret = iommu_alloc_root_entry(iommu);
2450                 if (ret) {
2451                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2452                         goto error;
2453                 }
2454                 if (!ecap_pass_through(iommu->ecap))
2455                         hw_pass_through = 0;
2456         }
2457
2458         /*
2459          * Start from the sane iommu hardware state.
2460          */
2461         for_each_drhd_unit(drhd) {
2462                 if (drhd->ignored)
2463                         continue;
2464
2465                 iommu = drhd->iommu;
2466
2467                 /*
2468                  * If the queued invalidation is already initialized by us
2469                  * (for example, while enabling interrupt-remapping) then
2470                  * we got the things already rolling from a sane state.
2471                  */
2472                 if (iommu->qi)
2473                         continue;
2474
2475                 /*
2476                  * Clear any previous faults.
2477                  */
2478                 dmar_fault(-1, iommu);
2479                 /*
2480                  * Disable queued invalidation if supported and already enabled
2481                  * before OS handover.
2482                  */
2483                 dmar_disable_qi(iommu);
2484         }
2485
2486         for_each_drhd_unit(drhd) {
2487                 if (drhd->ignored)
2488                         continue;
2489
2490                 iommu = drhd->iommu;
2491
2492                 if (dmar_enable_qi(iommu)) {
2493                         /*
2494                          * Queued Invalidate not enabled, use Register Based
2495                          * Invalidate
2496                          */
2497                         iommu->flush.flush_context = __iommu_flush_context;
2498                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2499                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2500                                "invalidation\n",
2501                                 iommu->seq_id,
2502                                (unsigned long long)drhd->reg_base_addr);
2503                 } else {
2504                         iommu->flush.flush_context = qi_flush_context;
2505                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2506                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2507                                "invalidation\n",
2508                                 iommu->seq_id,
2509                                (unsigned long long)drhd->reg_base_addr);
2510                 }
2511         }
2512
2513         if (iommu_pass_through)
2514                 iommu_identity_mapping |= IDENTMAP_ALL;
2515
2516 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2517         iommu_identity_mapping |= IDENTMAP_GFX;
2518 #endif
2519
2520         check_tylersburg_isoch();
2521
2522         /*
2523          * If pass through is not set or not enabled, setup context entries for
2524          * identity mappings for rmrr, gfx, and isa and may fall back to static
2525          * identity mapping if iommu_identity_mapping is set.
2526          */
2527         if (iommu_identity_mapping) {
2528                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2529                 if (ret) {
2530                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2531                         goto error;
2532                 }
2533         }
2534         /*
2535          * For each rmrr
2536          *   for each dev attached to rmrr
2537          *   do
2538          *     locate drhd for dev, alloc domain for dev
2539          *     allocate free domain
2540          *     allocate page table entries for rmrr
2541          *     if context not allocated for bus
2542          *           allocate and init context
2543          *           set present in root table for this bus
2544          *     init context with domain, translation etc
2545          *    endfor
2546          * endfor
2547          */
2548         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2549         for_each_rmrr_units(rmrr) {
2550                 for (i = 0; i < rmrr->devices_cnt; i++) {
2551                         pdev = rmrr->devices[i];
2552                         /*
2553                          * some BIOS lists non-exist devices in DMAR
2554                          * table.
2555                          */
2556                         if (!pdev)
2557                                 continue;
2558                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2559                         if (ret)
2560                                 printk(KERN_ERR
2561                                        "IOMMU: mapping reserved region failed\n");
2562                 }
2563         }
2564
2565         iommu_prepare_isa();
2566
2567         /*
2568          * for each drhd
2569          *   enable fault log
2570          *   global invalidate context cache
2571          *   global invalidate iotlb
2572          *   enable translation
2573          */
2574         for_each_drhd_unit(drhd) {
2575                 if (drhd->ignored) {
2576                         /*
2577                          * we always have to disable PMRs or DMA may fail on
2578                          * this device
2579                          */
2580                         if (force_on)
2581                                 iommu_disable_protect_mem_regions(drhd->iommu);
2582                         continue;
2583                 }
2584                 iommu = drhd->iommu;
2585
2586                 iommu_flush_write_buffer(iommu);
2587
2588                 ret = dmar_set_interrupt(iommu);
2589                 if (ret)
2590                         goto error;
2591
2592                 iommu_set_root_entry(iommu);
2593
2594                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2595                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2596
2597                 ret = iommu_enable_translation(iommu);
2598                 if (ret)
2599                         goto error;
2600
2601                 iommu_disable_protect_mem_regions(iommu);
2602         }
2603
2604         return 0;
2605 error:
2606         for_each_drhd_unit(drhd) {
2607                 if (drhd->ignored)
2608                         continue;
2609                 iommu = drhd->iommu;
2610                 free_iommu(iommu);
2611         }
2612         kfree(g_iommus);
2613         return ret;
2614 }
2615
2616 /* This takes a number of _MM_ pages, not VTD pages */
2617 static struct iova *intel_alloc_iova(struct device *dev,
2618                                      struct dmar_domain *domain,
2619                                      unsigned long nrpages, uint64_t dma_mask)
2620 {
2621         struct pci_dev *pdev = to_pci_dev(dev);
2622         struct iova *iova = NULL;
2623
2624         /* Restrict dma_mask to the width that the iommu can handle */
2625         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2626
2627         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2628                 /*
2629                  * First try to allocate an io virtual address in
2630                  * DMA_BIT_MASK(32) and if that fails then try allocating
2631                  * from higher range
2632                  */
2633                 iova = alloc_iova(&domain->iovad, nrpages,
2634                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2635                 if (iova)
2636                         return iova;
2637         }
2638         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2639         if (unlikely(!iova)) {
2640                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2641                        nrpages, pci_name(pdev));
2642                 return NULL;
2643         }
2644
2645         return iova;
2646 }
2647
2648 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2649 {
2650         struct dmar_domain *domain;
2651         int ret;
2652
2653         domain = get_domain_for_dev(pdev,
2654                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2655         if (!domain) {
2656                 printk(KERN_ERR
2657                         "Allocating domain for %s failed", pci_name(pdev));
2658                 return NULL;
2659         }
2660
2661         /* make sure context mapping is ok */
2662         if (unlikely(!domain_context_mapped(pdev))) {
2663                 ret = domain_context_mapping(domain, pdev,
2664                                              CONTEXT_TT_MULTI_LEVEL);
2665                 if (ret) {
2666                         printk(KERN_ERR
2667                                 "Domain context map for %s failed",
2668                                 pci_name(pdev));
2669                         return NULL;
2670                 }
2671         }
2672
2673         return domain;
2674 }
2675
2676 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2677 {
2678         struct device_domain_info *info;
2679
2680         /* No lock here, assumes no domain exit in normal case */
2681         info = dev->dev.archdata.iommu;
2682         if (likely(info))
2683                 return info->domain;
2684
2685         return __get_valid_domain_for_dev(dev);
2686 }
2687
2688 static int iommu_dummy(struct pci_dev *pdev)
2689 {
2690         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2691 }
2692
2693 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2694 static int iommu_no_mapping(struct device *dev)
2695 {
2696         struct pci_dev *pdev;
2697         int found;
2698
2699         if (unlikely(dev->bus != &pci_bus_type))
2700                 return 1;
2701
2702         pdev = to_pci_dev(dev);
2703         if (iommu_dummy(pdev))
2704                 return 1;
2705
2706         if (!iommu_identity_mapping)
2707                 return 0;
2708
2709         found = identity_mapping(pdev);
2710         if (found) {
2711                 if (iommu_should_identity_map(pdev, 0))
2712                         return 1;
2713                 else {
2714                         /*
2715                          * 32 bit DMA is removed from si_domain and fall back
2716                          * to non-identity mapping.
2717                          */
2718                         domain_remove_one_dev_info(si_domain, pdev);
2719                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2720                                pci_name(pdev));
2721                         return 0;
2722                 }
2723         } else {
2724                 /*
2725                  * In case of a detached 64 bit DMA device from vm, the device
2726                  * is put into si_domain for identity mapping.
2727                  */
2728                 if (iommu_should_identity_map(pdev, 0)) {
2729                         int ret;
2730                         ret = domain_add_dev_info(si_domain, pdev,
2731                                                   hw_pass_through ?
2732                                                   CONTEXT_TT_PASS_THROUGH :
2733                                                   CONTEXT_TT_MULTI_LEVEL);
2734                         if (!ret) {
2735                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2736                                        pci_name(pdev));
2737                                 return 1;
2738                         }
2739                 }
2740         }
2741
2742         return 0;
2743 }
2744
/*
 * __intel_map_single - map a physically contiguous buffer for DMA
 * @hwdev:    device performing the DMA
 * @paddr:    physical address of the buffer
 * @size:     length in bytes (becomes a VTD-page count partway through)
 * @dir:      DMA direction (DMA_TO_DEVICE etc.)
 * @dma_mask: addressing capability to honour when allocating the IOVA
 *
 * Returns the DMA (bus) address to program into the device, or 0 on
 * failure.  For identity-mapped / dummy devices the physical address
 * is returned unchanged.
 */
static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	/* No translation needed: hand the physical address straight back */
	if (iommu_no_mapping(hwdev))
		return paddr;

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	/* NOTE: from here on, 'size' is a page count, not a byte count */
	size = aligned_nrpages(paddr, size);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be partial page, we should map the whole
	 * page.  Note: if two part of one page are separately mapped, we
	 * might have two guest_addr mapping to the same host paddr, but this
	 * is not a big problem
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
	else
		iommu_flush_write_buffer(iommu);

	/* DMA address = start of allocated IOVA range + in-page offset */
	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (unsigned long long)paddr, dir);
	return 0;
}
2810
2811 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2812                                  unsigned long offset, size_t size,
2813                                  enum dma_data_direction dir,
2814                                  struct dma_attrs *attrs)
2815 {
2816         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2817                                   dir, to_pci_dev(dev)->dma_mask);
2818 }
2819
/*
 * Drain the per-IOMMU deferred-unmap queues: perform the IOTLB
 * invalidations that were postponed by add_unmap() and free the
 * queued IOVAs.  Caller must hold async_umap_flush_lock.
 */
static void flush_unmaps(void)
{
	int i, j;

	/* queue is being emptied, so the flush timer is no longer needed */
	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		if (!iommu)
			continue;

		if (!deferred_flush[i].next)
			continue;

		/* In caching mode, global flushes turn emulation expensive */
		if (!cap_caching_mode(iommu->cap))
			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		for (j = 0; j < deferred_flush[i].next; j++) {
			unsigned long mask;
			struct iova *iova = deferred_flush[i].iova[j];
			struct dmar_domain *domain = deferred_flush[i].domain[j];

			/* On real hardware multiple invalidations are expensive */
			if (cap_caching_mode(iommu->cap))
				iommu_flush_iotlb_psi(iommu, domain->id,
				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
			else {
				/* non-caching mode: only the device IOTLB
				 * still needs a per-range invalidation */
				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
			}
			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
		}
		deferred_flush[i].next = 0;
	}

	list_size = 0;
}
2860
2861 static void flush_unmaps_timeout(unsigned long data)
2862 {
2863         unsigned long flags;
2864
2865         spin_lock_irqsave(&async_umap_flush_lock, flags);
2866         flush_unmaps();
2867         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2868 }
2869
2870 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2871 {
2872         unsigned long flags;
2873         int next, iommu_id;
2874         struct intel_iommu *iommu;
2875
2876         spin_lock_irqsave(&async_umap_flush_lock, flags);
2877         if (list_size == HIGH_WATER_MARK)
2878                 flush_unmaps();
2879
2880         iommu = domain_get_iommu(dom);
2881         iommu_id = iommu->seq_id;
2882
2883         next = deferred_flush[iommu_id].next;
2884         deferred_flush[iommu_id].domain[next] = dom;
2885         deferred_flush[iommu_id].iova[next] = iova;
2886         deferred_flush[iommu_id].next++;
2887
2888         if (!timer_on) {
2889                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2890                 timer_on = 1;
2891         }
2892         list_size++;
2893         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2894 }
2895
/*
 * dma_map_ops .unmap_page hook: tear down the translation installed by
 * intel_map_page()/__intel_map_single() for @dev_addr.  In strict mode
 * the IOTLB is flushed and the IOVA freed immediately; otherwise both
 * are deferred via add_unmap() to amortize flush cost.
 */
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	/* identity-mapped devices have nothing to undo */
	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	/* the IOVA allocated at map time records the true mapped extent */
	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
		      (unsigned long long)dev_addr))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 pci_name(pdev), start_pfn, last_pfn);

	/*  clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
2944
/*
 * dma_map_ops .alloc_coherent hook: allocate zeroed pages and map them
 * bidirectionally for @hwdev.  Returns the kernel virtual address and
 * stores the DMA address in *@dma_handle, or returns NULL on failure.
 */
static void *intel_alloc_coherent(struct device *hwdev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	/*
	 * With IOMMU translation any page is reachable, so GFP zone
	 * restrictions are unnecessary; without it, honour the device's
	 * coherent mask by allocating from a low-enough zone.
	 */
	if (!iommu_no_mapping(hwdev))
		flags &= ~(GFP_DMA | GFP_DMA32);
	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
			flags |= GFP_DMA;
		else
			flags |= GFP_DMA32;
	}

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
					 DMA_BIDIRECTIONAL,
					 hwdev->coherent_dma_mask);
	if (*dma_handle)
		return vaddr;
	/* mapping failed (returned 0): give the pages back */
	free_pages((unsigned long)vaddr, order);
	return NULL;
}
2976
2977 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2978                                 dma_addr_t dma_handle)
2979 {
2980         int order;
2981
2982         size = PAGE_ALIGN(size);
2983         order = get_order(size);
2984
2985         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2986         free_pages((unsigned long)vaddr, order);
2987 }
2988
/*
 * dma_map_ops .unmap_sg hook: tear down the single contiguous IOVA range
 * that intel_map_sg() installed for the whole scatterlist (the range is
 * found from the first segment's dma_address).  Strict mode flushes and
 * frees immediately; otherwise the work is deferred via add_unmap().
 */
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	/* identity-mapped devices have nothing to undo */
	if (iommu_no_mapping(hwdev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
		      (unsigned long long)sglist[0].dma_address))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	/*  clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
3034
3035 static int intel_nontranslate_map_sg(struct device *hddev,
3036         struct scatterlist *sglist, int nelems, int dir)
3037 {
3038         int i;
3039         struct scatterlist *sg;
3040
3041         for_each_sg(sglist, sg, nelems, i) {
3042                 BUG_ON(!sg_page(sg));
3043                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3044                 sg->dma_length = sg->length;
3045         }
3046         return nelems;
3047 }
3048
/*
 * dma_map_ops .map_sg hook: map a scatterlist into one contiguous IOVA
 * range for @hwdev.  Returns the number of segments mapped, or 0 on
 * failure (with sglist->dma_length zeroed when the IOVA allocation
 * fails).  Identity-mapped devices go through the non-translating path.
 */
static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, struct dma_attrs *attrs)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (iommu_no_mapping(hwdev))
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	/* total page count needed to cover every segment */
	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
				pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		/* undo any partial work before reporting failure */
		/*  clear the page */
		dma_pte_clear_range(domain, start_vpfn,
				    start_vpfn + size - 1);
		/* free page tables */
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1);
		/* free iova */
		__free_iova(&domain->iovad, iova);
		return 0;
	}

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
	else
		iommu_flush_write_buffer(iommu);

	return nelems;
}
3116
3117 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3118 {
3119         return !dma_addr;
3120 }
3121
/* DMA operations installed for devices translated by the Intel IOMMU. */
struct dma_map_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.mapping_error = intel_mapping_error,
};
3131
3132 static inline int iommu_domain_cache_init(void)
3133 {
3134         int ret = 0;
3135
3136         iommu_domain_cache = kmem_cache_create("iommu_domain",
3137                                          sizeof(struct dmar_domain),
3138                                          0,
3139                                          SLAB_HWCACHE_ALIGN,
3140
3141                                          NULL);
3142         if (!iommu_domain_cache) {
3143                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3144                 ret = -ENOMEM;
3145         }
3146
3147         return ret;
3148 }
3149
3150 static inline int iommu_devinfo_cache_init(void)
3151 {
3152         int ret = 0;
3153
3154         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3155                                          sizeof(struct device_domain_info),
3156                                          0,
3157                                          SLAB_HWCACHE_ALIGN,
3158                                          NULL);
3159         if (!iommu_devinfo_cache) {
3160                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3161                 ret = -ENOMEM;
3162         }
3163
3164         return ret;
3165 }
3166
3167 static inline int iommu_iova_cache_init(void)
3168 {
3169         int ret = 0;
3170
3171         iommu_iova_cache = kmem_cache_create("iommu_iova",
3172                                          sizeof(struct iova),
3173                                          0,
3174                                          SLAB_HWCACHE_ALIGN,
3175                                          NULL);
3176         if (!iommu_iova_cache) {
3177                 printk(KERN_ERR "Couldn't create iova cache\n");
3178                 ret = -ENOMEM;
3179         }
3180
3181         return ret;
3182 }
3183
3184 static int __init iommu_init_mempool(void)
3185 {
3186         int ret;
3187         ret = iommu_iova_cache_init();
3188         if (ret)
3189                 return ret;
3190
3191         ret = iommu_domain_cache_init();
3192         if (ret)
3193                 goto domain_error;
3194
3195         ret = iommu_devinfo_cache_init();
3196         if (!ret)
3197                 return ret;
3198
3199         kmem_cache_destroy(iommu_domain_cache);
3200 domain_error:
3201         kmem_cache_destroy(iommu_iova_cache);
3202
3203         return -ENOMEM;
3204 }
3205
3206 static void __init iommu_exit_mempool(void)
3207 {
3208         kmem_cache_destroy(iommu_devinfo_cache);
3209         kmem_cache_destroy(iommu_domain_cache);
3210         kmem_cache_destroy(iommu_iova_cache);
3211
3212 }
3213
/*
 * Quirk for Intel IOAT (SNB) QuickData devices: verify that the BIOS
 * assigned the device to its own local IOMMU.  If not, taint and mark
 * the device as needing no translation at all.
 */
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return;
	}
	vtbar &= 0xffff0000;

	/* we know that this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
			    TAINT_FIRMWARE_WORKAROUND,
			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3241
/*
 * Decide which DMAR units can be ignored: units that cover no existing
 * PCI devices, and (when dmar_map_gfx is clear) units that cover only
 * graphics devices.  Devices behind an ignored gfx-only unit are marked
 * with DUMMY_DEVICE_DOMAIN_INFO so the DMA paths bypass translation.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	/* Pass 1: ignore units whose device list is entirely empty */
	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	/* Pass 2: handle units that contain only graphics devices */
	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
			    !IS_GFX_DEVICE(drhd->devices[i]))
				break;

		/* a non-gfx device exists: this unit must stay active */
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		if (dmar_map_gfx) {
			intel_iommu_gfx_mapped = 1;
		} else {
			drhd->ignored = 1;
			for (i = 0; i < drhd->devices_cnt; i++) {
				if (!drhd->devices[i])
					continue;
				drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
			}
		}
	}
}
3285
3286 #ifdef CONFIG_SUSPEND
3287 static int init_iommu_hw(void)
3288 {
3289         struct dmar_drhd_unit *drhd;
3290         struct intel_iommu *iommu = NULL;
3291
3292         for_each_active_iommu(iommu, drhd)
3293                 if (iommu->qi)
3294                         dmar_reenable_qi(iommu);
3295
3296         for_each_iommu(iommu, drhd) {
3297                 if (drhd->ignored) {
3298                         /*
3299                          * we always have to disable PMRs or DMA may fail on
3300                          * this device
3301                          */
3302                         if (force_on)
3303