Merge branch 'iommu/page-sizes' into x86/amd
[~shefty/rdma-dev.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <asm/cacheflush.h>
45 #include <asm/iommu.h>
46
47 #define ROOT_SIZE               VTD_PAGE_SIZE
48 #define CONTEXT_SIZE            VTD_PAGE_SIZE
49
50 #define IS_BRIDGE_HOST_DEVICE(pdev) \
51                             ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
52 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
53 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
54 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
55
56 #define IOAPIC_RANGE_START      (0xfee00000)
57 #define IOAPIC_RANGE_END        (0xfeefffff)
58 #define IOVA_START_ADDR         (0x1000)
59
60 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
61
62 #define MAX_AGAW_WIDTH 64
63
64 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
65 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
66
67 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
68    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
69 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
70                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
71 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
72
73 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
74 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
75 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
76
77 /* page table handling */
78 #define LEVEL_STRIDE            (9)
79 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
80
81 /*
82  * This bitmap is used to advertise the page sizes our hardware support
83  * to the IOMMU core, which will then use this information to split
84  * physically contiguous memory regions it is mapping into page sizes
85  * that we support.
86  *
87  * Traditionally the IOMMU core just handed us the mappings directly,
88  * after making sure the size is an order of a 4KiB page and that the
89  * mapping has natural alignment.
90  *
91  * To retain this behavior, we currently advertise that we support
92  * all page sizes that are an order of 4KiB.
93  *
94  * If at some point we'd like to utilize the IOMMU core's new behavior,
95  * we could change this to advertise the real page sizes we support.
96  */
97 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
98
/* Number of page-table levels for a given AGAW encoding (AGAW 0 == 2-level). */
static inline int agaw_to_level(int agaw)
{
	return 2 + agaw;
}
103
104 static inline int agaw_to_width(int agaw)
105 {
106         return 30 + agaw * LEVEL_STRIDE;
107 }
108
109 static inline int width_to_agaw(int width)
110 {
111         return (width - 30) / LEVEL_STRIDE;
112 }
113
114 static inline unsigned int level_to_offset_bits(int level)
115 {
116         return (level - 1) * LEVEL_STRIDE;
117 }
118
119 static inline int pfn_level_offset(unsigned long pfn, int level)
120 {
121         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
122 }
123
/* Mask that keeps the bits of a pfn at or above @level's index field. */
static inline unsigned long level_mask(int level)
{
	return ~((1UL << level_to_offset_bits(level)) - 1);
}
128
/* Number of 4KiB pfns spanned by one entry at @level. */
static inline unsigned long level_size(int level)
{
	unsigned int bits = level_to_offset_bits(level);

	return 1UL << bits;
}
133
/* Round @pfn up to the next boundary of a @level-sized region. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	unsigned long sz = level_size(level);

	return (pfn + sz - 1) & level_mask(level);
}
138
139 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
140 {
141         return  1 << ((lvl - 1) * LEVEL_STRIDE);
142 }
143
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work, since the shift count below would be negative. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	/* Convert a VTD_PAGE_SHIFT-based pfn to a PAGE_SHIFT-based one. */
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
150
/* Convert a PAGE_SHIFT-based pfn to a VTD_PAGE_SHIFT-based one. */
static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
/* VT-d pfn of the first 4KiB chunk of @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
/* VT-d pfn for a directly-mapped kernel virtual address. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
163
164 /* global iommu list, set NULL for ignored DMAR units */
165 static struct intel_iommu **g_iommus;
166
167 static void __init check_tylersburg_isoch(void);
168 static int rwbf_quirk;
169
170 /*
171  * set to 1 to panic kernel if can't successfully enable VT-d
172  * (used when kernel is launched w/ TXT)
173  */
174 static int force_on = 0;
175
176 /*
177  * 0: Present
178  * 1-11: Reserved
179  * 12-63: Context Ptr (12 - (haw-1))
180  * 64-127: Reserved
181  */
182 struct root_entry {
183         u64     val;
184         u64     rsvd1;
185 };
186 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
187 static inline bool root_present(struct root_entry *root)
188 {
189         return (root->val & 1);
190 }
191 static inline void set_root_present(struct root_entry *root)
192 {
193         root->val |= 1;
194 }
195 static inline void set_root_value(struct root_entry *root, unsigned long value)
196 {
197         root->val |= value & VTD_PAGE_MASK;
198 }
199
200 static inline struct context_entry *
201 get_context_addr_from_root(struct root_entry *root)
202 {
203         return (struct context_entry *)
204                 (root_present(root)?phys_to_virt(
205                 root->val & VTD_PAGE_MASK) :
206                 NULL);
207 }
208
209 /*
210  * low 64 bits:
211  * 0: present
212  * 1: fault processing disable
213  * 2-3: translation type
214  * 12-63: address space root
215  * high 64 bits:
216  * 0-2: address width
217  * 3-6: aval
218  * 8-23: domain id
219  */
220 struct context_entry {
221         u64 lo;
222         u64 hi;
223 };
224
225 static inline bool context_present(struct context_entry *context)
226 {
227         return (context->lo & 1);
228 }
229 static inline void context_set_present(struct context_entry *context)
230 {
231         context->lo |= 1;
232 }
233
234 static inline void context_set_fault_enable(struct context_entry *context)
235 {
236         context->lo &= (((u64)-1) << 2) | 1;
237 }
238
239 static inline void context_set_translation_type(struct context_entry *context,
240                                                 unsigned long value)
241 {
242         context->lo &= (((u64)-1) << 4) | 3;
243         context->lo |= (value & 3) << 2;
244 }
245
246 static inline void context_set_address_root(struct context_entry *context,
247                                             unsigned long value)
248 {
249         context->lo |= value & VTD_PAGE_MASK;
250 }
251
252 static inline void context_set_address_width(struct context_entry *context,
253                                              unsigned long value)
254 {
255         context->hi |= value & 7;
256 }
257
258 static inline void context_set_domain_id(struct context_entry *context,
259                                          unsigned long value)
260 {
261         context->hi |= (value & ((1 << 16) - 1)) << 8;
262 }
263
264 static inline void context_clear_entry(struct context_entry *context)
265 {
266         context->lo = 0;
267         context->hi = 0;
268 }
269
270 /*
271  * 0: readable
272  * 1: writable
273  * 2-6: reserved
274  * 7: super page
275  * 8-10: available
276  * 11: snoop behavior
277  * 12-63: Host physcial address
278  */
279 struct dma_pte {
280         u64 val;
281 };
282
283 static inline void dma_clear_pte(struct dma_pte *pte)
284 {
285         pte->val = 0;
286 }
287
288 static inline void dma_set_pte_readable(struct dma_pte *pte)
289 {
290         pte->val |= DMA_PTE_READ;
291 }
292
293 static inline void dma_set_pte_writable(struct dma_pte *pte)
294 {
295         pte->val |= DMA_PTE_WRITE;
296 }
297
298 static inline void dma_set_pte_snp(struct dma_pte *pte)
299 {
300         pte->val |= DMA_PTE_SNP;
301 }
302
303 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
304 {
305         pte->val = (pte->val & ~3) | (prot & 3);
306 }
307
308 static inline u64 dma_pte_addr(struct dma_pte *pte)
309 {
310 #ifdef CONFIG_64BIT
311         return pte->val & VTD_PAGE_MASK;
312 #else
313         /* Must have a full atomic 64-bit read */
314         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
315 #endif
316 }
317
318 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
319 {
320         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
321 }
322
323 static inline bool dma_pte_present(struct dma_pte *pte)
324 {
325         return (pte->val & 3) != 0;
326 }
327
328 static inline bool dma_pte_superpage(struct dma_pte *pte)
329 {
330         return (pte->val & (1 << 7));
331 }
332
333 static inline int first_pte_in_page(struct dma_pte *pte)
334 {
335         return !((unsigned long)pte & ~VTD_PAGE_MASK);
336 }
337
338 /*
339  * This domain is a statically identity mapping domain.
340  *      1. This domain creats a static 1:1 mapping to all usable memory.
341  *      2. It maps to each iommu if successful.
342  *      3. Each iommu mapps to this domain if successful.
343  */
344 static struct dmar_domain *si_domain;
345 static int hw_pass_through = 1;
346
347 /* devices under the same p2p bridge are owned in one domain */
348 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
349
350 /* domain represents a virtual machine, more than one devices
351  * across iommus may be owned in one domain, e.g. kvm guest.
352  */
353 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
354
355 /* si_domain contains mulitple devices */
356 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
357
/*
 * One DMA-remapping domain: an IOVA address space backed by a
 * multi-level page table, shared by all devices on its list.
 */
struct dmar_domain {
	int	id;			/* domain id */
	int	nid;			/* node id */
	unsigned long iommu_bmp;	/* bitmap of iommus this domain uses*/

	struct list_head devices; 	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature*/
	int		iommu_count;	/* reference count of iommu */
	int		iommu_superpage;/* Level of superpages supported:
					   0 == 4KiB (no superpages), 1 == 2MiB,
					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
	spinlock_t	iommu_lock;	/* protect iommu set in domain */
	u64		max_addr;	/* maximum mapped address */
};
383
/* PCI domain-device relationship: one entry per device attached to a
 * dmar_domain, linked both into the domain and into the global list. */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	int segment;		/* PCI domain */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};
395
396 static void flush_unmaps_timeout(unsigned long data);
397
398 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
399
400 #define HIGH_WATER_MARK 250
401 struct deferred_flush_tables {
402         int next;
403         struct iova *iova[HIGH_WATER_MARK];
404         struct dmar_domain *domain[HIGH_WATER_MARK];
405 };
406
407 static struct deferred_flush_tables *deferred_flush;
408
409 /* bitmap for indexing intel_iommus */
410 static int g_num_of_iommus;
411
412 static DEFINE_SPINLOCK(async_umap_flush_lock);
413 static LIST_HEAD(unmaps_to_do);
414
415 static int timer_on;
416 static long list_size;
417
418 static void domain_remove_dev_info(struct dmar_domain *domain);
419
420 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
421 int dmar_disabled = 0;
422 #else
423 int dmar_disabled = 1;
424 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
425
426 static int dmar_map_gfx = 1;
427 static int dmar_forcedac;
428 static int intel_iommu_strict;
429 static int intel_iommu_superpage = 1;
430
431 int intel_iommu_gfx_mapped;
432 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
433
434 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
435 static DEFINE_SPINLOCK(device_domain_lock);
436 static LIST_HEAD(device_domain_list);
437
438 static struct iommu_ops intel_iommu_ops;
439
/*
 * Parse the "intel_iommu=" kernel command-line option.  Tokens are
 * comma-separated and may be combined (e.g. "intel_iommu=on,strict");
 * unrecognised tokens are silently skipped.  Returns -EINVAL only for
 * a NULL argument, otherwise 0.
 */
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			printk(KERN_INFO "Intel-IOMMU: enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable supported super page\n");
			intel_iommu_superpage = 0;
		}

		/* advance to the character after the next comma (if any) */
		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);
476
477 static struct kmem_cache *iommu_domain_cache;
478 static struct kmem_cache *iommu_devinfo_cache;
479 static struct kmem_cache *iommu_iova_cache;
480
481 static inline void *alloc_pgtable_page(int node)
482 {
483         struct page *page;
484         void *vaddr = NULL;
485
486         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
487         if (page)
488                 vaddr = page_address(page);
489         return vaddr;
490 }
491
/* Free a page obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
496
/* Allocate a struct dmar_domain from its slab cache (atomic context OK). */
static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}
501
/* Return a struct dmar_domain to its slab cache. */
static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}
506
507 static inline void * alloc_devinfo_mem(void)
508 {
509         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
510 }
511
/* Return a struct device_domain_info to its slab cache. */
static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}
516
/* Allocator hook used by the iova library (non-static on purpose). */
struct iova *alloc_iova_mem(void)
{
	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
}
521
/* Free hook used by the iova library. */
void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}
526
527
528 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
529 {
530         unsigned long sagaw;
531         int agaw = -1;
532
533         sagaw = cap_sagaw(iommu->cap);
534         for (agaw = width_to_agaw(max_gaw);
535              agaw >= 0; agaw--) {
536                 if (test_bit(agaw, &sagaw))
537                         break;
538         }
539
540         return agaw;
541 }
542
/*
 * Calculate the maximum SAGAW-supported AGAW for @iommu, unconstrained
 * by the default domain address width.
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}
550
/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, use a default agaw, and
 * get a supported less agaw for iommus that don't support the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
560
/* This function only returns the single iommu of a native domain; such
 * domains span exactly one iommu by construction. */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);

	/* find_first_bit returns >= g_num_of_iommus when no bit is set */
	iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
576
577 static void domain_update_iommu_coherency(struct dmar_domain *domain)
578 {
579         int i;
580
581         domain->iommu_coherency = 1;
582
583         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
584                 if (!ecap_coherent(g_iommus[i]->ecap)) {
585                         domain->iommu_coherency = 0;
586                         break;
587                 }
588         }
589 }
590
591 static void domain_update_iommu_snooping(struct dmar_domain *domain)
592 {
593         int i;
594
595         domain->iommu_snooping = 1;
596
597         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
598                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
599                         domain->iommu_snooping = 0;
600                         break;
601                 }
602         }
603 }
604
605 static void domain_update_iommu_superpage(struct dmar_domain *domain)
606 {
607         struct dmar_drhd_unit *drhd;
608         struct intel_iommu *iommu = NULL;
609         int mask = 0xf;
610
611         if (!intel_iommu_superpage) {
612                 domain->iommu_superpage = 0;
613                 return;
614         }
615
616         /* set iommu_superpage to the smallest common denominator */
617         for_each_active_iommu(iommu, drhd) {
618                 mask &= cap_super_page_val(iommu->cap);
619                 if (!mask) {
620                         break;
621                 }
622         }
623         domain->iommu_superpage = fls(mask);
624 }
625
/* Some capabilities may be different across iommus; recompute all the
 * cached lowest-common-denominator flags for this domain. */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
	domain_update_iommu_superpage(domain);
}
633
/*
 * Find the DRHD unit (iommu) responsible for the device at
 * @segment:@bus:@devfn: either the device is listed in the unit's
 * device scope, @bus falls within a listed bridge's bus range, or the
 * unit is the catch-all (include_all) one.  Returns NULL if none.
 */
static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	int i;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		if (segment != drhd->segment)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++) {
			/* exact match on a scoped device */
			if (drhd->devices[i] &&
			    drhd->devices[i]->bus->number == bus &&
			    drhd->devices[i]->devfn == devfn)
				return drhd->iommu;
			/* @bus lies within a scoped bridge's bus range */
			if (drhd->devices[i] &&
			    drhd->devices[i]->subordinate &&
			    drhd->devices[i]->subordinate->number <= bus &&
			    drhd->devices[i]->subordinate->subordinate >= bus)
				return drhd->iommu;
		}

		if (drhd->include_all)
			return drhd->iommu;
	}

	return NULL;
}
663
664 static void domain_flush_cache(struct dmar_domain *domain,
665                                void *addr, int size)
666 {
667         if (!domain->iommu_coherency)
668                 clflush_cache_range(addr, size);
669 }
670
/* Gets the context entry for a given bus and devfn, allocating the
 * bus's context table on demand.  Returns NULL only if that allocation
 * fails.  Takes iommu->lock. */
static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		/* first use of this bus: allocate its context table */
		context = (struct context_entry *)
				alloc_pgtable_page(iommu->node);
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		/* flush the table before publishing it in the root entry */
		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}
699
700 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
701 {
702         struct root_entry *root;
703         struct context_entry *context;
704         int ret;
705         unsigned long flags;
706
707         spin_lock_irqsave(&iommu->lock, flags);
708         root = &iommu->root_entry[bus];
709         context = get_context_addr_from_root(root);
710         if (!context) {
711                 ret = 0;
712                 goto out;
713         }
714         ret = context_present(&context[devfn]);
715 out:
716         spin_unlock_irqrestore(&iommu->lock, flags);
717         return ret;
718 }
719
720 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
721 {
722         struct root_entry *root;
723         struct context_entry *context;
724         unsigned long flags;
725
726         spin_lock_irqsave(&iommu->lock, flags);
727         root = &iommu->root_entry[bus];
728         context = get_context_addr_from_root(root);
729         if (context) {
730                 context_clear_entry(&context[devfn]);
731                 __iommu_flush_cache(iommu, &context[devfn], \
732                         sizeof(*context));
733         }
734         spin_unlock_irqrestore(&iommu->lock, flags);
735 }
736
737 static void free_context_table(struct intel_iommu *iommu)
738 {
739         struct root_entry *root;
740         int i;
741         unsigned long flags;
742         struct context_entry *context;
743
744         spin_lock_irqsave(&iommu->lock, flags);
745         if (!iommu->root_entry) {
746                 goto out;
747         }
748         for (i = 0; i < ROOT_ENTRY_NR; i++) {
749                 root = &iommu->root_entry[i];
750                 context = get_context_addr_from_root(root);
751                 if (context)
752                         free_pgtable_page(context);
753         }
754         free_pgtable_page(iommu->root_entry);
755         iommu->root_entry = NULL;
756 out:
757         spin_unlock_irqrestore(&iommu->lock, flags);
758 }
759
/*
 * Walk (building missing levels as needed) the domain's page table and
 * return the PTE for @pfn at @target_level (1 == 4KiB leaf).  A
 * target_level of 0 means "wherever the walk bottoms out": stop at a
 * superpage PTE or at the first non-present entry.  Returns NULL only
 * when an intermediate table cannot be allocated.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int target_level)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);
	/* @pfn must fit within the domain's address width */
	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
	parent = domain->pgd;

	while (level > 0) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			/* publish the new table atomically; a non-zero old
			 * value means we raced with another CPU */
			if (cmpxchg64(&pte->val, 0ULL, pteval)) {
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			} else {
				/* atomic 64-bit re-read of the PTE (see the
				 * 32-bit branch of dma_pte_addr()) */
				dma_pte_addr(pte);
				domain_flush_cache(domain, pte, sizeof(*pte));
			}
		}
		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	return pte;
}
806
807
/*
 * Return the PTE for @pfn at @level, or a covering superpage PTE found
 * higher up (its level is reported via *@large_page).  Returns NULL if
 * no mapping exists; in that case *@large_page is the level at which
 * the walk stopped.
 */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	/* walk down from the top level (total) towards @level */
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		/* hole in the page table: report where we stopped */
		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		/* a superpage above @level already maps this pfn */
		if (pte->val & DMA_PTE_LARGE_PAGE) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
839
/* clear last level pte, a tlb flush should be followed.  Returns the
 * page order of the last (super)page size encountered, for use in the
 * subsequent IOTLB flush. */
static int dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned int large_page = 1;
	struct dma_pte *first_pte, *pte;
	int order;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/* nothing mapped here: skip past this hole */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		/* clear contiguous PTEs up to the end of this table page */
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);

	order = (large_page - 1) * 9;
	return order;
}
876
/* free page table pages. last level pte should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *first_pte, *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
	unsigned long tmp;
	int large_page = 2;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	level = 2;
	/* free bottom-up so child tables are gone before their parents */
	while (level <= total) {
		tmp = align_to_level(start_pfn, level);

		/* If we can't even clear one PTE at this level, we're done */
		if (tmp + level_size(level) - 1 > last_pfn)
			return;

		do {
			large_page = level;
			first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
			/* a superpage above this level covers tmp */
			if (large_page > level)
				level = large_page + 1;
			if (!pte) {
				tmp = align_to_level(tmp + 1, level + 1);
				continue;
			}
			/* free each child table fully inside the range */
			do {
				if (dma_pte_present(pte)) {
					free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
					dma_clear_pte(pte);
				}
				pte++;
				tmp += level_size(level);
			} while (!first_pte_in_page(pte) &&
				 tmp + level_size(level) - 1 <= last_pfn);

			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);

		} while (tmp && tmp + level_size(level) - 1 <= last_pfn);
		level++;
	}
	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
933
934 /* iommu handling */
935 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
936 {
937         struct root_entry *root;
938         unsigned long flags;
939
940         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
941         if (!root)
942                 return -ENOMEM;
943
944         __iommu_flush_cache(iommu, root, ROOT_SIZE);
945
946         spin_lock_irqsave(&iommu->lock, flags);
947         iommu->root_entry = root;
948         spin_unlock_irqrestore(&iommu->lock, flags);
949
950         return 0;
951 }
952
953 static void iommu_set_root_entry(struct intel_iommu *iommu)
954 {
955         void *addr;
956         u32 sts;
957         unsigned long flag;
958
959         addr = iommu->root_entry;
960
961         raw_spin_lock_irqsave(&iommu->register_lock, flag);
962         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
963
964         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
965
966         /* Make sure hardware complete it */
967         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
968                       readl, (sts & DMA_GSTS_RTPS), sts);
969
970         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
971 }
972
973 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
974 {
975         u32 val;
976         unsigned long flag;
977
978         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
979                 return;
980
981         raw_spin_lock_irqsave(&iommu->register_lock, flag);
982         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
983
984         /* Make sure hardware complete it */
985         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
986                       readl, (!(val & DMA_GSTS_WBFS)), val);
987
988         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
989 }
990
/*
 * Invalidate the context cache at global, domain, or device granularity.
 * NOTE(review): the historical comment claimed a return value deciding a
 * write-buffer flush; the function is void and reports nothing.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
                                  u16 did, u16 source_id, u8 function_mask,
                                  u64 type)
{
        u64 val = 0;
        unsigned long flag;

        /* Build the invalidation descriptor for the requested granularity */
        switch (type) {
        case DMA_CCMD_GLOBAL_INVL:
                val = DMA_CCMD_GLOBAL_INVL;
                break;
        case DMA_CCMD_DOMAIN_INVL:
                val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
                break;
        case DMA_CCMD_DEVICE_INVL:
                val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
                        | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
                break;
        default:
                BUG();
        }
        /* ICC starts the invalidation; hardware clears it on completion */
        val |= DMA_CCMD_ICC;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

        /* Make sure hardware complete it */
        IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
                dmar_readq, (!(val & DMA_CCMD_ICC)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1024
/*
 * Invalidate the IOTLB at global, domain-selective, or page-selective
 * granularity. NOTE(review): despite the historical comment there is no
 * return value; failure is only reported via printk below.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
                                u64 addr, unsigned int size_order, u64 type)
{
        /* IOTLB registers live at an ecap-specified offset, not a fixed one */
        int tlb_offset = ecap_iotlb_offset(iommu->ecap);
        u64 val = 0, val_iva = 0;
        unsigned long flag;

        switch (type) {
        case DMA_TLB_GLOBAL_FLUSH:
                /* global flush doesn't need set IVA_REG */
                val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
                break;
        case DMA_TLB_DSI_FLUSH:
                val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                break;
        case DMA_TLB_PSI_FLUSH:
                val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
                /* Note: always flush non-leaf currently */
                val_iva = size_order | addr;
                break;
        default:
                BUG();
        }
        /* Note: set drain read/write */
#if 0
        /*
         * This is probably to be super secure.. Looks like we can
         * ignore it without any impact.
         */
        if (cap_read_drain(iommu->cap))
                val |= DMA_TLB_READ_DRAIN;
#endif
        if (cap_write_drain(iommu->cap))
                val |= DMA_TLB_WRITE_DRAIN;

        raw_spin_lock_irqsave(&iommu->register_lock, flag);
        /* Note: Only uses first TLB reg currently */
        if (val_iva)
                dmar_writeq(iommu->reg + tlb_offset, val_iva);
        dmar_writeq(iommu->reg + tlb_offset + 8, val);

        /* Make sure hardware complete it */
        IOMMU_WAIT_OP(iommu, tlb_offset + 8,
                dmar_readq, (!(val & DMA_TLB_IVT)), val);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

        /* check IOTLB invalidation granularity */
        if (DMA_TLB_IAIG(val) == 0)
                printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
        if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
                pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
                        (unsigned long long)DMA_TLB_IIRG(type),
                        (unsigned long long)DMA_TLB_IAIG(val));
}
1081
1082 static struct device_domain_info *iommu_support_dev_iotlb(
1083         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1084 {
1085         int found = 0;
1086         unsigned long flags;
1087         struct device_domain_info *info;
1088         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1089
1090         if (!ecap_dev_iotlb_support(iommu->ecap))
1091                 return NULL;
1092
1093         if (!iommu->qi)
1094                 return NULL;
1095
1096         spin_lock_irqsave(&device_domain_lock, flags);
1097         list_for_each_entry(info, &domain->devices, link)
1098                 if (info->bus == bus && info->devfn == devfn) {
1099                         found = 1;
1100                         break;
1101                 }
1102         spin_unlock_irqrestore(&device_domain_lock, flags);
1103
1104         if (!found || !info->dev)
1105                 return NULL;
1106
1107         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1108                 return NULL;
1109
1110         if (!dmar_find_matched_atsr_unit(info->dev))
1111                 return NULL;
1112
1113         info->iommu = iommu;
1114
1115         return info;
1116 }
1117
1118 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1119 {
1120         if (!info)
1121                 return;
1122
1123         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1124 }
1125
1126 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1127 {
1128         if (!info->dev || !pci_ats_enabled(info->dev))
1129                 return;
1130
1131         pci_disable_ats(info->dev);
1132 }
1133
/*
 * Send an ATS invalidation to every ATS-enabled device in @domain so
 * translations cached inside the devices are dropped. @addr/@mask
 * describe the region that was just invalidated in the IOTLB.
 */
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
                                  u64 addr, unsigned mask)
{
        u16 sid, qdep;
        unsigned long flags;
        struct device_domain_info *info;

        spin_lock_irqsave(&device_domain_lock, flags);
        list_for_each_entry(info, &domain->devices, link) {
                if (!info->dev || !pci_ats_enabled(info->dev))
                        continue;

                /* source-id is bus:devfn packed into 16 bits */
                sid = info->bus << 8 | info->devfn;
                qdep = pci_ats_queue_depth(info->dev);
                qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
        }
        spin_unlock_irqrestore(&device_domain_lock, flags);
}
1152
/*
 * Flush the IOTLB for @pages pages starting at @pfn in domain @did,
 * preferring a page-selective invalidation (PSI) and falling back to a
 * domain-selective flush when PSI is unsupported or the range is too
 * large. @map is nonzero when this follows a not-present -> present
 * mapping change.
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
                                  unsigned long pfn, unsigned int pages, int map)
{
        /* PSI takes a power-of-two page count expressed as an order */
        unsigned int mask = ilog2(__roundup_pow_of_two(pages));
        uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;

        BUG_ON(pages == 0);

        /*
         * Fallback to domain selective flush if no PSI support or the size is
         * too big.
         * PSI requires page size to be 2 ^ x, and the base address is naturally
         * aligned to the size
         */
        if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
                iommu->flush.flush_iotlb(iommu, did, 0, 0,
                                                DMA_TLB_DSI_FLUSH);
        else
                iommu->flush.flush_iotlb(iommu, did, addr, mask,
                                                DMA_TLB_PSI_FLUSH);

        /*
         * In caching mode, changes of pages from non-present to present require
         * flush. However, device IOTLB doesn't need to be flushed in this case.
         */
        if (!cap_caching_mode(iommu->cap) || !map)
                iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
}
1181
/*
 * Disable the firmware-established protected memory regions (PMRs) so
 * they no longer restrict DMA once the OS takes over translation.
 */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
        u32 pmen;
        unsigned long flags;

        raw_spin_lock_irqsave(&iommu->register_lock, flags);
        pmen = readl(iommu->reg + DMAR_PMEN_REG);
        pmen &= ~DMA_PMEN_EPM;  /* clear Enable Protected Memory */
        writel(pmen, iommu->reg + DMAR_PMEN_REG);

        /* wait for the protected region status bit to clear */
        IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
                readl, !(pmen & DMA_PMEN_PRS), pmen);

        raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1198
1199 static int iommu_enable_translation(struct intel_iommu *iommu)
1200 {
1201         u32 sts;
1202         unsigned long flags;
1203
1204         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1205         iommu->gcmd |= DMA_GCMD_TE;
1206         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1207
1208         /* Make sure hardware complete it */
1209         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1210                       readl, (sts & DMA_GSTS_TES), sts);
1211
1212         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1213         return 0;
1214 }
1215
1216 static int iommu_disable_translation(struct intel_iommu *iommu)
1217 {
1218         u32 sts;
1219         unsigned long flag;
1220
1221         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1222         iommu->gcmd &= ~DMA_GCMD_TE;
1223         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1224
1225         /* Make sure hardware complete it */
1226         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1227                       readl, (!(sts & DMA_GSTS_TES)), sts);
1228
1229         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1230         return 0;
1231 }
1232
1233
1234 static int iommu_init_domains(struct intel_iommu *iommu)
1235 {
1236         unsigned long ndomains;
1237         unsigned long nlongs;
1238
1239         ndomains = cap_ndoms(iommu->cap);
1240         pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1241                         ndomains);
1242         nlongs = BITS_TO_LONGS(ndomains);
1243
1244         spin_lock_init(&iommu->lock);
1245
1246         /* TBD: there might be 64K domains,
1247          * consider other allocation for future chip
1248          */
1249         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1250         if (!iommu->domain_ids) {
1251                 printk(KERN_ERR "Allocating domain id array failed\n");
1252                 return -ENOMEM;
1253         }
1254         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1255                         GFP_KERNEL);
1256         if (!iommu->domains) {
1257                 printk(KERN_ERR "Allocating domain array failed\n");
1258                 return -ENOMEM;
1259         }
1260
1261         /*
1262          * if Caching mode is set, then invalid translations are tagged
1263          * with domainid 0. Hence we need to pre-allocate it.
1264          */
1265         if (cap_caching_mode(iommu->cap))
1266                 set_bit(0, iommu->domain_ids);
1267         return 0;
1268 }
1269
1270
1271 static void domain_exit(struct dmar_domain *domain);
1272 static void vm_domain_exit(struct dmar_domain *domain);
1273
/*
 * Tear down all state associated with @iommu: release every domain id
 * it holds (destroying domains whose last iommu this was), disable
 * translation, free its irq and domain bookkeeping, and free the global
 * g_iommus array once the last iommu is gone.
 */
void free_dmar_iommu(struct intel_iommu *iommu)
{
        struct dmar_domain *domain;
        int i;
        unsigned long flags;

        if ((iommu->domains) && (iommu->domain_ids)) {
                for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
                        domain = iommu->domains[i];
                        clear_bit(i, iommu->domain_ids);

                        spin_lock_irqsave(&domain->iommu_lock, flags);
                        /* last iommu reference: destroy the domain itself */
                        if (--domain->iommu_count == 0) {
                                if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
                                        vm_domain_exit(domain);
                                else
                                        domain_exit(domain);
                        }
                        spin_unlock_irqrestore(&domain->iommu_lock, flags);
                }
        }

        if (iommu->gcmd & DMA_GCMD_TE)
                iommu_disable_translation(iommu);

        if (iommu->irq) {
                irq_set_handler_data(iommu->irq, NULL);
                /* This will mask the irq */
                free_irq(iommu->irq, iommu);
                destroy_irq(iommu->irq);
        }

        kfree(iommu->domains);
        kfree(iommu->domain_ids);

        g_iommus[iommu->seq_id] = NULL;

        /* if all iommus are freed, free g_iommus */
        for (i = 0; i < g_num_of_iommus; i++) {
                if (g_iommus[i])
                        break;
        }

        if (i == g_num_of_iommus)
                kfree(g_iommus);

        /* free context mapping */
        free_context_table(iommu);
}
1323
1324 static struct dmar_domain *alloc_domain(void)
1325 {
1326         struct dmar_domain *domain;
1327
1328         domain = alloc_domain_mem();
1329         if (!domain)
1330                 return NULL;
1331
1332         domain->nid = -1;
1333         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1334         domain->flags = 0;
1335
1336         return domain;
1337 }
1338
1339 static int iommu_attach_domain(struct dmar_domain *domain,
1340                                struct intel_iommu *iommu)
1341 {
1342         int num;
1343         unsigned long ndomains;
1344         unsigned long flags;
1345
1346         ndomains = cap_ndoms(iommu->cap);
1347
1348         spin_lock_irqsave(&iommu->lock, flags);
1349
1350         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1351         if (num >= ndomains) {
1352                 spin_unlock_irqrestore(&iommu->lock, flags);
1353                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1354                 return -ENOMEM;
1355         }
1356
1357         domain->id = num;
1358         set_bit(num, iommu->domain_ids);
1359         set_bit(iommu->seq_id, &domain->iommu_bmp);
1360         iommu->domains[num] = domain;
1361         spin_unlock_irqrestore(&iommu->lock, flags);
1362
1363         return 0;
1364 }
1365
1366 static void iommu_detach_domain(struct dmar_domain *domain,
1367                                 struct intel_iommu *iommu)
1368 {
1369         unsigned long flags;
1370         int num, ndomains;
1371         int found = 0;
1372
1373         spin_lock_irqsave(&iommu->lock, flags);
1374         ndomains = cap_ndoms(iommu->cap);
1375         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1376                 if (iommu->domains[num] == domain) {
1377                         found = 1;
1378                         break;
1379                 }
1380         }
1381
1382         if (found) {
1383                 clear_bit(num, iommu->domain_ids);
1384                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1385                 iommu->domains[num] = NULL;
1386         }
1387         spin_unlock_irqrestore(&iommu->lock, flags);
1388 }
1389
1390 static struct iova_domain reserved_iova_list;
1391 static struct lock_class_key reserved_rbtree_key;
1392
/*
 * Build the global list of IOVA ranges that must never be handed out
 * for DMA: the IOAPIC MMIO window and every PCI device's MMIO BARs
 * (the latter to keep peer-to-peer traffic from being remapped).
 * Returns 0 on success or -ENODEV when a reservation fails.
 */
static int dmar_init_reserved_ranges(void)
{
        struct pci_dev *pdev = NULL;
        struct iova *iova;
        int i;

        init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);

        /* separate lockdep class: this iova domain nests differently
           from the per-device ones */
        lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
                &reserved_rbtree_key);

        /* IOAPIC ranges shouldn't be accessed by DMA */
        iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
                IOVA_PFN(IOAPIC_RANGE_END));
        if (!iova) {
                printk(KERN_ERR "Reserve IOAPIC range failed\n");
                return -ENODEV;
        }

        /* Reserve all PCI MMIO to avoid peer-to-peer access */
        for_each_pci_dev(pdev) {
                struct resource *r;

                for (i = 0; i < PCI_NUM_RESOURCES; i++) {
                        r = &pdev->resource[i];
                        if (!r->flags || !(r->flags & IORESOURCE_MEM))
                                continue;
                        iova = reserve_iova(&reserved_iova_list,
                                            IOVA_PFN(r->start),
                                            IOVA_PFN(r->end));
                        if (!iova) {
                                printk(KERN_ERR "Reserve iova failed\n");
                                return -ENODEV;
                        }
                }
        }
        return 0;
}
1431
/* Copy the globally reserved IOVA ranges (IOAPIC window, PCI MMIO) into
 * a newly initialized domain's iova allocator. */
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
        copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
1436
/*
 * Round a guest address width up to an adjusted width the page tables
 * can express: 12 bits of page offset plus a whole number of 9-bit
 * levels, capped at 64 bits.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
        int rem = (gaw - 12) % 9;
        int agaw = (rem == 0) ? gaw : gaw + 9 - rem;

        return (agaw > 64) ? 64 : agaw;
}
1450
/*
 * Initialize a freshly allocated domain for a guest address width of
 * @guest_width bits: set up its iova allocator, pick an adjusted guest
 * address width (AGAW) the hardware supports, record the iommu's
 * coherency/snoop/superpage capabilities, and allocate the top-level
 * page directory. Returns 0, -ENODEV (no supported AGAW), or -ENOMEM.
 */
static int domain_init(struct dmar_domain *domain, int guest_width)
{
        struct intel_iommu *iommu;
        int adjust_width, agaw;
        unsigned long sagaw;

        init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
        spin_lock_init(&domain->iommu_lock);

        domain_reserve_special_ranges(domain);

        /* calculate AGAW */
        iommu = domain_get_iommu(domain);
        if (guest_width > cap_mgaw(iommu->cap))
                guest_width = cap_mgaw(iommu->cap);
        domain->gaw = guest_width;
        adjust_width = guestwidth_to_adjustwidth(guest_width);
        agaw = width_to_agaw(adjust_width);
        sagaw = cap_sagaw(iommu->cap);
        if (!test_bit(agaw, &sagaw)) {
                /* hardware doesn't support it, choose a bigger one */
                pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
                agaw = find_next_bit(&sagaw, 5, agaw);
                if (agaw >= 5)
                        return -ENODEV;
        }
        domain->agaw = agaw;
        INIT_LIST_HEAD(&domain->devices);

        /* record cache-coherency and snoop-control as advertised */
        if (ecap_coherent(iommu->ecap))
                domain->iommu_coherency = 1;
        else
                domain->iommu_coherency = 0;

        if (ecap_sc_support(iommu->ecap))
                domain->iommu_snooping = 1;
        else
                domain->iommu_snooping = 0;

        /* highest superpage level supported (0 = 4KiB pages only) */
        domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
        domain->iommu_count = 1;
        domain->nid = iommu->node;

        /* always allocate the top pgd */
        domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
        if (!domain->pgd)
                return -ENOMEM;
        __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
        return 0;
}
1501
/*
 * Destroy @domain: unbind its devices, release its iova allocator,
 * clear and free its page tables, and detach it from every iommu it
 * was attached to before freeing the domain structure itself.
 */
static void domain_exit(struct dmar_domain *domain)
{
        struct dmar_drhd_unit *drhd;
        struct intel_iommu *iommu;

        /* Domain 0 is reserved, so dont process it */
        if (!domain)
                return;

        /* Flush any lazy unmaps that may reference this domain */
        if (!intel_iommu_strict)
                flush_unmaps_timeout(0);

        domain_remove_dev_info(domain);
        /* destroy iovas */
        put_iova_domain(&domain->iovad);

        /* clear ptes */
        dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

        /* free page tables */
        dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

        for_each_active_iommu(iommu, drhd)
                if (test_bit(iommu->seq_id, &domain->iommu_bmp))
                        iommu_detach_domain(domain, iommu);

        free_domain_mem(domain);
}
1531
/*
 * Program the context entry for device @segment:@bus:@devfn so that it
 * points at @domain's page tables (or pass-through). Chooses a domain
 * id valid on this iommu for VM/SI domains, enables device IOTLB when
 * available, and performs the required context/IOTLB flushes. Returns
 * 0 on success (including when the entry was already present) or a
 * negative errno.
 */
static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
                                 u8 bus, u8 devfn, int translation)
{
        struct context_entry *context;
        unsigned long flags;
        struct intel_iommu *iommu;
        struct dma_pte *pgd;
        unsigned long num;
        unsigned long ndomains;
        int id;
        int agaw;
        struct device_domain_info *info = NULL;

        pr_debug("Set context mapping for %02x:%02x.%d\n",
                bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

        BUG_ON(!domain->pgd);
        BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
               translation != CONTEXT_TT_MULTI_LEVEL);

        iommu = device_to_iommu(segment, bus, devfn);
        if (!iommu)
                return -ENODEV;

        context = device_to_context_entry(iommu, bus, devfn);
        if (!context)
                return -ENOMEM;
        spin_lock_irqsave(&iommu->lock, flags);
        /* already mapped by an earlier call: nothing to do */
        if (context_present(context)) {
                spin_unlock_irqrestore(&iommu->lock, flags);
                return 0;
        }

        id = domain->id;
        pgd = domain->pgd;

        /* VM and static-identity domains span iommus, so domain->id may
         * not be valid here; find or claim an id on this iommu. */
        if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
            domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
                int found = 0;

                /* find an available domain id for this device in iommu */
                ndomains = cap_ndoms(iommu->cap);
                for_each_set_bit(num, iommu->domain_ids, ndomains) {
                        if (iommu->domains[num] == domain) {
                                id = num;
                                found = 1;
                                break;
                        }
                }

                if (found == 0) {
                        num = find_first_zero_bit(iommu->domain_ids, ndomains);
                        if (num >= ndomains) {
                                spin_unlock_irqrestore(&iommu->lock, flags);
                                printk(KERN_ERR "IOMMU: no free domain ids\n");
                                return -EFAULT;
                        }

                        set_bit(num, iommu->domain_ids);
                        iommu->domains[num] = domain;
                        id = num;
                }

                /* Skip top levels of page tables for
                 * iommu which has less agaw than default.
                 * Unnecessary for PT mode.
                 */
                if (translation != CONTEXT_TT_PASS_THROUGH) {
                        for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
                                pgd = phys_to_virt(dma_pte_addr(pgd));
                                if (!dma_pte_present(pgd)) {
                                        spin_unlock_irqrestore(&iommu->lock, flags);
                                        return -ENOMEM;
                                }
                        }
                }
        }

        context_set_domain_id(context, id);

        /* upgrade to device-IOTLB translation when the device supports ATS */
        if (translation != CONTEXT_TT_PASS_THROUGH) {
                info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
                translation = info ? CONTEXT_TT_DEV_IOTLB :
                                     CONTEXT_TT_MULTI_LEVEL;
        }
        /*
         * In pass through mode, AW must be programmed to indicate the largest
         * AGAW value supported by hardware. And ASR is ignored by hardware.
         */
        if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
                context_set_address_width(context, iommu->msagaw);
        else {
                context_set_address_root(context, virt_to_phys(pgd));
                context_set_address_width(context, iommu->agaw);
        }

        context_set_translation_type(context, translation);
        context_set_fault_enable(context);
        context_set_present(context);
        domain_flush_cache(domain, context, sizeof(*context));

        /*
         * It's a non-present to present mapping. If hardware doesn't cache
         * non-present entry we only need to flush the write-buffer. If the
         * _does_ cache non-present entries, then it does so in the special
         * domain #0, which we have to flush:
         */
        if (cap_caching_mode(iommu->cap)) {
                iommu->flush.flush_context(iommu, 0,
                                           (((u16)bus) << 8) | devfn,
                                           DMA_CCMD_MASK_NOBIT,
                                           DMA_CCMD_DEVICE_INVL);
                iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
        } else {
                iommu_flush_write_buffer(iommu);
        }
        iommu_enable_dev_iotlb(info);
        spin_unlock_irqrestore(&iommu->lock, flags);

        /* first mapping on this iommu: account it in the domain */
        spin_lock_irqsave(&domain->iommu_lock, flags);
        if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
                domain->iommu_count++;
                if (domain->iommu_count == 1)
                        domain->nid = iommu->node;
                domain_update_iommu_cap(domain);
        }
        spin_unlock_irqrestore(&domain->iommu_lock, flags);
        return 0;
}
1661
/*
 * Set up context entries for @pdev and, when it sits behind a
 * PCIe-to-PCI bridge, for every bridge on the path as well, since DMA
 * from behind such a bridge may carry the bridge's source-id rather
 * than the device's. Returns 0 on success or a negative errno.
 */
static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
                        int translation)
{
        int ret;
        struct pci_dev *tmp, *parent;

        ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
                                         pdev->bus->number, pdev->devfn,
                                         translation);
        if (ret)
                return ret;

        /* dependent device mapping */
        tmp = pci_find_upstream_pcie_bridge(pdev);
        if (!tmp)
                return 0;
        /* Secondary interface's bus number and devfn 0 */
        parent = pdev->bus->self;
        while (parent != tmp) {
                ret = domain_context_mapping_one(domain,
                                                 pci_domain_nr(parent->bus),
                                                 parent->bus->number,
                                                 parent->devfn, translation);
                if (ret)
                        return ret;
                parent = parent->bus->self;
        }
        if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
                return domain_context_mapping_one(domain,
                                        pci_domain_nr(tmp->subordinate),
                                        tmp->subordinate->number, 0,
                                        translation);
        else /* this is a legacy PCI bridge */
                return domain_context_mapping_one(domain,
                                                  pci_domain_nr(tmp->bus),
                                                  tmp->bus->number,
                                                  tmp->devfn,
                                                  translation);
}
1702
/*
 * Check whether @pdev (and, when behind a PCIe-to-PCI bridge, the whole
 * bridge path above it) already has context entries programmed.
 * Returns nonzero when every relevant entry is present, 0 when any is
 * missing, or -ENODEV when no iommu covers the device.
 */
static int domain_context_mapped(struct pci_dev *pdev)
{
        int ret;
        struct pci_dev *tmp, *parent;
        struct intel_iommu *iommu;

        iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
                                pdev->devfn);
        if (!iommu)
                return -ENODEV;

        ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
        if (!ret)
                return ret;
        /* dependent device mapping */
        tmp = pci_find_upstream_pcie_bridge(pdev);
        if (!tmp)
                return ret;
        /* Secondary interface's bus number and devfn 0 */
        parent = pdev->bus->self;
        while (parent != tmp) {
                ret = device_context_mapped(iommu, parent->bus->number,
                                            parent->devfn);
                if (!ret)
                        return ret;
                parent = parent->bus->self;
        }
        if (pci_is_pcie(tmp))
                return device_context_mapped(iommu, tmp->subordinate->number,
                                             0);
        else
                return device_context_mapped(iommu, tmp->bus->number,
                                             tmp->devfn);
}
1737
/* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
                                            size_t size)
{
        /* keep only the sub-page offset of host_addr, then round the
           whole span up to an MM page and convert to 4KiB VTD pages */
        host_addr &= ~PAGE_MASK;
        return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
}
1745
1746 /* Return largest possible superpage level for a given mapping */
1747 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1748                                           unsigned long iov_pfn,
1749                                           unsigned long phy_pfn,
1750                                           unsigned long pages)
1751 {
1752         int support, level = 1;
1753         unsigned long pfnmerge;
1754
1755         support = domain->iommu_superpage;
1756
1757         /* To use a large page, the virtual *and* physical addresses
1758            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1759            of them will mean we have to use smaller pages. So just
1760            merge them and check both at once. */
1761         pfnmerge = iov_pfn | phy_pfn;
1762
1763         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1764                 pages >>= VTD_STRIDE_SHIFT;
1765                 if (!pages)
1766                         break;
1767                 pfnmerge >>= VTD_STRIDE_SHIFT;
1768                 level++;
1769                 support--;
1770         }
1771         return level;
1772 }
1773
/*
 * __domain_mapping - install PTEs for @nr_pages starting at @iov_pfn.
 *
 * Exactly one of @sg and @phys_pfn is used: with a scatterlist the
 * physical pages come from the sg entries (each entry's dma_address and
 * dma_length are filled in as we go); otherwise @phys_pfn names one
 * physically contiguous range.  Superpages are used whenever alignment
 * and hardware support allow.  Returns 0, -EINVAL on a protection mask
 * with neither read nor write, or -ENOMEM if page tables can't be
 * allocated.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned long sg_res;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	if (sg)
		sg_res = 0;
	else {
		/* +1 keeps sg_res from reaching zero before nr_pages does,
		   so the !sg_res branch below never dereferences sg. */
		sg_res = nr_pages + 1;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			/* Advance to the next scatterlist entry. */
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* Using a superpage: set/clear the large-page bit
			   to match the level chosen above. */
			if (largepage_lvl > 1)
				pteval |= DMA_PTE_LARGE_PAGE;
			else
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was already set: this IOVA is double-mapped.
			   Complain loudly, but only dump mappings for the
			   first few occurrences. */
			static int dumps = 5;
			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
			       iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
1873
/* Map the pages of scatterlist @sg into the IOVA range starting at
 * @iov_pfn; thin wrapper around __domain_mapping(). */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}
1880
/* Map @nr_pages of physically contiguous memory starting at @phys_pfn
 * to IOVA @iov_pfn; thin wrapper around __domain_mapping(). */
static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
1887
/*
 * Tear down the context entry for (@bus, @devfn) on @iommu and flush the
 * context and IOTLB caches globally.  A NULL @iommu is a no-op.
 */
static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu)
		return;

	clear_context_table(iommu, bus, devfn);
	iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
1898
/*
 * Detach every device from @domain: unlink each device_domain_info from
 * both lists, clear the device's context entry and free the bookkeeping.
 */
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;
		/* Drop the lock around the hardware flushes below; the
		   loop restarts from the list head after re-acquiring, so
		   concurrent list changes are tolerated. */
		spin_unlock_irqrestore(&device_domain_lock, flags);

		iommu_disable_dev_iotlb(info);
		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1924
1925 /*
1926  * find_domain
1927  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1928  */
1929 static struct dmar_domain *
1930 find_domain(struct pci_dev *pdev)
1931 {
1932         struct device_domain_info *info;
1933
1934         /* No lock here, assumes no domain exit in normal case */
1935         info = pdev->dev.archdata.iommu;
1936         if (info)
1937                 return info->domain;
1938         return NULL;
1939 }
1940
1941 /* domain is initialized */
1942 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1943 {
1944         struct dmar_domain *domain, *found = NULL;
1945         struct intel_iommu *iommu;
1946         struct dmar_drhd_unit *drhd;
1947         struct device_domain_info *info, *tmp;
1948         struct pci_dev *dev_tmp;
1949         unsigned long flags;
1950         int bus = 0, devfn = 0;
1951         int segment;
1952         int ret;
1953
1954         domain = find_domain(pdev);
1955         if (domain)
1956                 return domain;
1957
1958         segment = pci_domain_nr(pdev->bus);
1959
1960         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1961         if (dev_tmp) {
1962                 if (pci_is_pcie(dev_tmp)) {
1963                         bus = dev_tmp->subordinate->number;
1964                         devfn = 0;
1965                 } else {
1966                         bus = dev_tmp->bus->number;
1967                         devfn = dev_tmp->devfn;
1968                 }
1969                 spin_lock_irqsave(&device_domain_lock, flags);
1970                 list_for_each_entry(info, &device_domain_list, global) {
1971                         if (info->segment == segment &&
1972                             info->bus == bus && info->devfn == devfn) {
1973                                 found = info->domain;
1974                                 break;
1975                         }
1976                 }
1977                 spin_unlock_irqrestore(&device_domain_lock, flags);
1978                 /* pcie-pci bridge already has a domain, uses it */
1979                 if (found) {
1980                         domain = found;
1981                         goto found_domain;
1982                 }
1983         }
1984
1985         domain = alloc_domain();
1986         if (!domain)
1987                 goto error;
1988
1989         /* Allocate new domain for the device */
1990         drhd = dmar_find_matched_drhd_unit(pdev);
1991         if (!drhd) {
1992                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1993                         pci_name(pdev));
1994                 return NULL;
1995         }
1996         iommu = drhd->iommu;
1997
1998         ret = iommu_attach_domain(domain, iommu);
1999         if (ret) {
2000                 free_domain_mem(domain);
2001                 goto error;
2002         }
2003
2004         if (domain_init(domain, gaw)) {
2005                 domain_exit(domain);
2006                 goto error;
2007         }
2008
2009         /* register pcie-to-pci device */
2010         if (dev_tmp) {
2011                 info = alloc_devinfo_mem();
2012                 if (!info) {
2013                         domain_exit(domain);
2014                         goto error;
2015                 }
2016                 info->segment = segment;
2017                 info->bus = bus;
2018                 info->devfn = devfn;
2019                 info->dev = NULL;
2020                 info->domain = domain;
2021                 /* This domain is shared by devices under p2p bridge */
2022                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2023
2024                 /* pcie-to-pci bridge already has a domain, uses it */
2025                 found = NULL;
2026                 spin_lock_irqsave(&device_domain_lock, flags);
2027                 list_for_each_entry(tmp, &device_domain_list, global) {
2028                         if (tmp->segment == segment &&
2029                             tmp->bus == bus && tmp->devfn == devfn) {
2030                                 found = tmp->domain;
2031                                 break;
2032                         }
2033                 }
2034                 if (found) {
2035                         spin_unlock_irqrestore(&device_domain_lock, flags);
2036                         free_devinfo_mem(info);
2037                         domain_exit(domain);
2038                         domain = found;
2039                 } else {
2040                         list_add(&info->link, &domain->devices);
2041                         list_add(&info->global, &device_domain_list);
2042                         spin_unlock_irqrestore(&device_domain_lock, flags);
2043                 }
2044         }
2045
2046 found_domain:
2047         info = alloc_devinfo_mem();
2048         if (!info)
2049                 goto error;
2050         info->segment = segment;
2051         info->bus = pdev->bus->number;
2052         info->devfn = pdev->devfn;
2053         info->dev = pdev;
2054         info->domain = domain;
2055         spin_lock_irqsave(&device_domain_lock, flags);
2056         /* somebody is fast */
2057         found = find_domain(pdev);
2058         if (found != NULL) {
2059                 spin_unlock_irqrestore(&device_domain_lock, flags);
2060                 if (found != domain) {
2061                         domain_exit(domain);
2062                         domain = found;
2063                 }
2064                 free_devinfo_mem(info);
2065                 return domain;
2066         }
2067         list_add(&info->link, &domain->devices);
2068         list_add(&info->global, &device_domain_list);
2069         pdev->dev.archdata.iommu = info;
2070         spin_unlock_irqrestore(&device_domain_lock, flags);
2071         return domain;
2072 error:
2073         /* recheck it here, maybe others set it */
2074         return find_domain(pdev);
2075 }
2076
/* Bitmask selecting which device classes get a static 1:1 mapping. */
static int iommu_identity_mapping;
#define IDENTMAP_ALL            1	/* every suitable device */
#define IDENTMAP_GFX            2	/* graphics devices */
#define IDENTMAP_AZALIA         4	/* Azalia HD-audio quirk */
2081
/*
 * Reserve the IOVA range [start, end] in @domain and install a 1:1
 * (identity) mapping for it.  Returns 0 on success or a negative errno.
 */
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
		 start, end, domain->id);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
				  last_vpfn - first_vpfn + 1,
				  DMA_PTE_READ|DMA_PTE_WRITE);
}
2107
2108 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2109                                       unsigned long long start,
2110                                       unsigned long long end)
2111 {
2112         struct dmar_domain *domain;
2113         int ret;
2114
2115         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2116         if (!domain)
2117                 return -ENOMEM;
2118
2119         /* For _hardware_ passthrough, don't bother. But for software
2120            passthrough, we do it anyway -- it may indicate a memory
2121            range which is reserved in E820, so which didn't get set
2122            up to start with in si_domain */
2123         if (domain == si_domain && hw_pass_through) {
2124                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2125                        pci_name(pdev), start, end);
2126                 return 0;
2127         }
2128
2129         printk(KERN_INFO
2130                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2131                pci_name(pdev), start, end);
2132         
2133         if (end < start) {
2134                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2135                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2136                         dmi_get_system_info(DMI_BIOS_VENDOR),
2137                         dmi_get_system_info(DMI_BIOS_VERSION),
2138                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2139                 ret = -EIO;
2140                 goto error;
2141         }
2142
2143         if (end >> agaw_to_width(domain->agaw)) {
2144                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2145                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2146                      agaw_to_width(domain->agaw),
2147                      dmi_get_system_info(DMI_BIOS_VENDOR),
2148                      dmi_get_system_info(DMI_BIOS_VERSION),
2149                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2150                 ret = -EIO;
2151                 goto error;
2152         }
2153
2154         ret = iommu_domain_identity_map(domain, start, end);
2155         if (ret)
2156                 goto error;
2157
2158         /* context entry init */
2159         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2160         if (ret)
2161                 goto error;
2162
2163         return 0;
2164
2165  error:
2166         domain_exit(domain);
2167         return ret;
2168 }
2169
2170 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2171         struct pci_dev *pdev)
2172 {
2173         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2174                 return 0;
2175         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2176                 rmrr->end_address);
2177 }
2178
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Workaround: legacy floppy drivers DMA below 16MiB without using the
 * DMA API, so give the first ISA (LPC) bridge an identity map of the
 * first 16MiB.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");

}
#else
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
2203
2204 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2205
2206 static int __init si_domain_work_fn(unsigned long start_pfn,
2207                                     unsigned long end_pfn, void *datax)
2208 {
2209         int *ret = datax;
2210
2211         *ret = iommu_domain_identity_map(si_domain,
2212                                          (uint64_t)start_pfn << PAGE_SHIFT,
2213                                          (uint64_t)end_pfn << PAGE_SHIFT);
2214         return *ret;
2215
2216 }
2217
/*
 * si_domain_init - build the static identity domain shared by all
 * identity-mapped devices.
 * @hw: non-zero when hardware pass-through is used, in which case the
 *      page tables are never walked and need not be populated.
 *
 * Returns 0 on success, -EFAULT on any initialization failure.
 */
static int __init si_domain_init(int hw)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int nid, ret = 0;

	si_domain = alloc_domain();
	if (!si_domain)
		return -EFAULT;

	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);

	/* The identity domain must be usable from every IOMMU. */
	for_each_active_iommu(iommu, drhd) {
		ret = iommu_attach_domain(si_domain, iommu);
		if (ret) {
			domain_exit(si_domain);
			return -EFAULT;
		}
	}

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;

	if (hw)
		return 0;

	/* Software pass-through: populate a 1:1 mapping for all RAM. */
	for_each_online_node(nid) {
		work_with_active_regions(nid, si_domain_work_fn, &ret);
		if (ret)
			return ret;
	}

	return 0;
}
2256
2257 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2258                                           struct pci_dev *pdev);
2259 static int identity_mapping(struct pci_dev *pdev)
2260 {
2261         struct device_domain_info *info;
2262
2263         if (likely(!iommu_identity_mapping))
2264                 return 0;
2265
2266         info = pdev->dev.archdata.iommu;
2267         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2268                 return (info->domain == si_domain);
2269
2270         return 0;
2271 }
2272
/*
 * domain_add_dev_info - attach @pdev to @domain.
 * @translation: context-entry translation type (CONTEXT_TT_*)
 *
 * Installs the context mapping first, then registers the device in the
 * domain's device list under device_domain_lock.  Returns 0 on success
 * or a negative errno.
 */
static int domain_add_dev_info(struct dmar_domain *domain,
			       struct pci_dev *pdev,
			       int translation)
{
	struct device_domain_info *info;
	unsigned long flags;
	int ret;

	info = alloc_devinfo_mem();
	if (!info)
		return -ENOMEM;

	ret = domain_context_mapping(domain, pdev, translation);
	if (ret) {
		free_devinfo_mem(info);
		return ret;
	}

	info->segment = pci_domain_nr(pdev->bus);
	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
2305
/*
 * Decide whether @pdev should live in the static identity domain.
 * @startup: non-zero during boot-time setup, when DMA masks are not yet
 *           known and we optimistically assume 64-bit capability.
 *
 * Returns non-zero to identity-map the device, 0 to give it a private
 * DMA-remapping domain.
 */
static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
{
	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
		return 1;

	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
		return 1;

	if (!(iommu_identity_mapping & IDENTMAP_ALL))
		return 0;

	/*
	 * We want to start off with all devices in the 1:1 domain, and
	 * take them out later if we find they can't access all of memory.
	 *
	 * However, we can't do this for PCI devices behind bridges,
	 * because all PCI devices behind the same bridge will end up
	 * with the same source-id on their transactions.
	 *
	 * Practically speaking, we can't change things around for these
	 * devices at run-time, because we can't be sure there'll be no
	 * DMA transactions in flight for any of their siblings.
	 * 
	 * So PCI devices (unless they're on the root bus) as well as
	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
	 * the 1:1 domain, just in _case_ one of their siblings turns out
	 * not to be able to map all of memory.
	 */
	if (!pci_is_pcie(pdev)) {
		if (!pci_is_root_bus(pdev->bus))
			return 0;
		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
			return 0;
	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
		return 0;

	/* 
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will -- if they turn out not to be, then we can 
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = pdev->dma_mask;

		/* Use the more restrictive of the streaming and coherent
		   masks, if both are set. */
		if (pdev->dev.coherent_dma_mask &&
		    pdev->dev.coherent_dma_mask < dma_mask)
			dma_mask = pdev->dev.coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(&pdev->dev);
	}

	return 1;
}
2363
/*
 * Build the static identity domain and attach every qualifying PCI
 * device to it, using hardware pass-through context entries when @hw is
 * non-zero and multi-level (1:1 page table) entries otherwise.
 * Returns 0 on success or a negative errno.
 */
static int __init iommu_prepare_static_identity_mapping(int hw)
{
	struct pci_dev *pdev = NULL;
	int ret;

	ret = si_domain_init(hw);
	if (ret)
		return -EFAULT;

	for_each_pci_dev(pdev) {
		/* Skip Host/PCI Bridge devices */
		if (IS_BRIDGE_HOST_DEVICE(pdev))
			continue;
		if (iommu_should_identity_map(pdev, 1)) {
			printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
			       hw ? "hardware" : "software", pci_name(pdev));

			ret = domain_add_dev_info(si_domain, pdev,
						     hw ? CONTEXT_TT_PASS_THROUGH :
						     CONTEXT_TT_MULTI_LEVEL);
			if (ret)
				return ret;
		}
	}

	return 0;
}
2391
2392 static int __init init_dmars(void)
2393 {
2394         struct dmar_drhd_unit *drhd;
2395         struct dmar_rmrr_unit *rmrr;
2396         struct pci_dev *pdev;
2397         struct intel_iommu *iommu;
2398         int i, ret;
2399
2400         /*
2401          * for each drhd
2402          *    allocate root
2403          *    initialize and program root entry to not present
2404          * endfor
2405          */
2406         for_each_drhd_unit(drhd) {
2407                 g_num_of_iommus++;
2408                 /*
2409                  * lock not needed as this is only incremented in the single
2410                  * threaded kernel __init code path all other access are read
2411                  * only
2412                  */
2413         }
2414
2415         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2416                         GFP_KERNEL);
2417         if (!g_iommus) {
2418                 printk(KERN_ERR "Allocating global iommu array failed\n");
2419                 ret = -ENOMEM;
2420                 goto error;
2421         }
2422
2423         deferred_flush = kzalloc(g_num_of_iommus *
2424                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2425         if (!deferred_flush) {
2426                 ret = -ENOMEM;
2427                 goto error;
2428         }
2429
2430         for_each_drhd_unit(drhd) {
2431                 if (drhd->ignored)
2432                         continue;
2433
2434                 iommu = drhd->iommu;
2435                 g_iommus[iommu->seq_id] = iommu;
2436
2437                 ret = iommu_init_domains(iommu);
2438                 if (ret)
2439                         goto error;
2440
2441                 /*
2442                  * TBD:
2443                  * we could share the same root & context tables
2444                  * among all IOMMU's. Need to Split it later.
2445                  */
2446                 ret = iommu_alloc_root_entry(iommu);
2447                 if (ret) {
2448                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2449                         goto error;
2450                 }
2451                 if (!ecap_pass_through(iommu->ecap))
2452                         hw_pass_through = 0;
2453         }
2454
2455         /*
2456          * Start from the sane iommu hardware state.
2457          */
2458         for_each_drhd_unit(drhd) {
2459                 if (drhd->ignored)
2460                         continue;
2461
2462                 iommu = drhd->iommu;
2463
2464                 /*
2465                  * If the queued invalidation is already initialized by us
2466                  * (for example, while enabling interrupt-remapping) then
2467                  * we got the things already rolling from a sane state.
2468                  */
2469                 if (iommu->qi)
2470                         continue;
2471
2472                 /*
2473                  * Clear any previous faults.
2474                  */
2475                 dmar_fault(-1, iommu);
2476                 /*
2477                  * Disable queued invalidation if supported and already enabled
2478                  * before OS handover.
2479                  */
2480                 dmar_disable_qi(iommu);
2481         }
2482
2483         for_each_drhd_unit(drhd) {
2484                 if (drhd->ignored)
2485                         continue;
2486
2487                 iommu = drhd->iommu;
2488
2489                 if (dmar_enable_qi(iommu)) {
2490                         /*
2491                          * Queued Invalidate not enabled, use Register Based
2492                          * Invalidate
2493                          */
2494                         iommu->flush.flush_context = __iommu_flush_context;
2495                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2496                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2497                                "invalidation\n",
2498                                 iommu->seq_id,
2499                                (unsigned long long)drhd->reg_base_addr);
2500                 } else {
2501                         iommu->flush.flush_context = qi_flush_context;
2502                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2503                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2504                                "invalidation\n",
2505                                 iommu->seq_id,
2506                                (unsigned long long)drhd->reg_base_addr);
2507                 }
2508         }
2509
2510         if (iommu_pass_through)
2511                 iommu_identity_mapping |= IDENTMAP_ALL;
2512
2513 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2514         iommu_identity_mapping |= IDENTMAP_GFX;
2515 #endif
2516
2517         check_tylersburg_isoch();
2518
2519         /*
2520          * If pass through is not set or not enabled, setup context entries for
2521          * identity mappings for rmrr, gfx, and isa and may fall back to static
2522          * identity mapping if iommu_identity_mapping is set.
2523          */
2524         if (iommu_identity_mapping) {
2525                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2526                 if (ret) {
2527                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2528                         goto error;
2529                 }
2530         }
2531         /*
2532          * For each rmrr
2533          *   for each dev attached to rmrr
2534          *   do
2535          *     locate drhd for dev, alloc domain for dev
2536          *     allocate free domain
2537          *     allocate page table entries for rmrr
2538          *     if context not allocated for bus
2539          *           allocate and init context
2540          *           set present in root table for this bus
2541          *     init context with domain, translation etc
2542          *    endfor
2543          * endfor
2544          */
2545         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2546         for_each_rmrr_units(rmrr) {
2547                 for (i = 0; i < rmrr->devices_cnt; i++) {
2548                         pdev = rmrr->devices[i];
2549                         /*
2550                          * some BIOS lists non-exist devices in DMAR
2551                          * table.
2552                          */
2553                         if (!pdev)
2554                                 continue;
2555                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2556                         if (ret)
2557                                 printk(KERN_ERR
2558                                        "IOMMU: mapping reserved region failed\n");
2559                 }
2560         }
2561
2562         iommu_prepare_isa();
2563
2564         /*
2565          * for each drhd
2566          *   enable fault log
2567          *   global invalidate context cache
2568          *   global invalidate iotlb
2569          *   enable translation
2570          */
2571         for_each_drhd_unit(drhd) {
2572                 if (drhd->ignored) {
2573                         /*
2574                          * we always have to disable PMRs or DMA may fail on
2575                          * this device
2576                          */
2577                         if (force_on)
2578                                 iommu_disable_protect_mem_regions(drhd->iommu);
2579                         continue;
2580                 }
2581                 iommu = drhd->iommu;
2582
2583                 iommu_flush_write_buffer(iommu);
2584
2585                 ret = dmar_set_interrupt(iommu);
2586                 if (ret)
2587                         goto error;
2588
2589                 iommu_set_root_entry(iommu);
2590
2591                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2592                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2593
2594                 ret = iommu_enable_translation(iommu);
2595                 if (ret)
2596                         goto error;
2597
2598                 iommu_disable_protect_mem_regions(iommu);
2599         }
2600
2601         return 0;
2602 error:
2603         for_each_drhd_unit(drhd) {
2604                 if (drhd->ignored)
2605                         continue;
2606                 iommu = drhd->iommu;
2607                 free_iommu(iommu);
2608         }
2609         kfree(g_iommus);
2610         return ret;
2611 }
2612
2613 /* This takes a number of _MM_ pages, not VTD pages */
2614 static struct iova *intel_alloc_iova(struct device *dev,
2615                                      struct dmar_domain *domain,
2616                                      unsigned long nrpages, uint64_t dma_mask)
2617 {
2618         struct pci_dev *pdev = to_pci_dev(dev);
2619         struct iova *iova = NULL;
2620
2621         /* Restrict dma_mask to the width that the iommu can handle */
2622         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2623
2624         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2625                 /*
2626                  * First try to allocate an io virtual address in
2627                  * DMA_BIT_MASK(32) and if that fails then try allocating
2628                  * from higher range
2629                  */
2630                 iova = alloc_iova(&domain->iovad, nrpages,
2631                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2632                 if (iova)
2633                         return iova;
2634         }
2635         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2636         if (unlikely(!iova)) {
2637                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2638                        nrpages, pci_name(pdev));
2639                 return NULL;
2640         }
2641
2642         return iova;
2643 }
2644
2645 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2646 {
2647         struct dmar_domain *domain;
2648         int ret;
2649
2650         domain = get_domain_for_dev(pdev,
2651                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2652         if (!domain) {
2653                 printk(KERN_ERR
2654                         "Allocating domain for %s failed", pci_name(pdev));
2655                 return NULL;
2656         }
2657
2658         /* make sure context mapping is ok */
2659         if (unlikely(!domain_context_mapped(pdev))) {
2660                 ret = domain_context_mapping(domain, pdev,
2661                                              CONTEXT_TT_MULTI_LEVEL);
2662                 if (ret) {
2663                         printk(KERN_ERR
2664                                 "Domain context map for %s failed",
2665                                 pci_name(pdev));
2666                         return NULL;
2667                 }
2668         }
2669
2670         return domain;
2671 }
2672
2673 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2674 {
2675         struct device_domain_info *info;
2676
2677         /* No lock here, assumes no domain exit in normal case */
2678         info = dev->dev.archdata.iommu;
2679         if (likely(info))
2680                 return info->domain;
2681
2682         return __get_valid_domain_for_dev(dev);
2683 }
2684
2685 static int iommu_dummy(struct pci_dev *pdev)
2686 {
2687         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2688 }
2689
/* Check if the pdev needs to go through non-identity map and unmap process.*/
/*
 * Returns 1 when @dev bypasses IOMMU translation (non-PCI, dummy, or
 * identity-mapped), 0 when it must use the normal map/unmap path.  May
 * move the device into or out of si_domain as a side effect when its
 * identity-mapping eligibility has changed.
 */
static int iommu_no_mapping(struct device *dev)
{
	struct pci_dev *pdev;
	int found;

	/* Only PCI devices are translated by this driver */
	if (unlikely(dev->bus != &pci_bus_type))
		return 1;

	pdev = to_pci_dev(dev);
	/* Device marked DUMMY_DEVICE_DOMAIN_INFO: never translate */
	if (iommu_dummy(pdev))
		return 1;

	if (!iommu_identity_mapping)
		return 0;

	found = identity_mapping(pdev);
	if (found) {
		if (iommu_should_identity_map(pdev, 0))
			return 1;
		else {
			/*
			 * 32 bit DMA is removed from si_domain and fall back
			 * to non-identity mapping.
			 */
			domain_remove_one_dev_info(si_domain, pdev);
			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
			       pci_name(pdev));
			return 0;
		}
	} else {
		/*
		 * In case of a detached 64 bit DMA device from vm, the device
		 * is put into si_domain for identity mapping.
		 */
		if (iommu_should_identity_map(pdev, 0)) {
			int ret;
			ret = domain_add_dev_info(si_domain, pdev,
						  hw_pass_through ?
						  CONTEXT_TT_PASS_THROUGH :
						  CONTEXT_TT_MULTI_LEVEL);
			if (!ret) {
				printk(KERN_INFO "64bit %s uses identity mapping\n",
				       pci_name(pdev));
				return 1;
			}
		}
	}

	return 0;
}
2741
/*
 * Map a physically contiguous buffer of @size bytes at @paddr for DMA in
 * direction @dir, constrained by @dma_mask.  Returns the DMA address for
 * the device, @paddr unchanged for identity-mapped devices, or 0 on
 * failure.
 */
static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	/* Identity-mapped device: DMA address equals physical address */
	if (iommu_no_mapping(hwdev))
		return paddr;

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	/* size becomes a page count from here on, not bytes */
	size = aligned_nrpages(paddr, size);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be partial page, we should map the whole
	 * page.  Note: if two part of one page are separately mapped, we
	 * might have two guest_addr mapping to the same host paddr, but this
	 * is not a big problem
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
	else
		iommu_flush_write_buffer(iommu);

	/* Reconstruct the DMA address: IOVA page base plus intra-page offset */
	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (unsigned long long)paddr, dir);
	return 0;
}
2807
2808 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2809                                  unsigned long offset, size_t size,
2810                                  enum dma_data_direction dir,
2811                                  struct dma_attrs *attrs)
2812 {
2813         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2814                                   dir, to_pci_dev(dev)->dma_mask);
2815 }
2816
/*
 * Drain every per-IOMMU deferred-unmap queue: invalidate the IOTLB (one
 * global flush per IOMMU, or per-entry PSI flushes in caching mode) and
 * free the queued IOVAs.  Callers hold async_umap_flush_lock.
 */
static void flush_unmaps(void)
{
	int i, j;

	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		if (!iommu)
			continue;

		if (!deferred_flush[i].next)
			continue;

		/* In caching mode, global flushes turn emulation expensive */
		if (!cap_caching_mode(iommu->cap))
			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		for (j = 0; j < deferred_flush[i].next; j++) {
			unsigned long mask;
			struct iova *iova = deferred_flush[i].iova[j];
			struct dmar_domain *domain = deferred_flush[i].domain[j];

			/* On real hardware multiple invalidations are expensive */
			if (cap_caching_mode(iommu->cap))
				iommu_flush_iotlb_psi(iommu, domain->id,
				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
			else {
				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
			}
			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
		}
		deferred_flush[i].next = 0;
	}

	list_size = 0;
}
2857
/* Timer callback armed by add_unmap(): drain the deferred-unmap queues. */
static void flush_unmaps_timeout(unsigned long data)
{
	unsigned long flags;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	flush_unmaps();
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
2866
/*
 * Queue @iova of domain @dom for deferred unmapping.  Flushes
 * synchronously once HIGH_WATER_MARK entries are pending, and arms a
 * 10ms timer so the queue is drained even when traffic stops.
 */
static void add_unmap(struct dmar_domain *dom, struct iova *iova)
{
	unsigned long flags;
	int next, iommu_id;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu = domain_get_iommu(dom);
	iommu_id = iommu->seq_id;

	/* Append to the queue of the IOMMU that owns this domain */
	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].next++;

	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
2892
/*
 * dma_map_ops .unmap_page hook: tear down the mapping at @dev_addr.
 * Clears the PTEs, frees the page tables, then either flushes the IOTLB
 * immediately (intel_iommu_strict) or queues the IOVA for a batched
 * deferred flush.
 */
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	/* Identity-mapped devices never had a translation installed */
	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
		      (unsigned long long)dev_addr))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 pci_name(pdev), start_pfn, last_pfn);

	/*  clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
2941
/*
 * dma_map_ops .alloc_coherent hook: allocate @size bytes of zeroed
 * memory and map it bidirectionally under the device's coherent DMA
 * mask.  Returns the kernel virtual address; *dma_handle receives the
 * DMA address.  Returns NULL on failure.
 */
static void *intel_alloc_coherent(struct device *hwdev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	/*
	 * Translated devices can use any physical memory; otherwise pick a
	 * GFP zone narrow enough for the coherent mask.
	 */
	if (!iommu_no_mapping(hwdev))
		flags &= ~(GFP_DMA | GFP_DMA32);
	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
			flags |= GFP_DMA;
		else
			flags |= GFP_DMA32;
	}

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	/* NOTE(review): virt_to_bus() here presumably equals virt_to_phys()
	 * on the platforms this driver runs on — confirm before changing. */
	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
					 DMA_BIDIRECTIONAL,
					 hwdev->coherent_dma_mask);
	if (*dma_handle)
		return vaddr;
	free_pages((unsigned long)vaddr, order);
	return NULL;
}
2973
2974 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2975                                 dma_addr_t dma_handle)
2976 {
2977         int order;
2978
2979         size = PAGE_ALIGN(size);
2980         order = get_order(size);
2981
2982         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2983         free_pages((unsigned long)vaddr, order);
2984 }
2985
/*
 * dma_map_ops .unmap_sg hook: tear down a scatterlist mapping created by
 * intel_map_sg().  The whole list lives in one IOVA allocation, located
 * via the first entry's dma_address; the flush is immediate in strict
 * mode, deferred otherwise.
 */
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	/* Identity-mapped devices never had a translation installed */
	if (iommu_no_mapping(hwdev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
		      (unsigned long long)sglist[0].dma_address))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	/*  clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
3031
3032 static int intel_nontranslate_map_sg(struct device *hddev,
3033         struct scatterlist *sglist, int nelems, int dir)
3034 {
3035         int i;
3036         struct scatterlist *sg;
3037
3038         for_each_sg(sglist, sg, nelems, i) {
3039                 BUG_ON(!sg_page(sg));
3040                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3041                 sg->dma_length = sg->length;
3042         }
3043         return nelems;
3044 }
3045
/*
 * dma_map_ops .map_sg hook: map an entire scatterlist into one
 * contiguous IOVA range.  Returns the number of entries mapped, or 0 on
 * failure (with sglist->dma_length zeroed when IOVA allocation fails).
 */
static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, struct dma_attrs *attrs)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;		/* total page count across all segments */
	int prot = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (iommu_no_mapping(hwdev))
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
				pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		/* Mapping failed: unwind PTEs, page tables and the IOVA */
		/*  clear the page */
		dma_pte_clear_range(domain, start_vpfn,
				    start_vpfn + size - 1);
		/* free page tables */
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1);
		/* free iova */
		__free_iova(&domain->iovad, iova);
		return 0;
	}

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
	else
		iommu_flush_write_buffer(iommu);

	return nelems;
}
3113
3114 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3115 {
3116         return !dma_addr;
3117 }
3118
/* DMA API operation table implemented by this driver. */
struct dma_map_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.mapping_error = intel_mapping_error,
};
3128
3129 static inline int iommu_domain_cache_init(void)
3130 {
3131         int ret = 0;
3132
3133         iommu_domain_cache = kmem_cache_create("iommu_domain",
3134                                          sizeof(struct dmar_domain),
3135                                          0,
3136                                          SLAB_HWCACHE_ALIGN,
3137
3138                                          NULL);
3139         if (!iommu_domain_cache) {
3140                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3141                 ret = -ENOMEM;
3142         }
3143
3144         return ret;
3145 }
3146
3147 static inline int iommu_devinfo_cache_init(void)
3148 {
3149         int ret = 0;
3150
3151         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3152                                          sizeof(struct device_domain_info),
3153                                          0,
3154                                          SLAB_HWCACHE_ALIGN,
3155                                          NULL);
3156         if (!iommu_devinfo_cache) {
3157                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3158                 ret = -ENOMEM;
3159         }
3160
3161         return ret;
3162 }
3163
3164 static inline int iommu_iova_cache_init(void)
3165 {
3166         int ret = 0;
3167
3168         iommu_iova_cache = kmem_cache_create("iommu_iova",
3169                                          sizeof(struct iova),
3170                                          0,
3171                                          SLAB_HWCACHE_ALIGN,
3172                                          NULL);
3173         if (!iommu_iova_cache) {
3174                 printk(KERN_ERR "Couldn't create iova cache\n");
3175                 ret = -ENOMEM;
3176         }
3177
3178         return ret;
3179 }
3180
3181 static int __init iommu_init_mempool(void)
3182 {
3183         int ret;
3184         ret = iommu_iova_cache_init();
3185         if (ret)
3186                 return ret;
3187
3188         ret = iommu_domain_cache_init();
3189         if (ret)
3190                 goto domain_error;
3191
3192         ret = iommu_devinfo_cache_init();
3193         if (!ret)
3194                 return ret;
3195
3196         kmem_cache_destroy(iommu_domain_cache);
3197 domain_error:
3198         kmem_cache_destroy(iommu_iova_cache);
3199
3200         return -ENOMEM;
3201 }
3202
/* Destroy the slab caches created by iommu_init_mempool(). */
static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);

}
3210
/*
 * Quirk for Intel IOAT (SNB): verify that the BIOS assigned this device
 * to its own local VT-d unit; if not, mark the device as untranslated.
 */
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	/* VT-d base address register lives at config offset 0xb0 of dev 0:0 */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return;
	}
	vtbar &= 0xffff0000;

	/* we know that the this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
			    TAINT_FIRMWARE_WORKAROUND,
			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3238
/*
 * Mark DRHD units that can be ignored: units with no PCI devices at all,
 * and (when dmar_map_gfx is clear) units covering only graphics devices,
 * whose devices are then tagged as untranslated.
 */
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	/* Pass 1: ignore units whose device list is entirely empty */
	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	/* Pass 2: handle units containing only graphics devices */
	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
			    !IS_GFX_DEVICE(drhd->devices[i]))
				break;

		/* A non-gfx device was found; this unit stays active */
		if (i < drhd->devices_cnt)
			continue;

		/* This IOMMU has *only* gfx devices. Either bypass it or
		   set the gfx_mapped flag, as appropriate */
		if (dmar_map_gfx) {
			intel_iommu_gfx_mapped = 1;
		} else {
			drhd->ignored = 1;
			for (i = 0; i < drhd->devices_cnt; i++) {
				if (!drhd->devices[i])
					continue;
				drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
			}
		}
	}
}
3282
3283 #ifdef CONFIG_SUSPEND
3284 static int init_iommu_hw(void)
3285 {
3286         struct dmar_drhd_unit *drhd;
3287         struct intel_iommu *iommu = NULL;
3288
3289         for_each_active_iommu(iommu, drhd)
3290                 if (iommu->qi)
3291                         dmar_reenable_qi(iommu);
3292
3293         for_each_iommu(iommu, drhd) {
3294                 if (drhd->ignored) {
3295                         /*
3296                          * we always have to disable PMRs or DMA may fail on
3297                          * this device
3298                          */
3299                         if (force_on)
3300                                 iommu_disable_protect_mem_regions(iommu);
3301                         continue;
3302  &