Merge branch 'core-iommu-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[~shefty/rdma-dev.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #define ROOT_SIZE               VTD_PAGE_SIZE
49 #define CONTEXT_SIZE            VTD_PAGE_SIZE
50
51 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
54
55 #define IOAPIC_RANGE_START      (0xfee00000)
56 #define IOAPIC_RANGE_END        (0xfeefffff)
57 #define IOVA_START_ADDR         (0x1000)
58
59 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
60
61 #define MAX_AGAW_WIDTH 64
62
63 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
65
66 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
69                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
71
72 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
73 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
74 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
75
76 /* page table handling */
77 #define LEVEL_STRIDE            (9)
78 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
79
80 /*
81  * This bitmap is used to advertise the page sizes our hardware support
82  * to the IOMMU core, which will then use this information to split
83  * physically contiguous memory regions it is mapping into page sizes
84  * that we support.
85  *
86  * Traditionally the IOMMU core just handed us the mappings directly,
87  * after making sure the size is an order of a 4KiB page and that the
88  * mapping has natural alignment.
89  *
90  * To retain this behavior, we currently advertise that we support
91  * all page sizes that are an order of 4KiB.
92  *
93  * If at some point we'd like to utilize the IOMMU core's new behavior,
94  * we could change this to advertise the real page sizes we support.
95  */
96 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
97
98 static inline int agaw_to_level(int agaw)
99 {
100         return agaw + 2;
101 }
102
103 static inline int agaw_to_width(int agaw)
104 {
105         return 30 + agaw * LEVEL_STRIDE;
106 }
107
108 static inline int width_to_agaw(int width)
109 {
110         return (width - 30) / LEVEL_STRIDE;
111 }
112
113 static inline unsigned int level_to_offset_bits(int level)
114 {
115         return (level - 1) * LEVEL_STRIDE;
116 }
117
118 static inline int pfn_level_offset(unsigned long pfn, int level)
119 {
120         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
121 }
122
123 static inline unsigned long level_mask(int level)
124 {
125         return -1UL << level_to_offset_bits(level);
126 }
127
128 static inline unsigned long level_size(int level)
129 {
130         return 1UL << level_to_offset_bits(level);
131 }
132
133 static inline unsigned long align_to_level(unsigned long pfn, int level)
134 {
135         return (pfn + level_size(level) - 1) & level_mask(level);
136 }
137
138 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
139 {
140         return  1 << ((lvl - 1) * LEVEL_STRIDE);
141 }
142
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	/* One MM page covers 2^(PAGE_SHIFT - VTD_PAGE_SHIFT) VT-d pages;
	 * drop the low bits to get the enclosing MM pfn. */
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

/* Inverse of dma_to_mm_pfn(): first VT-d pfn inside MM page @mm_pfn. */
static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
/* VT-d pfn of the first 4KiB unit of @pg. */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
/* VT-d pfn of the page containing kernel virtual address @p. */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
162
163 /* global iommu list, set NULL for ignored DMAR units */
164 static struct intel_iommu **g_iommus;
165
166 static void __init check_tylersburg_isoch(void);
167 static int rwbf_quirk;
168
169 /*
170  * set to 1 to panic kernel if can't successfully enable VT-d
171  * (used when kernel is launched w/ TXT)
172  */
173 static int force_on = 0;
174
175 /*
176  * 0: Present
177  * 1-11: Reserved
178  * 12-63: Context Ptr (12 - (haw-1))
179  * 64-127: Reserved
180  */
181 struct root_entry {
182         u64     val;
183         u64     rsvd1;
184 };
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
186 static inline bool root_present(struct root_entry *root)
187 {
188         return (root->val & 1);
189 }
190 static inline void set_root_present(struct root_entry *root)
191 {
192         root->val |= 1;
193 }
194 static inline void set_root_value(struct root_entry *root, unsigned long value)
195 {
196         root->val |= value & VTD_PAGE_MASK;
197 }
198
199 static inline struct context_entry *
200 get_context_addr_from_root(struct root_entry *root)
201 {
202         return (struct context_entry *)
203                 (root_present(root)?phys_to_virt(
204                 root->val & VTD_PAGE_MASK) :
205                 NULL);
206 }
207
208 /*
209  * low 64 bits:
210  * 0: present
211  * 1: fault processing disable
212  * 2-3: translation type
213  * 12-63: address space root
214  * high 64 bits:
215  * 0-2: address width
216  * 3-6: aval
217  * 8-23: domain id
218  */
219 struct context_entry {
220         u64 lo;
221         u64 hi;
222 };
223
224 static inline bool context_present(struct context_entry *context)
225 {
226         return (context->lo & 1);
227 }
228 static inline void context_set_present(struct context_entry *context)
229 {
230         context->lo |= 1;
231 }
232
233 static inline void context_set_fault_enable(struct context_entry *context)
234 {
235         context->lo &= (((u64)-1) << 2) | 1;
236 }
237
238 static inline void context_set_translation_type(struct context_entry *context,
239                                                 unsigned long value)
240 {
241         context->lo &= (((u64)-1) << 4) | 3;
242         context->lo |= (value & 3) << 2;
243 }
244
245 static inline void context_set_address_root(struct context_entry *context,
246                                             unsigned long value)
247 {
248         context->lo |= value & VTD_PAGE_MASK;
249 }
250
251 static inline void context_set_address_width(struct context_entry *context,
252                                              unsigned long value)
253 {
254         context->hi |= value & 7;
255 }
256
257 static inline void context_set_domain_id(struct context_entry *context,
258                                          unsigned long value)
259 {
260         context->hi |= (value & ((1 << 16) - 1)) << 8;
261 }
262
263 static inline void context_clear_entry(struct context_entry *context)
264 {
265         context->lo = 0;
266         context->hi = 0;
267 }
268
269 /*
270  * 0: readable
271  * 1: writable
272  * 2-6: reserved
273  * 7: super page
274  * 8-10: available
275  * 11: snoop behavior
276  * 12-63: Host physcial address
277  */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline void dma_set_pte_readable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_READ;
}

static inline void dma_set_pte_writable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_WRITE;
}

static inline void dma_set_pte_snp(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_SNP;
}

/* Replace the R/W permission bits with the low two bits of @prot. */
static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
{
	pte->val = (pte->val & ~3) | (prot & 3);
}

/* Host physical address held in the pte (bits 12-63). */
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

/* Merge the page-frame number into the address bits of the pte. */
static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
{
	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
}

/* A pte maps something when either the read or the write bit is set. */
static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

/* Bit 7 marks a superpage (large-page) mapping. */
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
	return (pte->val & (1 << 7));
}

/* True when @pte is the first entry of its page-table page, i.e. its
 * address is VTD_PAGE_SIZE-aligned. */
static inline int first_pte_in_page(struct dma_pte *pte)
{
	return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
336
337 /*
338  * This domain is a statically identity mapping domain.
339  *      1. This domain creats a static 1:1 mapping to all usable memory.
340  *      2. It maps to each iommu if successful.
341  *      3. Each iommu mapps to this domain if successful.
342  */
343 static struct dmar_domain *si_domain;
344 static int hw_pass_through = 1;
345
346 /* devices under the same p2p bridge are owned in one domain */
347 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
348
349 /* domain represents a virtual machine, more than one devices
350  * across iommus may be owned in one domain, e.g. kvm guest.
351  */
352 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
353
354 /* si_domain contains mulitple devices */
355 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
356
357 /* define the limit of IOMMUs supported in each domain */
358 #ifdef  CONFIG_X86
359 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
360 #else
361 # define        IOMMU_UNITS_SUPPORTED   64
362 #endif
363
/* A protection domain: one page table shared by all devices attached
 * to it, possibly spanning several IOMMUs. */
struct dmar_domain {
	int	id;                     /* domain id */
	int	nid;                    /* node id */
	DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
					/* bitmap of iommus this domain uses*/

	struct list_head devices;       /* all devices' list */
	struct iova_domain iovad;       /* iova's that belong to this domain */

	struct dma_pte	*pgd;           /* virtual address */
	int		gaw;            /* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;          /* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature*/
	int		iommu_count;    /* reference count of iommu */
	int		iommu_superpage;/* Level of superpages supported:
					   0 == 4KiB (no superpages), 1 == 2MiB,
					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
	spinlock_t	iommu_lock;     /* protect iommu set in domain */
	u64		max_addr;       /* maximum mapped address */
};

/* PCI domain-device relationship */
struct device_domain_info {
	struct list_head link;  /* link to domain siblings */
	struct list_head global; /* link to global list */
	int segment;            /* PCI domain */
	u8 bus;                 /* PCI bus number */
	u8 devfn;               /* PCI devfn number */
	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};
402
static void flush_unmaps_timeout(unsigned long data);

/* Timer driving the deferred (batched) IOTLB flush path. */
DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);

#define HIGH_WATER_MARK 250
/* Per-IOMMU table of IOVAs whose unmap/IOTLB flush has been deferred. */
struct deferred_flush_tables {
	int next;		/* next free slot in the arrays below */
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};

static struct deferred_flush_tables *deferred_flush;

/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;

static DEFINE_SPINLOCK(async_umap_flush_lock);
static LIST_HEAD(unmaps_to_do);

/* NOTE(review): presumably timer_on flags a pending unmap_timer and
 * list_size counts queued deferred entries — confirm against the
 * flush_unmaps implementation further down the file. */
static int timer_on;
static long list_size;

static void domain_remove_dev_info(struct dmar_domain *domain);

#ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
int dmar_disabled = 0;
#else
int dmar_disabled = 1;
#endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/

int intel_iommu_enabled = 0;
EXPORT_SYMBOL_GPL(intel_iommu_enabled);

static int dmar_map_gfx = 1;		/* cleared by "igfx_off" boot option */
static int dmar_forcedac;		/* "forcedac": force DAC for PCI devices */
static int intel_iommu_strict;		/* "strict": disable batched IOTLB flush */
static int intel_iommu_superpage = 1;	/* cleared by "sp_off" boot option */

int intel_iommu_gfx_mapped;
EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static struct iommu_ops intel_iommu_ops;
449
450 static int __init intel_iommu_setup(char *str)
451 {
452         if (!str)
453                 return -EINVAL;
454         while (*str) {
455                 if (!strncmp(str, "on", 2)) {
456                         dmar_disabled = 0;
457                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
458                 } else if (!strncmp(str, "off", 3)) {
459                         dmar_disabled = 1;
460                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
461                 } else if (!strncmp(str, "igfx_off", 8)) {
462                         dmar_map_gfx = 0;
463                         printk(KERN_INFO
464                                 "Intel-IOMMU: disable GFX device mapping\n");
465                 } else if (!strncmp(str, "forcedac", 8)) {
466                         printk(KERN_INFO
467                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
468                         dmar_forcedac = 1;
469                 } else if (!strncmp(str, "strict", 6)) {
470                         printk(KERN_INFO
471                                 "Intel-IOMMU: disable batched IOTLB flush\n");
472                         intel_iommu_strict = 1;
473                 } else if (!strncmp(str, "sp_off", 6)) {
474                         printk(KERN_INFO
475                                 "Intel-IOMMU: disable supported super page\n");
476                         intel_iommu_superpage = 0;
477                 }
478
479                 str += strcspn(str, ",");
480                 while (*str == ',')
481                         str++;
482         }
483         return 0;
484 }
485 __setup("intel_iommu=", intel_iommu_setup);
486
487 static struct kmem_cache *iommu_domain_cache;
488 static struct kmem_cache *iommu_devinfo_cache;
489 static struct kmem_cache *iommu_iova_cache;
490
491 static inline void *alloc_pgtable_page(int node)
492 {
493         struct page *page;
494         void *vaddr = NULL;
495
496         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
497         if (page)
498                 vaddr = page_address(page);
499         return vaddr;
500 }
501
502 static inline void free_pgtable_page(void *vaddr)
503 {
504         free_page((unsigned long)vaddr);
505 }
506
507 static inline void *alloc_domain_mem(void)
508 {
509         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
510 }
511
512 static void free_domain_mem(void *vaddr)
513 {
514         kmem_cache_free(iommu_domain_cache, vaddr);
515 }
516
517 static inline void * alloc_devinfo_mem(void)
518 {
519         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
520 }
521
522 static inline void free_devinfo_mem(void *vaddr)
523 {
524         kmem_cache_free(iommu_devinfo_cache, vaddr);
525 }
526
527 struct iova *alloc_iova_mem(void)
528 {
529         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
530 }
531
532 void free_iova_mem(struct iova *iova)
533 {
534         kmem_cache_free(iommu_iova_cache, iova);
535 }
536
537
538 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
539 {
540         unsigned long sagaw;
541         int agaw = -1;
542
543         sagaw = cap_sagaw(iommu->cap);
544         for (agaw = width_to_agaw(max_gaw);
545              agaw >= 0; agaw--) {
546                 if (test_bit(agaw, &sagaw))
547                         break;
548         }
549
550         return agaw;
551 }
552
/*
 * Calculate max SAGAW for each iommu: the widest agaw the hardware
 * advertises, capped only by MAX_AGAW_WIDTH (64 bits).
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, use a default agaw, and
 * get a supported less agaw for iommus that don't support the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
570
571 /* This functionin only returns single iommu in a domain */
572 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
573 {
574         int iommu_id;
575
576         /* si_domain and vm domain should not get here. */
577         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
578         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
579
580         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
581         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
582                 return NULL;
583
584         return g_iommus[iommu_id];
585 }
586
587 static void domain_update_iommu_coherency(struct dmar_domain *domain)
588 {
589         int i;
590
591         domain->iommu_coherency = 1;
592
593         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
594                 if (!ecap_coherent(g_iommus[i]->ecap)) {
595                         domain->iommu_coherency = 0;
596                         break;
597                 }
598         }
599 }
600
601 static void domain_update_iommu_snooping(struct dmar_domain *domain)
602 {
603         int i;
604
605         domain->iommu_snooping = 1;
606
607         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
608                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
609                         domain->iommu_snooping = 0;
610                         break;
611                 }
612         }
613 }
614
615 static void domain_update_iommu_superpage(struct dmar_domain *domain)
616 {
617         struct dmar_drhd_unit *drhd;
618         struct intel_iommu *iommu = NULL;
619         int mask = 0xf;
620
621         if (!intel_iommu_superpage) {
622                 domain->iommu_superpage = 0;
623                 return;
624         }
625
626         /* set iommu_superpage to the smallest common denominator */
627         for_each_active_iommu(iommu, drhd) {
628                 mask &= cap_super_page_val(iommu->cap);
629                 if (!mask) {
630                         break;
631                 }
632         }
633         domain->iommu_superpage = fls(mask);
634 }
635
/* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	/* Refresh the cached per-domain capability flags. */
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
	domain_update_iommu_superpage(domain);
}
643
644 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
645 {
646         struct dmar_drhd_unit *drhd = NULL;
647         int i;
648
649         for_each_drhd_unit(drhd) {
650                 if (drhd->ignored)
651                         continue;
652                 if (segment != drhd->segment)
653                         continue;
654
655                 for (i = 0; i < drhd->devices_cnt; i++) {
656                         if (drhd->devices[i] &&
657                             drhd->devices[i]->bus->number == bus &&
658                             drhd->devices[i]->devfn == devfn)
659                                 return drhd->iommu;
660                         if (drhd->devices[i] &&
661                             drhd->devices[i]->subordinate &&
662                             drhd->devices[i]->subordinate->number <= bus &&
663                             drhd->devices[i]->subordinate->subordinate >= bus)
664                                 return drhd->iommu;
665                 }
666
667                 if (drhd->include_all)
668                         return drhd->iommu;
669         }
670
671         return NULL;
672 }
673
674 static void domain_flush_cache(struct dmar_domain *domain,
675                                void *addr, int size)
676 {
677         if (!domain->iommu_coherency)
678                 clflush_cache_range(addr, size);
679 }
680
/* Gets context entry for a given bus and devfn, allocating the bus's
 * context table on first use. Returns NULL only if that allocation
 * fails. Called and returned entry used under iommu->lock by callers
 * that modify it. */
static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)
				alloc_pgtable_page(iommu->node);
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		/* Flush the new (zeroed) table before publishing it:
		 * non-coherent hardware reads it from memory. */
		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}
709
710 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
711 {
712         struct root_entry *root;
713         struct context_entry *context;
714         int ret;
715         unsigned long flags;
716
717         spin_lock_irqsave(&iommu->lock, flags);
718         root = &iommu->root_entry[bus];
719         context = get_context_addr_from_root(root);
720         if (!context) {
721                 ret = 0;
722                 goto out;
723         }
724         ret = context_present(&context[devfn]);
725 out:
726         spin_unlock_irqrestore(&iommu->lock, flags);
727         return ret;
728 }
729
730 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
731 {
732         struct root_entry *root;
733         struct context_entry *context;
734         unsigned long flags;
735
736         spin_lock_irqsave(&iommu->lock, flags);
737         root = &iommu->root_entry[bus];
738         context = get_context_addr_from_root(root);
739         if (context) {
740                 context_clear_entry(&context[devfn]);
741                 __iommu_flush_cache(iommu, &context[devfn], \
742                         sizeof(*context));
743         }
744         spin_unlock_irqrestore(&iommu->lock, flags);
745 }
746
747 static void free_context_table(struct intel_iommu *iommu)
748 {
749         struct root_entry *root;
750         int i;
751         unsigned long flags;
752         struct context_entry *context;
753
754         spin_lock_irqsave(&iommu->lock, flags);
755         if (!iommu->root_entry) {
756                 goto out;
757         }
758         for (i = 0; i < ROOT_ENTRY_NR; i++) {
759                 root = &iommu->root_entry[i];
760                 context = get_context_addr_from_root(root);
761                 if (context)
762                         free_pgtable_page(context);
763         }
764         free_pgtable_page(iommu->root_entry);
765         iommu->root_entry = NULL;
766 out:
767         spin_unlock_irqrestore(&iommu->lock, flags);
768 }
769
/* Walk (and, when needed, build) the page table down to @target_level
 * and return the pte covering @pfn.
 *
 * target_level == 0 means "stop at the first superpage or non-present
 * entry", i.e. locate whatever level currently maps the pfn.
 * Returns NULL only when a page-table page allocation fails. */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int target_level)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);
	/* @pfn must fit within the domain's address width. */
	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
	parent = domain->pgd;

	while (level > 0) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == target_level)
			break;

		/* Hole: allocate the missing intermediate table. */
		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			/* cmpxchg64 arbitrates against a concurrent walker
			 * installing the same intermediate table. */
			if (cmpxchg64(&pte->val, 0ULL, pteval)) {
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			} else {
				/* Read back (atomically on 32-bit) before
				 * flushing the updated pte. */
				dma_pte_addr(pte);
				domain_flush_cache(domain, pte, sizeof(*pte));
			}
		}
		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	return pte;
}
816
817
/* return address's pte at specific level.
 *
 * Walks from the top level (@total) down towards the requested @level.
 * On success returns the pte at @level (or at a superpage above it —
 * the caller sees the actual level via *@large_page). If a non-present
 * entry is hit first, *@large_page reports the level of the hole and
 * NULL is returned. */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		/* Hole above the requested level. */
		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		/* Superpage: the mapping terminates here, above @level. */
		if (pte->val & DMA_PTE_LARGE_PAGE) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
849
/* clear last level pte, a tlb flush should be followed.
 *
 * Clears every leaf pte mapping [start_pfn, last_pfn] and returns the
 * page order of the last mapping level touched (0 for 4KiB, 9 for
 * 2MiB superpages, ...). */
static int dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned int large_page = 1;
	struct dma_pte *first_pte, *pte;
	int order;

	/* Both ends must lie within the domain's address width. */
	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/* Nothing mapped here; skip past the hole whose
			 * level dma_pfn_level_pte reported. */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		/* Clear contiguous ptes within this page-table page. */
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);

	order = (large_page - 1) * 9;
	return order;
}
886
/* free page table pages. last level pte should already be cleared.
 *
 * Frees every intermediate page-table page fully covered by
 * [start_pfn, last_pfn], working upwards from level 2, and finally
 * the pgd itself when the whole domain range is being torn down. */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *first_pte, *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
	unsigned long tmp;
	int large_page = 2;

	/* Range must fit in the domain's address width. */
	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	level = 2;
	while (level <= total) {
		tmp = align_to_level(start_pfn, level);

		/* If we can't even clear one PTE at this level, we're done */
		if (tmp + level_size(level) - 1 > last_pfn)
			return;

		do {
			large_page = level;
			first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
			/* A superpage above us: continue at the level
			 * above the superpage instead. */
			if (large_page > level)
				level = large_page + 1;
			if (!pte) {
				tmp = align_to_level(tmp + 1, level + 1);
				continue;
			}
			/* Free each child table fully inside the range. */
			do {
				if (dma_pte_present(pte)) {
					free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
					dma_clear_pte(pte);
				}
				pte++;
				tmp += level_size(level);
			} while (!first_pte_in_page(pte) &&
				 tmp + level_size(level) - 1 <= last_pfn);

			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);

		} while (tmp && tmp + level_size(level) - 1 <= last_pfn);
		level++;
	}
	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
943
944 /* iommu handling */
945 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
946 {
947         struct root_entry *root;
948         unsigned long flags;
949
950         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
951         if (!root)
952                 return -ENOMEM;
953
954         __iommu_flush_cache(iommu, root, ROOT_SIZE);
955
956         spin_lock_irqsave(&iommu->lock, flags);
957         iommu->root_entry = root;
958         spin_unlock_irqrestore(&iommu->lock, flags);
959
960         return 0;
961 }
962
/* Program the root table address into hardware and latch it via SRTP. */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 sts;
	unsigned long flag;

	addr = iommu->root_entry;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	/* Set Root Table Pointer: makes hardware re-fetch DMAR_RTADDR_REG */
	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
982
/* Flush the iommu's internal write buffer, if required by hw or quirk. */
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	/* Skip unless a quirk forces it or the hardware advertises RWBF. */
	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1000
/*
 * Invalidate the context cache.  @type selects global, domain-selective
 * or device-selective invalidation; @did, @source_id and @function_mask
 * qualify the selective forms.  (Despite the old comment, this returns
 * nothing; it busy-waits for the invalidation to complete.)
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	/* ICC kicks off the invalidation; hardware clears it when done. */
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1034
/*
 * Invalidate the IOTLB using the register-based interface.  @type selects
 * global, domain-selective (DSI) or page-selective (PSI) invalidation;
 * for PSI, @addr/@size_order give the naturally-aligned region to flush.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	/* IOTLB registers live at an ecap-defined offset, not a fixed one. */
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
	/* Hardware may flush at a coarser granularity than requested. */
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1091
1092 static struct device_domain_info *iommu_support_dev_iotlb(
1093         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1094 {
1095         int found = 0;
1096         unsigned long flags;
1097         struct device_domain_info *info;
1098         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1099
1100         if (!ecap_dev_iotlb_support(iommu->ecap))
1101                 return NULL;
1102
1103         if (!iommu->qi)
1104                 return NULL;
1105
1106         spin_lock_irqsave(&device_domain_lock, flags);
1107         list_for_each_entry(info, &domain->devices, link)
1108                 if (info->bus == bus && info->devfn == devfn) {
1109                         found = 1;
1110                         break;
1111                 }
1112         spin_unlock_irqrestore(&device_domain_lock, flags);
1113
1114         if (!found || !info->dev)
1115                 return NULL;
1116
1117         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1118                 return NULL;
1119
1120         if (!dmar_find_matched_atsr_unit(info->dev))
1121                 return NULL;
1122
1123         info->iommu = iommu;
1124
1125         return info;
1126 }
1127
1128 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1129 {
1130         if (!info)
1131                 return;
1132
1133         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1134 }
1135
1136 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1137 {
1138         if (!info->dev || !pci_ats_enabled(info->dev))
1139                 return;
1140
1141         pci_disable_ats(info->dev);
1142 }
1143
/* Issue a device-IOTLB invalidation to every ATS-enabled device in @domain. */
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		/* Only devices with ATS enabled have a device IOTLB. */
		if (!info->dev || !pci_ats_enabled(info->dev))
			continue;

		/* Source-id is bus:devfn; queue depth bounds in-flight
		 * invalidation requests for this device. */
		sid = info->bus << 8 | info->devfn;
		qdep = pci_ats_queue_depth(info->dev);
		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1162
/*
 * Flush the IOTLB for @pages pages starting at @pfn in domain @did,
 * preferring page-selective invalidation and falling back to a
 * domain-selective flush when PSI cannot cover the range.  @map is
 * non-zero when the flush is for a non-present -> present transition.
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				  unsigned long pfn, unsigned int pages, int map)
{
	/* PSI takes a power-of-two page count, expressed as a mask order. */
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;

	BUG_ON(pages == 0);

	/*
	 * Fallback to domain selective flush if no PSI support or the size is
	 * too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr, mask,
						DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
}
1191
/* Clear the Enable Protected Memory bit so DMA to those regions works. */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1208
/* Turn on DMA remapping for this iommu.  Always returns 0. */
static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	/* gcmd caches the enabled bits so later writes preserve them. */
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}
1225
/* Turn off DMA remapping for this iommu.  Always returns 0. */
static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}
1242
1243
1244 static int iommu_init_domains(struct intel_iommu *iommu)
1245 {
1246         unsigned long ndomains;
1247         unsigned long nlongs;
1248
1249         ndomains = cap_ndoms(iommu->cap);
1250         pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1251                         ndomains);
1252         nlongs = BITS_TO_LONGS(ndomains);
1253
1254         spin_lock_init(&iommu->lock);
1255
1256         /* TBD: there might be 64K domains,
1257          * consider other allocation for future chip
1258          */
1259         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1260         if (!iommu->domain_ids) {
1261                 printk(KERN_ERR "Allocating domain id array failed\n");
1262                 return -ENOMEM;
1263         }
1264         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1265                         GFP_KERNEL);
1266         if (!iommu->domains) {
1267                 printk(KERN_ERR "Allocating domain array failed\n");
1268                 return -ENOMEM;
1269         }
1270
1271         /*
1272          * if Caching mode is set, then invalid translations are tagged
1273          * with domainid 0. Hence we need to pre-allocate it.
1274          */
1275         if (cap_caching_mode(iommu->cap))
1276                 set_bit(0, iommu->domain_ids);
1277         return 0;
1278 }
1279
1280
1281 static void domain_exit(struct dmar_domain *domain);
1282 static void vm_domain_exit(struct dmar_domain *domain);
1283
/* Tear down all state attached to @iommu: domains, irq, id arrays and
 * context table.  Also frees the global g_iommus array once the last
 * iommu is gone. */
void free_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;
	unsigned long flags;

	if ((iommu->domains) && (iommu->domain_ids)) {
		/* Release every domain id still allocated on this iommu. */
		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
			domain = iommu->domains[i];
			clear_bit(i, iommu->domain_ids);

			spin_lock_irqsave(&domain->iommu_lock, flags);
			/* Destroy the domain once no iommu references it. */
			if (--domain->iommu_count == 0) {
				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
					vm_domain_exit(domain);
				else
					domain_exit(domain);
			}
			spin_unlock_irqrestore(&domain->iommu_lock, flags);
		}
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	if (iommu->irq) {
		irq_set_handler_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	}

	kfree(iommu->domains);
	kfree(iommu->domain_ids);

	g_iommus[iommu->seq_id] = NULL;

	/* if all iommus are freed, free g_iommus */
	for (i = 0; i < g_num_of_iommus; i++) {
		if (g_iommus[i])
			break;
	}

	if (i == g_num_of_iommus)
		kfree(g_iommus);

	/* free context mapping */
	free_context_table(iommu);
}
1333
1334 static struct dmar_domain *alloc_domain(void)
1335 {
1336         struct dmar_domain *domain;
1337
1338         domain = alloc_domain_mem();
1339         if (!domain)
1340                 return NULL;
1341
1342         domain->nid = -1;
1343         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1344         domain->flags = 0;
1345
1346         return domain;
1347 }
1348
/* Allocate a free domain id on @iommu and bind @domain to it.
 * Returns 0 on success or -ENOMEM when all ids are in use. */
static int iommu_attach_domain(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num;
	unsigned long ndomains;
	unsigned long flags;

	ndomains = cap_ndoms(iommu->cap);

	spin_lock_irqsave(&iommu->lock, flags);

	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		printk(KERN_ERR "IOMMU: no free domain ids\n");
		return -ENOMEM;
	}

	/* Record the binding in both directions: id bitmap + domain's
	 * iommu bitmap + the iommu's id -> domain table. */
	domain->id = num;
	set_bit(num, iommu->domain_ids);
	set_bit(iommu->seq_id, domain->iommu_bmp);
	iommu->domains[num] = domain;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
1375
/* Undo iommu_attach_domain(): release @domain's id on @iommu, if bound. */
static void iommu_detach_domain(struct dmar_domain *domain,
				struct intel_iommu *iommu)
{
	unsigned long flags;
	int num, ndomains;
	int found = 0;

	spin_lock_irqsave(&iommu->lock, flags);
	ndomains = cap_ndoms(iommu->cap);
	/* Find which id on this iommu maps to @domain. */
	for_each_set_bit(num, iommu->domain_ids, ndomains) {
		if (iommu->domains[num] == domain) {
			found = 1;
			break;
		}
	}

	if (found) {
		clear_bit(num, iommu->domain_ids);
		clear_bit(iommu->seq_id, domain->iommu_bmp);
		iommu->domains[num] = NULL;
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}
1399
/* IOVA ranges (IOAPIC, PCI MMIO) that no domain may hand out for DMA. */
static struct iova_domain reserved_iova_list;
static struct lock_class_key reserved_rbtree_key;
1402
1403 static int dmar_init_reserved_ranges(void)
1404 {
1405         struct pci_dev *pdev = NULL;
1406         struct iova *iova;
1407         int i;
1408
1409         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1410
1411         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1412                 &reserved_rbtree_key);
1413
1414         /* IOAPIC ranges shouldn't be accessed by DMA */
1415         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1416                 IOVA_PFN(IOAPIC_RANGE_END));
1417         if (!iova) {
1418                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1419                 return -ENODEV;
1420         }
1421
1422         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1423         for_each_pci_dev(pdev) {
1424                 struct resource *r;
1425
1426                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1427                         r = &pdev->resource[i];
1428                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1429                                 continue;
1430                         iova = reserve_iova(&reserved_iova_list,
1431                                             IOVA_PFN(r->start),
1432                                             IOVA_PFN(r->end));
1433                         if (!iova) {
1434                                 printk(KERN_ERR "Reserve iova failed\n");
1435                                 return -ENODEV;
1436                         }
1437                 }
1438         }
1439         return 0;
1440 }
1441
/* Copy the globally reserved IOVA ranges into a new domain's allocator. */
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
1446
/*
 * Round a guest address width up to the nearest width the page-table
 * layout supports: 12 bits for the page offset plus a multiple of 9 bits
 * per level, capped at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = (rem == 0) ? gaw : gaw + 9 - rem;

	return (agaw > 64) ? 64 : agaw;
}
1460
/*
 * Initialise a freshly attached domain: IOVA allocator, address width
 * (clamped to hardware and rounded to a supported AGAW), capability
 * flags and the top-level page directory.  Returns 0, -ENODEV when no
 * suitable AGAW exists, or -ENOMEM.
 */
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain_get_iommu(domain);
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);

	/* Cache coherency / snoop-control capabilities of this iommu. */
	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	/* Largest superpage level the hardware can map for this domain. */
	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	domain->iommu_count = 1;
	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}
1511
/* Tear down a domain: device links, IOVAs, page tables and iommu ids. */
static void domain_exit(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/* Domain 0 is reserved, so dont process it */
	if (!domain)
		return;

	/* Flush any lazy unmaps that may reference this domain */
	if (!intel_iommu_strict)
		flush_unmaps_timeout(0);

	domain_remove_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* Release the domain id on every iommu it was attached to. */
	for_each_active_iommu(iommu, drhd)
		if (test_bit(iommu->seq_id, domain->iommu_bmp))
			iommu_detach_domain(domain, iommu);

	free_domain_mem(domain);
}
1541
/*
 * Install a context entry mapping the device at (segment, bus, devfn)
 * to @domain, using @translation (multi-level, pass-through, or -- when
 * the device supports ATS -- device-IOTLB).  Returns 0 on success or a
 * negative errno.  A context entry that is already present is left as-is.
 */
static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
				 u8 bus, u8 devfn, int translation)
{
	struct context_entry *context;
	unsigned long flags;
	struct intel_iommu *iommu;
	struct dma_pte *pgd;
	unsigned long num;
	unsigned long ndomains;
	int id;
	int agaw;
	struct device_domain_info *info = NULL;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);
	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
	       translation != CONTEXT_TT_MULTI_LEVEL);

	iommu = device_to_iommu(segment, bus, devfn);
	if (!iommu)
		return -ENODEV;

	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	if (context_present(context)) {
		/* Already mapped by a previous call; nothing to do. */
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	id = domain->id;
	pgd = domain->pgd;

	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
		int found = 0;

		/* find an available domain id for this device in iommu */
		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(num, iommu->domain_ids, ndomains) {
			if (iommu->domains[num] == domain) {
				id = num;
				found = 1;
				break;
			}
		}

		if (found == 0) {
			/* Not yet known on this iommu: claim a fresh id. */
			num = find_first_zero_bit(iommu->domain_ids, ndomains);
			if (num >= ndomains) {
				spin_unlock_irqrestore(&iommu->lock, flags);
				printk(KERN_ERR "IOMMU: no free domain ids\n");
				return -EFAULT;
			}

			set_bit(num, iommu->domain_ids);
			iommu->domains[num] = domain;
			id = num;
		}

		/* Skip top levels of page tables for
		 * iommu which has less agaw than default.
		 * Unnecessary for PT mode.
		 */
		if (translation != CONTEXT_TT_PASS_THROUGH) {
			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd)) {
					spin_unlock_irqrestore(&iommu->lock, flags);
					return -ENOMEM;
				}
			}
		}
	}

	context_set_domain_id(context, id);

	/* Upgrade to device-IOTLB translation when the device has ATS. */
	if (translation != CONTEXT_TT_PASS_THROUGH) {
		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
		translation = info ? CONTEXT_TT_DEV_IOTLB :
				     CONTEXT_TT_MULTI_LEVEL;
	}
	/*
	 * In pass through mode, AW must be programmed to indicate the largest
	 * AGAW value supported by hardware. And ASR is ignored by hardware.
	 */
	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
		context_set_address_width(context, iommu->msagaw);
	else {
		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, iommu->agaw);
	}

	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);
	spin_unlock_irqrestore(&iommu->lock, flags);

	/* Account this iommu in the domain's bookkeeping. */
	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
		domain->iommu_count++;
		if (domain->iommu_count == 1)
			domain->nid = iommu->node;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);
	return 0;
}
1671
/*
 * Map @pdev -- and, when it sits behind a PCIe-to-PCI bridge, the whole
 * bridge chain up to that bridge -- into @domain's context entries.
 */
static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
			int translation)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
					 pdev->bus->number, pdev->devfn,
					 translation);
	if (ret)
		return ret;

	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return 0;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	/* Map every bridge between the device and the PCIe bridge. */
	while (parent != tmp) {
		ret = domain_context_mapping_one(domain,
						 pci_domain_nr(parent->bus),
						 parent->bus->number,
						 parent->devfn, translation);
		if (ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
		return domain_context_mapping_one(domain,
					pci_domain_nr(tmp->subordinate),
					tmp->subordinate->number, 0,
					translation);
	else /* this is a legacy PCI bridge */
		return domain_context_mapping_one(domain,
						  pci_domain_nr(tmp->bus),
						  tmp->bus->number,
						  tmp->devfn,
						  translation);
}
1712
/*
 * Check whether @pdev (and its whole upstream bridge chain, if any)
 * already has context entries installed.  Returns non-zero when fully
 * mapped, 0 when any link is unmapped, -ENODEV without an iommu.
 */
static int domain_context_mapped(struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
	if (!ret)
		return ret;
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return ret;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	/* Every bridge between device and PCIe bridge must be mapped too. */
	while (parent != tmp) {
		ret = device_context_mapped(iommu, parent->bus->number,
					    parent->devfn);
		if (!ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp))
		return device_context_mapped(iommu, tmp->subordinate->number,
					     0);
	else
		return device_context_mapped(iommu, tmp->bus->number,
					     tmp->devfn);
}
1747
1748 /* Returns a number of VTD pages, but aligned to MM page size */
1749 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1750                                             size_t size)
1751 {
1752         host_addr &= ~PAGE_MASK;
1753         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1754 }
1755
1756 /* Return largest possible superpage level for a given mapping */
1757 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1758                                           unsigned long iov_pfn,
1759                                           unsigned long phy_pfn,
1760                                           unsigned long pages)
1761 {
1762         int support, level = 1;
1763         unsigned long pfnmerge;
1764
1765         support = domain->iommu_superpage;
1766
1767         /* To use a large page, the virtual *and* physical addresses
1768            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1769            of them will mean we have to use smaller pages. So just
1770            merge them and check both at once. */
1771         pfnmerge = iov_pfn | phy_pfn;
1772
1773         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1774                 pages >>= VTD_STRIDE_SHIFT;
1775                 if (!pages)
1776                         break;
1777                 pfnmerge >>= VTD_STRIDE_SHIFT;
1778                 level++;
1779                 support--;
1780         }
1781         return level;
1782 }
1783
/*
 * Install PTEs mapping nr_pages VT-d pages starting at iov_pfn, either
 * from a scatterlist (@sg non-NULL, @phys_pfn ignored) or from the
 * contiguous physical range starting at @phys_pfn (@sg NULL).  Uses
 * superpages when hardware_largepage_caps() says the alignment and
 * remaining size allow it.
 *
 * @prot must include at least one of DMA_PTE_READ/DMA_PTE_WRITE.
 * Returns 0 on success, -EINVAL on bad @prot, -ENOMEM if page-table
 * allocation fails.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned long sg_res;		/* pages left in the current sg entry */
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	/* The requested range must fit in the domain's address width. */
	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	if (sg)
		sg_res = 0;	/* force pteval setup from sg on first pass */
	else {
		/* +1 so sg_res never hits zero and sg is never consulted */
		sg_res = nr_pages + 1;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			/* Advance to the next scatterlist entry: record its
			 * DMA address/length and derive the PTE value. */
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* It is large page*/
			if (largepage_lvl > 1)
				pteval |= DMA_PTE_LARGE_PAGE;
			else
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was already set: a mapping conflict. Report it
			 * (rate-limited dumps) but continue. */
			static int dumps = 5;
			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
			       iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		/* Number of base pages covered by one PTE at this level. */
		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
1883
/* Map a scatterlist into @domain at iov_pfn (phys_pfn unused with sg). */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}
1890
/* Map a contiguous physical range starting at phys_pfn into @domain. */
static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
1897
/*
 * Clear the context entry for (bus, devfn) on @iommu, then globally
 * invalidate the context cache and IOTLB.  A NULL @iommu is a no-op.
 */
static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu)
		return;

	clear_context_table(iommu, bus, devfn);
	iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
1908
/*
 * Detach every device from @domain: unlink each device_domain_info from
 * both lists, clear the device's archdata pointer, tear down its IOTLB
 * state and context entry, and free the tracking structure.
 *
 * device_domain_lock is dropped around the per-device teardown calls
 * (presumably so the flush/detach work does not run under the spinlock
 * -- TODO confirm); the loop therefore restarts from the list head
 * after re-acquiring the lock.
 */
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);

		iommu_disable_dev_iotlb(info);
		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1934
1935 /*
1936  * find_domain
1937  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1938  */
1939 static struct dmar_domain *
1940 find_domain(struct pci_dev *pdev)
1941 {
1942         struct device_domain_info *info;
1943
1944         /* No lock here, assumes no domain exit in normal case */
1945         info = pdev->dev.archdata.iommu;
1946         if (info)
1947                 return info->domain;
1948         return NULL;
1949 }
1950
1951 /* domain is initialized */
1952 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1953 {
1954         struct dmar_domain *domain, *found = NULL;
1955         struct intel_iommu *iommu;
1956         struct dmar_drhd_unit *drhd;
1957         struct device_domain_info *info, *tmp;
1958         struct pci_dev *dev_tmp;
1959         unsigned long flags;
1960         int bus = 0, devfn = 0;
1961         int segment;
1962         int ret;
1963
1964         domain = find_domain(pdev);
1965         if (domain)
1966                 return domain;
1967
1968         segment = pci_domain_nr(pdev->bus);
1969
1970         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1971         if (dev_tmp) {
1972                 if (pci_is_pcie(dev_tmp)) {
1973                         bus = dev_tmp->subordinate->number;
1974                         devfn = 0;
1975                 } else {
1976                         bus = dev_tmp->bus->number;
1977                         devfn = dev_tmp->devfn;
1978                 }
1979                 spin_lock_irqsave(&device_domain_lock, flags);
1980                 list_for_each_entry(info, &device_domain_list, global) {
1981                         if (info->segment == segment &&
1982                             info->bus == bus && info->devfn == devfn) {
1983                                 found = info->domain;
1984                                 break;
1985                         }
1986                 }
1987                 spin_unlock_irqrestore(&device_domain_lock, flags);
1988                 /* pcie-pci bridge already has a domain, uses it */
1989                 if (found) {
1990                         domain = found;
1991                         goto found_domain;
1992                 }
1993         }
1994
1995         domain = alloc_domain();
1996         if (!domain)
1997                 goto error;
1998
1999         /* Allocate new domain for the device */
2000         drhd = dmar_find_matched_drhd_unit(pdev);
2001         if (!drhd) {
2002                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2003                         pci_name(pdev));
2004                 return NULL;
2005         }
2006         iommu = drhd->iommu;
2007
2008         ret = iommu_attach_domain(domain, iommu);
2009         if (ret) {
2010                 free_domain_mem(domain);
2011                 goto error;
2012         }
2013
2014         if (domain_init(domain, gaw)) {
2015                 domain_exit(domain);
2016                 goto error;
2017         }
2018
2019         /* register pcie-to-pci device */
2020         if (dev_tmp) {
2021                 info = alloc_devinfo_mem();
2022                 if (!info) {
2023                         domain_exit(domain);
2024                         goto error;
2025                 }
2026                 info->segment = segment;
2027                 info->bus = bus;
2028                 info->devfn = devfn;
2029                 info->dev = NULL;
2030                 info->domain = domain;
2031                 /* This domain is shared by devices under p2p bridge */
2032                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2033
2034                 /* pcie-to-pci bridge already has a domain, uses it */
2035                 found = NULL;
2036                 spin_lock_irqsave(&device_domain_lock, flags);
2037                 list_for_each_entry(tmp, &device_domain_list, global) {
2038                         if (tmp->segment == segment &&
2039                             tmp->bus == bus && tmp->devfn == devfn) {
2040                                 found = tmp->domain;
2041                                 break;
2042                         }
2043                 }
2044                 if (found) {
2045                         spin_unlock_irqrestore(&device_domain_lock, flags);
2046                         free_devinfo_mem(info);
2047                         domain_exit(domain);
2048                         domain = found;
2049                 } else {
2050                         list_add(&info->link, &domain->devices);
2051                         list_add(&info->global, &device_domain_list);
2052                         spin_unlock_irqrestore(&device_domain_lock, flags);
2053                 }
2054         }
2055
2056 found_domain:
2057         info = alloc_devinfo_mem();
2058         if (!info)
2059                 goto error;
2060         info->segment = segment;
2061         info->bus = pdev->bus->number;
2062         info->devfn = pdev->devfn;
2063         info->dev = pdev;
2064         info->domain = domain;
2065         spin_lock_irqsave(&device_domain_lock, flags);
2066         /* somebody is fast */
2067         found = find_domain(pdev);
2068         if (found != NULL) {
2069                 spin_unlock_irqrestore(&device_domain_lock, flags);
2070                 if (found != domain) {
2071                         domain_exit(domain);
2072                         domain = found;
2073                 }
2074                 free_devinfo_mem(info);
2075                 return domain;
2076         }
2077         list_add(&info->link, &domain->devices);
2078         list_add(&info->global, &device_domain_list);
2079         pdev->dev.archdata.iommu = info;
2080         spin_unlock_irqrestore(&device_domain_lock, flags);
2081         return domain;
2082 error:
2083         /* recheck it here, maybe others set it */
2084         return find_domain(pdev);
2085 }
2086
/* Identity-mapping policy: 0 = disabled, otherwise a mask of the
 * IDENTMAP_* bits below selecting which devices get a 1:1 mapping. */
static int iommu_identity_mapping;
#define IDENTMAP_ALL		1
#define IDENTMAP_GFX		2
#define IDENTMAP_AZALIA		4
2091
/*
 * Reserve the IOVA range [start, end] in @domain and install a 1:1
 * (DMA address == physical address) read/write mapping for it.
 * Existing PTEs in the range are cleared first, since an RMRR may
 * overlap a range that was already mapped.
 *
 * Returns 0 on success or a negative errno.
 */
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
		 start, end, domain->id);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
				  last_vpfn - first_vpfn + 1,
				  DMA_PTE_READ|DMA_PTE_WRITE);
}
2117
2118 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2119                                       unsigned long long start,
2120                                       unsigned long long end)
2121 {
2122         struct dmar_domain *domain;
2123         int ret;
2124
2125         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2126         if (!domain)
2127                 return -ENOMEM;
2128
2129         /* For _hardware_ passthrough, don't bother. But for software
2130            passthrough, we do it anyway -- it may indicate a memory
2131            range which is reserved in E820, so which didn't get set
2132            up to start with in si_domain */
2133         if (domain == si_domain && hw_pass_through) {
2134                 printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2135                        pci_name(pdev), start, end);
2136                 return 0;
2137         }
2138
2139         printk(KERN_INFO
2140                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2141                pci_name(pdev), start, end);
2142         
2143         if (end < start) {
2144                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2145                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2146                         dmi_get_system_info(DMI_BIOS_VENDOR),
2147                         dmi_get_system_info(DMI_BIOS_VERSION),
2148                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2149                 ret = -EIO;
2150                 goto error;
2151         }
2152
2153         if (end >> agaw_to_width(domain->agaw)) {
2154                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2155                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2156                      agaw_to_width(domain->agaw),
2157                      dmi_get_system_info(DMI_BIOS_VENDOR),
2158                      dmi_get_system_info(DMI_BIOS_VERSION),
2159                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2160                 ret = -EIO;
2161                 goto error;
2162         }
2163
2164         ret = iommu_domain_identity_map(domain, start, end);
2165         if (ret)
2166                 goto error;
2167
2168         /* context entry init */
2169         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2170         if (ret)
2171                 goto error;
2172
2173         return 0;
2174
2175  error:
2176         domain_exit(domain);
2177         return ret;
2178 }
2179
/*
 * Identity-map @rmrr's region for @pdev.  Devices whose archdata is set
 * to DUMMY_DEVICE_DOMAIN_INFO are skipped (presumably deliberately
 * excluded from translation -- see where that marker is assigned).
 */
static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
	struct pci_dev *pdev)
{
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return 0;
	return iommu_prepare_identity_map(pdev, rmrr->base_address,
		rmrr->end_address);
}
2188
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Identity-map the low 16MiB for the first ISA bridge found, so legacy
 * devices behind it (e.g. the floppy controller) that DMA below 16MiB
 * keep working with translation enabled.  Best-effort: failure only
 * logs an error.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");

}
#else
/* No-op stub when the floppy workaround is not configured. */
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2213
2214 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2215
/*
 * Build the static identity (si) domain used for pass-through devices:
 * attach it to every active IOMMU, initialize it, and -- unless @hw
 * (hardware pass-through) is set -- populate it with 1:1 mappings for
 * every usable memory range on every online node.
 *
 * Returns 0 on success, -EFAULT on allocation/attach/init failure, or
 * the errno from iommu_domain_identity_map().
 */
static int __init si_domain_init(int hw)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int nid, ret = 0;

	si_domain = alloc_domain();
	if (!si_domain)
		return -EFAULT;

	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);

	for_each_active_iommu(iommu, drhd) {
		ret = iommu_attach_domain(si_domain, iommu);
		if (ret) {
			domain_exit(si_domain);
			return -EFAULT;
		}
	}

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;

	/* Hardware pass-through bypasses translation entirely, so no
	 * page tables need to be populated. */
	if (hw)
		return 0;

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
			/* NOTE(review): unlike the earlier failure paths,
			 * this one returns without domain_exit(si_domain),
			 * leaving a partially built si domain -- confirm
			 * the caller tolerates that. */
			if (ret)
				return ret;
		}
	}

	return 0;
}
2260
2261 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2262                                           struct pci_dev *pdev);
2263 static int identity_mapping(struct pci_dev *pdev)
2264 {
2265         struct device_domain_info *info;
2266
2267         if (likely(!iommu_identity_mapping))
2268                 return 0;
2269
2270         info = pdev->dev.archdata.iommu;
2271         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2272                 return (info->domain == si_domain);
2273
2274         return 0;
2275 }
2276
2277 static int domain_add_dev_info(struct dmar_domain *domain,
2278                                struct pci_dev *pdev,
2279                                int translation)
2280 {
2281         struct device_domain_info *info;
2282         unsigned long flags;
2283         int ret;
2284
2285         info = alloc_devinfo_mem();
2286         if (!info)
2287                 return -ENOMEM;
2288
2289         ret = domain_context_mapping(domain, pdev, translation);
2290         if (ret) {
2291                 free_devinfo_mem(info);
2292                 return ret;
2293         }
2294
2295         info->segment = pci_domain_nr(pdev->bus);
2296         info->bus = pdev->bus->number;
2297         info->devfn = pdev->devfn;
2298         info->dev = pdev;
2299         info->domain = domain;
2300
2301         spin_lock_irqsave(&device_domain_lock, flags);
2302         list_add(&info->link, &domain->devices);
2303         list_add(&info->global, &device_domain_list);
2304         pdev->dev.archdata.iommu = info;
2305         spin_unlock_irqrestore(&device_domain_lock, flags);
2306
2307         return 0;
2308 }
2309
/*
 * Decide whether @pdev should live in the static identity (1:1) domain.
 * @startup is nonzero during boot-time setup, before DMA masks are
 * known.  Returns 1 to identity-map the device, 0 otherwise.
 */
static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
{
	/* Device classes force-enabled via the IDENTMAP_* policy bits. */
	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
		return 1;

	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
		return 1;

	if (!(iommu_identity_mapping & IDENTMAP_ALL))
		return 0;

	/*
	 * We want to start off with all devices in the 1:1 domain, and
	 * take them out later if we find they can't access all of memory.
	 *
	 * However, we can't do this for PCI devices behind bridges,
	 * because all PCI devices behind the same bridge will end up
	 * with the same source-id on their transactions.
	 *
	 * Practically speaking, we can't change things around for these
	 * devices at run-time, because we can't be sure there'll be no
	 * DMA transactions in flight for any of their siblings.
	 *
	 * So PCI devices (unless they're on the root bus) as well as
	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
	 * the 1:1 domain, just in _case_ one of their siblings turns out
	 * not to be able to map all of memory.
	 */
	if (!pci_is_pcie(pdev)) {
		if (!pci_is_root_bus(pdev->bus))
			return 0;
		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
			return 0;
	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
		return 0;

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will -- if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = pdev->dma_mask;

		/* Use the stricter of the streaming and coherent masks. */
		if (pdev->dev.coherent_dma_mask &&
		    pdev->dev.coherent_dma_mask < dma_mask)
			dma_mask = pdev->dev.coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(&pdev->dev);
	}

	return 1;
}
2367
2368 static int __init iommu_prepare_static_identity_mapping(int hw)
2369 {
2370         struct pci_dev *pdev = NULL;
2371         int ret;
2372
2373         ret = si_domain_init(hw);
2374         if (ret)
2375                 return -EFAULT;
2376
2377         for_each_pci_dev(pdev) {
2378                 if (iommu_should_identity_map(pdev, 1)) {
2379                         ret = domain_add_dev_info(si_domain, pdev,
2380                                              hw ? CONTEXT_TT_PASS_THROUGH :
2381                                                   CONTEXT_TT_MULTI_LEVEL);
2382                         if (ret) {
2383                                 /* device not associated with an iommu */
2384                                 if (ret == -ENODEV)
2385                                         continue;
2386                                 return ret;
2387                         }
2388                         pr_info("IOMMU: %s identity mapping for device %s\n",
2389                                 hw ? "hardware" : "software", pci_name(pdev));
2390                 }
2391         }
2392
2393         return 0;
2394 }
2395
2396 static int __init init_dmars(void)
2397 {
2398         struct dmar_drhd_unit *drhd;
2399         struct dmar_rmrr_unit *rmrr;
2400         struct pci_dev *pdev;
2401         struct intel_iommu *iommu;
2402         int i, ret;
2403
2404         /*
2405          * for each drhd
2406          *    allocate root
2407          *    initialize and program root entry to not present
2408          * endfor
2409          */
2410         for_each_drhd_unit(drhd) {
2411                 /*
2412                  * lock not needed as this is only incremented in the single
2413                  * threaded kernel __init code path all other access are read
2414                  * only
2415                  */
2416                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2417                         g_num_of_iommus++;
2418                         continue;
2419                 }
2420                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2421                           IOMMU_UNITS_SUPPORTED);
2422         }
2423
2424         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2425                         GFP_KERNEL);
2426         if (!g_iommus) {
2427                 printk(KERN_ERR "Allocating global iommu array failed\n");
2428                 ret = -ENOMEM;
2429                 goto error;
2430         }
2431
2432         deferred_flush = kzalloc(g_num_of_iommus *
2433                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2434         if (!deferred_flush) {
2435                 ret = -ENOMEM;
2436                 goto error;
2437         }
2438
2439         for_each_drhd_unit(drhd) {
2440                 if (drhd->ignored)
2441                         continue;
2442
2443                 iommu = drhd->iommu;
2444                 g_iommus[iommu->seq_id] = iommu;
2445
2446                 ret = iommu_init_domains(iommu);
2447                 if (ret)
2448                         goto error;
2449
2450                 /*
2451                  * TBD:
2452                  * we could share the same root & context tables
2453                  * among all IOMMU's. Need to Split it later.
2454                  */
2455                 ret = iommu_alloc_root_entry(iommu);
2456                 if (ret) {
2457                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2458                         goto error;
2459                 }
2460                 if (!ecap_pass_through(iommu->ecap))
2461                         hw_pass_through = 0;
2462         }
2463
2464         /*
2465          * Start from the sane iommu hardware state.
2466          */
2467         for_each_drhd_unit(drhd) {
2468                 if (drhd->ignored)
2469                         continue;
2470
2471                 iommu = drhd->iommu;
2472
2473                 /*
2474                  * If the queued invalidation is already initialized by us
2475                  * (for example, while enabling interrupt-remapping) then
2476                  * we got the things already rolling from a sane state.
2477                  */
2478                 if (iommu->qi)
2479                         continue;
2480
2481                 /*
2482                  * Clear any previous faults.
2483                  */
2484                 dmar_fault(-1, iommu);
2485                 /*
2486                  * Disable queued invalidation if supported and already enabled
2487                  * before OS handover.
2488                  */
2489                 dmar_disable_qi(iommu);
2490         }
2491
2492         for_each_drhd_unit(drhd) {
2493                 if (drhd->ignored)
2494                         continue;
2495
2496                 iommu = drhd->iommu;
2497
2498                 if (dmar_enable_qi(iommu)) {
2499                         /*
2500                          * Queued Invalidate not enabled, use Register Based
2501                          * Invalidate
2502                          */
2503                         iommu->flush.flush_context = __iommu_flush_context;
2504                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2505                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2506                                "invalidation\n",
2507                                 iommu->seq_id,
2508                                (unsigned long long)drhd->reg_base_addr);
2509                 } else {
2510                         iommu->flush.flush_context = qi_flush_context;
2511                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2512                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2513                                "invalidation\n",
2514                                 iommu->seq_id,
2515                                (unsigned long long)drhd->reg_base_addr);
2516                 }
2517         }
2518
2519         if (iommu_pass_through)
2520                 iommu_identity_mapping |= IDENTMAP_ALL;
2521
2522 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2523         iommu_identity_mapping |= IDENTMAP_GFX;
2524 #endif
2525
2526         check_tylersburg_isoch();
2527
2528         /*
2529          * If pass through is not set or not enabled, setup context entries for
2530          * identity mappings for rmrr, gfx, and isa and may fall back to static
2531          * identity mapping if iommu_identity_mapping is set.
2532          */
2533         if (iommu_identity_mapping) {
2534                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2535                 if (ret) {
2536                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2537                         goto error;
2538                 }
2539         }
2540         /*
2541          * For each rmrr
2542          *   for each dev attached to rmrr
2543          *   do
2544          *     locate drhd for dev, alloc domain for dev
2545          *     allocate free domain
2546          *     allocate page table entries for rmrr
2547          *     if context not allocated for bus
2548          *           allocate and init context
2549          *           set present in root table for this bus
2550          *     init context with domain, translation etc
2551          *    endfor
2552          * endfor
2553          */
2554         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2555         for_each_rmrr_units(rmrr) {
2556                 for (i = 0; i < rmrr->devices_cnt; i++) {
2557                         pdev = rmrr->devices[i];
2558                         /*
2559                          * some BIOS lists non-exist devices in DMAR
2560                          * table.
2561                          */
2562                         if (!pdev)
2563                                 continue;
2564                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2565                         if (ret)
2566                                 printk(KERN_ERR
2567                                        "IOMMU: mapping reserved region failed\n");
2568                 }
2569         }
2570
2571         iommu_prepare_isa();
2572
2573         /*
2574          * for each drhd
2575          *   enable fault log
2576          *   global invalidate context cache
2577          *   global invalidate iotlb
2578          *   enable translation
2579          */
2580         for_each_drhd_unit(drhd) {
2581                 if (drhd->ignored) {
2582                         /*
2583                          * we always have to disable PMRs or DMA may fail on
2584                          * this device
2585                          */
2586                         if (force_on)
2587                                 iommu_disable_protect_mem_regions(drhd->iommu);
2588                         continue;
2589                 }
2590                 iommu = drhd->iommu;
2591
2592                 iommu_flush_write_buffer(iommu);
2593
2594                 ret = dmar_set_interrupt(iommu);
2595                 if (ret)
2596                         goto error;
2597
2598                 iommu_set_root_entry(iommu);
2599
2600                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2601                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2602
2603                 ret = iommu_enable_translation(iommu);
2604                 if (ret)
2605                         goto error;
2606
2607                 iommu_disable_protect_mem_regions(iommu);
2608         }
2609
2610         return 0;
2611 error:
2612         for_each_drhd_unit(drhd) {
2613                 if (drhd->ignored)
2614                         continue;
2615                 iommu = drhd->iommu;
2616                 free_iommu(iommu);
2617         }
2618         kfree(g_iommus);
2619         return ret;
2620 }
2621
2622 /* This takes a number of _MM_ pages, not VTD pages */
2623 static struct iova *intel_alloc_iova(struct device *dev,
2624                                      struct dmar_domain *domain,
2625                                      unsigned long nrpages, uint64_t dma_mask)
2626 {
2627         struct pci_dev *pdev = to_pci_dev(dev);
2628         struct iova *iova = NULL;
2629
2630         /* Restrict dma_mask to the width that the iommu can handle */
2631         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2632
2633         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2634                 /*
2635                  * First try to allocate an io virtual address in
2636                  * DMA_BIT_MASK(32) and if that fails then try allocating
2637                  * from higher range
2638                  */
2639                 iova = alloc_iova(&domain->iovad, nrpages,
2640                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2641                 if (iova)
2642                         return iova;
2643         }
2644         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2645         if (unlikely(!iova)) {
2646                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2647                        nrpages, pci_name(pdev));
2648                 return NULL;
2649         }
2650
2651         return iova;
2652 }
2653
2654 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2655 {
2656         struct dmar_domain *domain;
2657         int ret;
2658
2659         domain = get_domain_for_dev(pdev,
2660                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2661         if (!domain) {
2662                 printk(KERN_ERR
2663                         "Allocating domain for %s failed", pci_name(pdev));
2664                 return NULL;
2665         }
2666
2667         /* make sure context mapping is ok */
2668         if (unlikely(!domain_context_mapped(pdev))) {
2669                 ret = domain_context_mapping(domain, pdev,
2670                                              CONTEXT_TT_MULTI_LEVEL);
2671                 if (ret) {
2672                         printk(KERN_ERR
2673                                 "Domain context map for %s failed",
2674                                 pci_name(pdev));
2675                         return NULL;
2676                 }
2677         }
2678
2679         return domain;
2680 }
2681
2682 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2683 {
2684         struct device_domain_info *info;
2685
2686         /* No lock here, assumes no domain exit in normal case */
2687         info = dev->dev.archdata.iommu;
2688         if (likely(info))
2689                 return info->domain;
2690
2691         return __get_valid_domain_for_dev(dev);
2692 }
2693
2694 static int iommu_dummy(struct pci_dev *pdev)
2695 {
2696         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2697 }
2698
/* Check if the pdev needs to go through non-identity map and unmap process.*/
static int iommu_no_mapping(struct device *dev)
{
	struct pci_dev *pdev;
	int found;

	/* Non-PCI devices are never translated by this driver */
	if (unlikely(dev->bus != &pci_bus_type))
		return 1;

	pdev = to_pci_dev(dev);
	/* Devices explicitly marked dummy (e.g. by quirks) skip translation */
	if (iommu_dummy(pdev))
		return 1;

	/* No identity-mapping policy configured: always translate */
	if (!iommu_identity_mapping)
		return 0;

	found = identity_mapping(pdev);
	if (found) {
		if (iommu_should_identity_map(pdev, 0))
			return 1;
		else {
			/*
			 * 32 bit DMA is removed from si_domain and fall back
			 * to non-identity mapping.
			 */
			domain_remove_one_dev_info(si_domain, pdev);
			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
			       pci_name(pdev));
			return 0;
		}
	} else {
		/*
		 * In case of a detached 64 bit DMA device from vm, the device
		 * is put into si_domain for identity mapping.
		 */
		if (iommu_should_identity_map(pdev, 0)) {
			int ret;
			ret = domain_add_dev_info(si_domain, pdev,
						  hw_pass_through ?
						  CONTEXT_TT_PASS_THROUGH :
						  CONTEXT_TT_MULTI_LEVEL);
			if (!ret) {
				printk(KERN_INFO "64bit %s uses identity mapping\n",
				       pci_name(pdev));
				return 1;
			}
		}
	}

	/* Default: device goes through the map/unmap (translated) path */
	return 0;
}
2750
/*
 * Map one physically-contiguous buffer for DMA.
 *
 * Returns the DMA address to program into the device, the physical address
 * unchanged when the device bypasses translation (iommu_no_mapping()), or
 * 0 on failure — 0 doubles as the error sentinel checked by
 * intel_mapping_error().
 */
static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
                                     size_t size, int dir, u64 dma_mask)
{
        struct pci_dev *pdev = to_pci_dev(hwdev);
        struct dmar_domain *domain;
        phys_addr_t start_paddr;
        struct iova *iova;
        int prot = 0;
        int ret;
        struct intel_iommu *iommu;
        unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

        BUG_ON(dir == DMA_NONE);

        if (iommu_no_mapping(hwdev))
                return paddr;

        domain = get_valid_domain_for_dev(pdev);
        if (!domain)
                return 0;

        iommu = domain_get_iommu(domain);
        /* from here on, 'size' is a page count, not a byte count */
        size = aligned_nrpages(paddr, size);

        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
        if (!iova)
                goto error;

        /*
         * Check if DMAR supports zero-length reads on write only
         * mappings..
         */
        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
                        !cap_zlr(iommu->cap))
                prot |= DMA_PTE_READ;
        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
                prot |= DMA_PTE_WRITE;
        /*
         * paddr - (paddr + size) might be partial page, we should map the whole
         * page.  Note: if two part of one page are separately mapped, we
         * might have two guest_addr mapping to the same host paddr, but this
         * is not a big problem
         */
        ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
                                 mm_to_dma_pfn(paddr_pfn), size, prot);
        if (ret)
                goto error;

        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
                iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
        else
                iommu_flush_write_buffer(iommu);

        /* return the allocated IOVA plus the sub-page offset of the buffer */
        start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
        start_paddr += paddr & ~PAGE_MASK;
        return start_paddr;

error:
        if (iova)
                __free_iova(&domain->iovad, iova);
        printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
                pci_name(pdev), size, (unsigned long long)paddr, dir);
        return 0;
}
2816
2817 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2818                                  unsigned long offset, size_t size,
2819                                  enum dma_data_direction dir,
2820                                  struct dma_attrs *attrs)
2821 {
2822         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2823                                   dir, to_pci_dev(dev)->dma_mask);
2824 }
2825
/*
 * Drain every deferred IOTLB flush and release the queued IOVAs.
 * Callers in this file (flush_unmaps_timeout(), add_unmap()) invoke this
 * with async_umap_flush_lock held.
 */
static void flush_unmaps(void)
{
        int i, j;

        /* timer is re-armed by the next add_unmap() */
        timer_on = 0;

        /* just flush them all */
        for (i = 0; i < g_num_of_iommus; i++) {
                struct intel_iommu *iommu = g_iommus[i];
                if (!iommu)
                        continue;

                if (!deferred_flush[i].next)
                        continue;

                /* In caching mode, global flushes turn emulation expensive */
                if (!cap_caching_mode(iommu->cap))
                        iommu->flush.flush_iotlb(iommu, 0, 0, 0,
                                         DMA_TLB_GLOBAL_FLUSH);
                for (j = 0; j < deferred_flush[i].next; j++) {
                        unsigned long mask;
                        struct iova *iova = deferred_flush[i].iova[j];
                        struct dmar_domain *domain = deferred_flush[i].domain[j];

                        /* On real hardware multiple invalidations are expensive */
                        if (cap_caching_mode(iommu->cap))
                                iommu_flush_iotlb_psi(iommu, domain->id,
                                iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
                        else {
                                mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
                                iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
                                                (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
                        }
                        __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
                }
                /* queue for this IOMMU is now empty */
                deferred_flush[i].next = 0;
        }

        list_size = 0;
}
2866
2867 static void flush_unmaps_timeout(unsigned long data)
2868 {
2869         unsigned long flags;
2870
2871         spin_lock_irqsave(&async_umap_flush_lock, flags);
2872         flush_unmaps();
2873         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2874 }
2875
/*
 * Queue an IOVA for deferred (batched) IOTLB invalidation and release.
 * The entry is drained either by the 10ms unmap_timer or synchronously
 * here once list_size hits HIGH_WATER_MARK.
 */
static void add_unmap(struct dmar_domain *dom, struct iova *iova)
{
        unsigned long flags;
        int next, iommu_id;
        struct intel_iommu *iommu;

        spin_lock_irqsave(&async_umap_flush_lock, flags);
        /* queue is full: flush everything now to make room */
        if (list_size == HIGH_WATER_MARK)
                flush_unmaps();

        iommu = domain_get_iommu(dom);
        iommu_id = iommu->seq_id;

        /* append to the per-IOMMU deferred-flush ring */
        next = deferred_flush[iommu_id].next;
        deferred_flush[iommu_id].domain[next] = dom;
        deferred_flush[iommu_id].iova[next] = iova;
        deferred_flush[iommu_id].next++;

        if (!timer_on) {
                mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
                timer_on = 1;
        }
        list_size++;
        spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
2901
/*
 * dma_map_ops .unmap_page hook: tear down a mapping created by
 * intel_map_page()/__intel_map_single(). In strict mode the IOTLB is
 * flushed synchronously; otherwise the flush and IOVA release are
 * deferred via add_unmap().
 */
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
                             size_t size, enum dma_data_direction dir,
                             struct dma_attrs *attrs)
{
        struct pci_dev *pdev = to_pci_dev(dev);
        struct dmar_domain *domain;
        unsigned long start_pfn, last_pfn;
        struct iova *iova;
        struct intel_iommu *iommu;

        /* identity-mapped / dummy devices were never mapped here */
        if (iommu_no_mapping(dev))
                return;

        domain = find_domain(pdev);
        BUG_ON(!domain);

        iommu = domain_get_iommu(domain);

        iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
        if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
                      (unsigned long long)dev_addr))
                return;

        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

        pr_debug("Device %s unmapping: pfn %lx-%lx\n",
                 pci_name(pdev), start_pfn, last_pfn);

        /*  clear the whole page */
        dma_pte_clear_range(domain, start_pfn, last_pfn);

        /* free page tables */
        dma_pte_free_pagetable(domain, start_pfn, last_pfn);

        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
                                      last_pfn - start_pfn + 1, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
        } else {
                add_unmap(domain, iova);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
                 */
        }
}
2950
2951 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2952                                   dma_addr_t *dma_handle, gfp_t flags)
2953 {
2954         void *vaddr;
2955         int order;
2956
2957         size = PAGE_ALIGN(size);
2958         order = get_order(size);
2959
2960         if (!iommu_no_mapping(hwdev))
2961                 flags &= ~(GFP_DMA | GFP_DMA32);
2962         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2963                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2964                         flags |= GFP_DMA;
2965                 else
2966                         flags |= GFP_DMA32;
2967         }
2968
2969         vaddr = (void *)__get_free_pages(flags, order);
2970         if (!vaddr)
2971                 return NULL;
2972         memset(vaddr, 0, size);
2973
2974         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2975                                          DMA_BIDIRECTIONAL,
2976                                          hwdev->coherent_dma_mask);
2977         if (*dma_handle)
2978                 return vaddr;
2979         free_pages((unsigned long)vaddr, order);
2980         return NULL;
2981 }
2982
2983 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2984                                 dma_addr_t dma_handle)
2985 {
2986         int order;
2987
2988         size = PAGE_ALIGN(size);
2989         order = get_order(size);
2990
2991         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2992         free_pages((unsigned long)vaddr, order);
2993 }
2994
/*
 * dma_map_ops .unmap_sg hook: tear down a scatterlist mapping created by
 * intel_map_sg(). The whole list was backed by a single IOVA allocation,
 * so only the first entry's dma_address is needed to find it. Mirrors
 * intel_unmap_page(): strict mode flushes synchronously, otherwise the
 * flush and IOVA release are deferred via add_unmap().
 */
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
                           int nelems, enum dma_data_direction dir,
                           struct dma_attrs *attrs)
{
        struct pci_dev *pdev = to_pci_dev(hwdev);
        struct dmar_domain *domain;
        unsigned long start_pfn, last_pfn;
        struct iova *iova;
        struct intel_iommu *iommu;

        /* identity-mapped / dummy devices were never mapped here */
        if (iommu_no_mapping(hwdev))
                return;

        domain = find_domain(pdev);
        BUG_ON(!domain);

        iommu = domain_get_iommu(domain);

        iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
        if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
                      (unsigned long long)sglist[0].dma_address))
                return;

        start_pfn = mm_to_dma_pfn(iova->pfn_lo);
        last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

        /*  clear the whole page */
        dma_pte_clear_range(domain, start_pfn, last_pfn);

        /* free page tables */
        dma_pte_free_pagetable(domain, start_pfn, last_pfn);

        if (intel_iommu_strict) {
                iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
                                      last_pfn - start_pfn + 1, 0);
                /* free iova */
                __free_iova(&domain->iovad, iova);
        } else {
                add_unmap(domain, iova);
                /*
                 * queue up the release of the unmap to save the 1/6th of the
                 * cpu used up by the iotlb flush operation...
                 */
        }
}
3040
3041 static int intel_nontranslate_map_sg(struct device *hddev,
3042         struct scatterlist *sglist, int nelems, int dir)
3043 {
3044         int i;
3045         struct scatterlist *sg;
3046
3047         for_each_sg(sglist, sg, nelems, i) {
3048                 BUG_ON(!sg_page(sg));
3049                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3050                 sg->dma_length = sg->length;
3051         }
3052         return nelems;
3053 }
3054
/*
 * dma_map_ops .map_sg hook: map a scatterlist through the IOMMU using a
 * single contiguous IOVA allocation sized for the whole list. Returns the
 * number of mapped entries, or 0 on failure (with sglist->dma_length
 * zeroed when IOVA allocation fails).
 */
static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
                        enum dma_data_direction dir, struct dma_attrs *attrs)
{
        int i;
        struct pci_dev *pdev = to_pci_dev(hwdev);
        struct dmar_domain *domain;
        size_t size = 0;
        int prot = 0;
        struct iova *iova = NULL;
        int ret;
        struct scatterlist *sg;
        unsigned long start_vpfn;
        struct intel_iommu *iommu;

        BUG_ON(dir == DMA_NONE);
        if (iommu_no_mapping(hwdev))
                return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

        domain = get_valid_domain_for_dev(pdev);
        if (!domain)
                return 0;

        iommu = domain_get_iommu(domain);

        /* total page count needed for the whole scatterlist */
        for_each_sg(sglist, sg, nelems, i)
                size += aligned_nrpages(sg->offset, sg->length);

        iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
                                pdev->dma_mask);
        if (!iova) {
                sglist->dma_length = 0;
                return 0;
        }

        /*
         * Check if DMAR supports zero-length reads on write only
         * mappings..
         */
        if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
                        !cap_zlr(iommu->cap))
                prot |= DMA_PTE_READ;
        if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
                prot |= DMA_PTE_WRITE;

        start_vpfn = mm_to_dma_pfn(iova->pfn_lo);

        ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
        if (unlikely(ret)) {
                /* roll back everything allocated above */
                /*  clear the page */
                dma_pte_clear_range(domain, start_vpfn,
                                    start_vpfn + size - 1);
                /* free page tables */
                dma_pte_free_pagetable(domain, start_vpfn,
                                       start_vpfn + size - 1);
                /* free iova */
                __free_iova(&domain->iovad, iova);
                return 0;
        }

        /* it's a non-present to present mapping. Only flush if caching mode */
        if (cap_caching_mode(iommu->cap))
                iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
        else
                iommu_flush_write_buffer(iommu);

        return nelems;
}
3122
3123 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3124 {
3125         return !dma_addr;
3126 }
3127
/* DMA API operations installed when VT-d translation is in use */
struct dma_map_ops intel_dma_ops = {
        .alloc_coherent = intel_alloc_coherent,
        .free_coherent = intel_free_coherent,
        .map_sg = intel_map_sg,
        .unmap_sg = intel_unmap_sg,
        .map_page = intel_map_page,
        .unmap_page = intel_unmap_page,
        .mapping_error = intel_mapping_error,
};
3137
3138 static inline int iommu_domain_cache_init(void)
3139 {
3140         int ret = 0;
3141
3142         iommu_domain_cache = kmem_cache_create("iommu_domain",
3143                                          sizeof(struct dmar_domain),
3144                                          0,
3145                                          SLAB_HWCACHE_ALIGN,
3146
3147                                          NULL);
3148         if (!iommu_domain_cache) {
3149                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3150                 ret = -ENOMEM;
3151         }
3152
3153         return ret;
3154 }
3155
3156 static inline int iommu_devinfo_cache_init(void)
3157 {
3158         int ret = 0;
3159
3160         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3161                                          sizeof(struct device_domain_info),
3162                                          0,
3163                                          SLAB_HWCACHE_ALIGN,
3164                                          NULL);
3165         if (!iommu_devinfo_cache) {
3166                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3167                 ret = -ENOMEM;
3168         }
3169
3170         return ret;
3171 }
3172
3173 static inline int iommu_iova_cache_init(void)
3174 {
3175         int ret = 0;
3176
3177         iommu_iova_cache = kmem_cache_create("iommu_iova",
3178                                          sizeof(struct iova),
3179                                          0,
3180                                          SLAB_HWCACHE_ALIGN,
3181                                          NULL);
3182         if (!iommu_iova_cache) {
3183                 printk(KERN_ERR "Couldn't create iova cache\n");
3184                 ret = -ENOMEM;
3185         }
3186
3187         return ret;
3188 }
3189
3190 static int __init iommu_init_mempool(void)
3191 {
3192         int ret;
3193         ret = iommu_iova_cache_init();
3194         if (ret)
3195                 return ret;
3196
3197         ret = iommu_domain_cache_init();
3198         if (ret)
3199                 goto domain_error;
3200
3201         ret = iommu_devinfo_cache_init();
3202         if (!ret)
3203                 return ret;
3204
3205         kmem_cache_destroy(iommu_domain_cache);
3206 domain_error:
3207         kmem_cache_destroy(iommu_iova_cache);
3208
3209         return -ENOMEM;
3210 }
3211
3212 static void __init iommu_exit_mempool(void)
3213 {
3214         kmem_cache_destroy(iommu_devinfo_cache);
3215         kmem_cache_destroy(iommu_domain_cache);
3216         kmem_cache_destroy(iommu_iova_cache);
3217
3218 }
3219
/*
 * PCI enable-time quirk for the SNB QuickData (IOAT) device: detect a
 * BIOS that assigns the device to the wrong VT-d unit and, if so, mark
 * the device as untranslated (DUMMY_DEVICE_DOMAIN_INFO) and taint the
 * kernel with TAINT_FIRMWARE_WORKAROUND.
 */
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
        struct dmar_drhd_unit *drhd;
        u32 vtbar;
        int rc;

        /* We know that this device on this chipset has its own IOMMU.
         * If we find it under a different IOMMU, then the BIOS is lying
         * to us. Hope that the IOMMU for this device is actually
         * disabled, and it needs no translation...
         */
        /* read the VT-d base address register from the host bridge (0:0.0) */
        rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
        if (rc) {
                /* "can't" happen */
                dev_info(&pdev->dev, "failed to run vt-d quirk\n");
                return;
        }
        vtbar &= 0xffff0000;

        /* we know that the this iommu should be at offset 0xa000 from vtbar */
        drhd = dmar_find_matched_drhd_unit(pdev);
        if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
                            TAINT_FIRMWARE_WORKAROUND,
                            "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
                pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3247
/*
 * Decide which DMAR units to bypass entirely.
 *
 * Pass 1: ignore any non-include-all unit whose device list contains no
 * actually-present PCI devices (BIOSes list phantom devices).
 * Pass 2: for units that cover *only* graphics devices, either record
 * that gfx is behind an IOMMU (dmar_map_gfx set) or bypass the unit and
 * mark its devices as untranslated dummies.
 */
static void __init init_no_remapping_devices(void)
{
        struct dmar_drhd_unit *drhd;

        for_each_drhd_unit(drhd) {
                if (!drhd->include_all) {
                        int i;
                        for (i = 0; i < drhd->devices_cnt; i++)
                                if (drhd->devices[i] != NULL)
                                        break;
                        /* ignore DMAR unit if no pci devices exist */
                        if (i == drhd->devices_cnt)
                                drhd->ignored = 1;
                }
        }

        for_each_drhd_unit(drhd) {
                int i;
                if (drhd->ignored || drhd->include_all)
                        continue;

                /* look for any non-graphics device under this unit */
                for (i = 0; i < drhd->devices_cnt; i++)
                        if (drhd->devices[i] &&
                            !IS_GFX_DEVICE(drhd->devices[i]))
                                break;

                if (i < drhd->devices_cnt)
                        continue;

                /* This IOMMU has *only* gfx devices. Either bypass it or
                   set the gfx_mapped flag, as appropriate */
                if (dmar_map_gfx) {
                        intel_iommu_gfx_mapped = 1;
                } else {
                        drhd->ignored = 1;
                        for (i = 0; i < drhd->devices_cnt; i++) {
                                if (!drhd->devices[i])
                                        continue;
                                drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
                        }
                }
        }
}
3291
3292 #ifdef CONFIG_SUSPEND
3293 static int init_iommu_hw(void)
3294 {
3295         struct dmar_drhd_unit *drhd;
3296         struct intel_iommu *iommu = NULL;
3297
3298         for_each_active_iommu(iommu, drhd)
3299                 if (iommu->qi)
3300                         dmar_reenable_qi(iommu);
3301
3302         for_each_iommu(iommu, drhd) {