drivers/iommu/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/irq_remapping.h>
46 #include <asm/cacheflush.h>
47 #include <asm/iommu.h>
48
49 #include "irq_remapping.h"
50
51 #define ROOT_SIZE               VTD_PAGE_SIZE
52 #define CONTEXT_SIZE            VTD_PAGE_SIZE
53
54 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
55 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
56 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
57
58 #define IOAPIC_RANGE_START      (0xfee00000)
59 #define IOAPIC_RANGE_END        (0xfeefffff)
60 #define IOVA_START_ADDR         (0x1000)
61
62 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
63
64 #define MAX_AGAW_WIDTH 64
65
66 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
67 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
68
69 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
70    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
71 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
72                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
73 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
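/*
 * Worked example (illustrative, not from the original source): for the
 * default 48-bit width, __DOMAIN_MAX_PFN(48) = (1ULL << 36) - 1.  On a
 * 64-bit kernel DOMAIN_MAX_PFN(48) passes that value through unchanged,
 * while on a 32-bit kernel the min_t() above clamps it to ULONG_MAX so
 * PFNs still fit in an unsigned long.
 */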
74
75 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
76 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
77 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
78
79 /* page table handling */
80 #define LEVEL_STRIDE            (9)
81 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
82
83 /*
84  * This bitmap is used to advertise the page sizes our hardware supports
85  * to the IOMMU core, which will then use this information to split
86  * physically contiguous memory regions it is mapping into page sizes
87  * that we support.
88  *
89  * Traditionally the IOMMU core just handed us the mappings directly,
90  * after making sure the size was a power-of-two multiple of 4KiB and
91  * that the mapping had natural alignment.
92  *
93  * To retain this behavior, we currently advertise that we support
94  * all page sizes that are a power-of-two multiple of 4KiB.
95  *
96  * If at some point we'd like to utilize the IOMMU core's new behavior,
97  * we could change this to advertise the real page sizes we support.
98  */
99 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
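/*
 * Illustrative sketch (an assumption, not part of the driver): because
 * every bit from bit 12 upwards is set in INTEL_IOMMU_PGSIZES, any
 * power-of-two size of at least 4KiB tests as an advertised page size.
 * A hypothetical helper to make that concrete:
 *
 *	static bool pgsize_advertised(unsigned long size)
 *	{
 *		return is_power_of_2(size) && (size & INTEL_IOMMU_PGSIZES);
 *	}
 *
 * e.g. 4KiB, 2MiB and 1GiB all pass, while 1KiB and 6KiB do not.
 */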
100
101 static inline int agaw_to_level(int agaw)
102 {
103         return agaw + 2;
104 }
105
106 static inline int agaw_to_width(int agaw)
107 {
108         return 30 + agaw * LEVEL_STRIDE;
109 }
110
111 static inline int width_to_agaw(int width)
112 {
113         return (width - 30) / LEVEL_STRIDE;
114 }
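/*
 * Worked example (illustrative): the default 48-bit address width gives
 * width_to_agaw(48) = (48 - 30) / 9 = 2, agaw_to_level(2) = 4 selects a
 * 4-level page table, and agaw_to_width(2) = 30 + 2 * 9 = 48 round-trips
 * back to the original width.
 */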
115
116 static inline unsigned int level_to_offset_bits(int level)
117 {
118         return (level - 1) * LEVEL_STRIDE;
119 }
120
121 static inline int pfn_level_offset(unsigned long pfn, int level)
122 {
123         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
124 }
125
126 static inline unsigned long level_mask(int level)
127 {
128         return -1UL << level_to_offset_bits(level);
129 }
130
131 static inline unsigned long level_size(int level)
132 {
133         return 1UL << level_to_offset_bits(level);
134 }
135
136 static inline unsigned long align_to_level(unsigned long pfn, int level)
137 {
138         return (pfn + level_size(level) - 1) & level_mask(level);
139 }
140
141 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
142 {
143         return  1 << ((lvl - 1) * LEVEL_STRIDE);
144 }
145
146 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
147    are never going to work. */
148 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
149 {
150         return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
151 }
152
153 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
154 {
155         return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
156 }
157 static inline unsigned long page_to_dma_pfn(struct page *pg)
158 {
159         return mm_to_dma_pfn(page_to_pfn(pg));
160 }
161 static inline unsigned long virt_to_dma_pfn(void *p)
162 {
163         return page_to_dma_pfn(virt_to_page(p));
164 }
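/*
 * Worked example (illustrative): on x86 both PAGE_SHIFT and VTD_PAGE_SHIFT
 * are 12, so the two conversions above shift by zero and are effectively
 * identity operations.  On a hypothetical configuration with 64KiB MM
 * pages (PAGE_SHIFT == 16), one MM pfn would span 16 VT-d pfns and
 * mm_to_dma_pfn(1) would return 16.
 */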
165
166 /* global iommu list, set NULL for ignored DMAR units */
167 static struct intel_iommu **g_iommus;
168
169 static void __init check_tylersburg_isoch(void);
170 static int rwbf_quirk;
171
172 /*
173  * set to 1 to panic the kernel if VT-d can't be enabled successfully
174  * (used when kernel is launched w/ TXT)
175  */
176 static int force_on = 0;
177
178 /*
179  * 0: Present
180  * 1-11: Reserved
181  * 12-63: Context Ptr (12 - (haw-1))
182  * 64-127: Reserved
183  */
184 struct root_entry {
185         u64     val;
186         u64     rsvd1;
187 };
188 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
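/*
 * Worked example (illustrative): sizeof(struct root_entry) is 16 bytes, so
 * ROOT_ENTRY_NR = 4096 / 16 = 256 -- one root entry for every possible PCI
 * bus number, which is why the root table is indexed as
 * iommu->root_entry[bus] below.
 */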
189 static inline bool root_present(struct root_entry *root)
190 {
191         return (root->val & 1);
192 }
193 static inline void set_root_present(struct root_entry *root)
194 {
195         root->val |= 1;
196 }
197 static inline void set_root_value(struct root_entry *root, unsigned long value)
198 {
199         root->val |= value & VTD_PAGE_MASK;
200 }
201
202 static inline struct context_entry *
203 get_context_addr_from_root(struct root_entry *root)
204 {
205         if (!root_present(root))
206                 return NULL;
207         return (struct context_entry *)
208                 phys_to_virt(root->val & VTD_PAGE_MASK);
209 }
210
211 /*
212  * low 64 bits:
213  * 0: present
214  * 1: fault processing disable
215  * 2-3: translation type
216  * 12-63: address space root
217  * high 64 bits:
218  * 0-2: address width
219  * 3-6: aval
220  * 8-23: domain id
221  */
222 struct context_entry {
223         u64 lo;
224         u64 hi;
225 };
226
227 static inline bool context_present(struct context_entry *context)
228 {
229         return (context->lo & 1);
230 }
231 static inline void context_set_present(struct context_entry *context)
232 {
233         context->lo |= 1;
234 }
235
236 static inline void context_set_fault_enable(struct context_entry *context)
237 {
238         context->lo &= (((u64)-1) << 2) | 1;
239 }
240
241 static inline void context_set_translation_type(struct context_entry *context,
242                                                 unsigned long value)
243 {
244         context->lo &= (((u64)-1) << 4) | 3;
245         context->lo |= (value & 3) << 2;
246 }
247
248 static inline void context_set_address_root(struct context_entry *context,
249                                             unsigned long value)
250 {
251         context->lo |= value & VTD_PAGE_MASK;
252 }
253
254 static inline void context_set_address_width(struct context_entry *context,
255                                              unsigned long value)
256 {
257         context->hi |= value & 7;
258 }
259
260 static inline void context_set_domain_id(struct context_entry *context,
261                                          unsigned long value)
262 {
263         context->hi |= (value & ((1 << 16) - 1)) << 8;
264 }
265
266 static inline void context_clear_entry(struct context_entry *context)
267 {
268         context->lo = 0;
269         context->hi = 0;
270 }
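/*
 * Illustrative sketch (hypothetical values, not from the original source):
 * how the helpers above combine to populate a multi-level context entry,
 * mirroring what domain_context_mapping_one() does later in this file:
 *
 *	struct context_entry ce = { 0, 0 };
 *
 *	context_set_domain_id(&ce, 42);
 *	context_set_address_width(&ce, agaw);
 *	context_set_address_root(&ce, virt_to_phys(pgd));
 *	context_set_translation_type(&ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(&ce);
 *	context_set_present(&ce);
 */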
271
272 /*
273  * 0: readable
274  * 1: writable
275  * 2-6: reserved
276  * 7: super page
277  * 8-10: available
278  * 11: snoop behavior
279  * 12-63: Host physical address
280  */
281 struct dma_pte {
282         u64 val;
283 };
284
285 static inline void dma_clear_pte(struct dma_pte *pte)
286 {
287         pte->val = 0;
288 }
289
290 static inline void dma_set_pte_readable(struct dma_pte *pte)
291 {
292         pte->val |= DMA_PTE_READ;
293 }
294
295 static inline void dma_set_pte_writable(struct dma_pte *pte)
296 {
297         pte->val |= DMA_PTE_WRITE;
298 }
299
300 static inline void dma_set_pte_snp(struct dma_pte *pte)
301 {
302         pte->val |= DMA_PTE_SNP;
303 }
304
305 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
306 {
307         pte->val = (pte->val & ~3) | (prot & 3);
308 }
309
310 static inline u64 dma_pte_addr(struct dma_pte *pte)
311 {
312 #ifdef CONFIG_64BIT
313         return pte->val & VTD_PAGE_MASK;
314 #else
315         /* Must have a full atomic 64-bit read */
316         return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
317 #endif
318 }
319
320 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
321 {
322         pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
323 }
324
325 static inline bool dma_pte_present(struct dma_pte *pte)
326 {
327         return (pte->val & 3) != 0;
328 }
329
330 static inline bool dma_pte_superpage(struct dma_pte *pte)
331 {
332         return (pte->val & (1 << 7));
333 }
334
335 static inline int first_pte_in_page(struct dma_pte *pte)
336 {
337         return !((unsigned long)pte & ~VTD_PAGE_MASK);
338 }
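/*
 * Illustrative sketch (hypothetical, for clarity only): building a leaf
 * PTE for some pfn with the helpers above:
 *
 *	struct dma_pte pte = { 0 };
 *
 *	dma_set_pte_readable(&pte);
 *	dma_set_pte_writable(&pte);
 *	dma_set_pte_pfn(&pte, pfn);
 *
 * pte.val now carries DMA_PTE_READ | DMA_PTE_WRITE in the low bits and the
 * page frame address (pfn << VTD_PAGE_SHIFT) in bits 12-63, so
 * dma_pte_present(&pte) is true and dma_pte_addr(&pte) recovers the
 * physical address.
 */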
339
340 /*
341  * This domain is a static identity mapping domain.
342  *      1. This domain creates a static 1:1 mapping to all usable memory.
343  *      2. It maps to each iommu if successful.
344  *      3. Each iommu maps to this domain if successful.
345  */
346 static struct dmar_domain *si_domain;
347 static int hw_pass_through = 1;
348
349 /* devices under the same p2p bridge are owned in one domain */
350 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
351
352 /* domain represents a virtual machine; more than one device
353  * across iommus may be owned in one domain, e.g. a kvm guest.
354  */
355 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
356
357 /* si_domain contains multiple devices */
358 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
359
360 /* define the limit of IOMMUs supported in each domain */
361 #ifdef  CONFIG_X86
362 # define        IOMMU_UNITS_SUPPORTED   MAX_IO_APICS
363 #else
364 # define        IOMMU_UNITS_SUPPORTED   64
365 #endif
366
367 struct dmar_domain {
368         int     id;                     /* domain id */
369         int     nid;                    /* node id */
370         DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
371                                         /* bitmap of iommus this domain uses*/
372
373         struct list_head devices;       /* all devices' list */
374         struct iova_domain iovad;       /* iova's that belong to this domain */
375
376         struct dma_pte  *pgd;           /* virtual address */
377         int             gaw;            /* max guest address width */
378
379         /* adjusted guest address width, 0 is level 2 30-bit */
380         int             agaw;
381
382         int             flags;          /* flags to find out type of domain */
383
384         int             iommu_coherency;/* indicate coherency of iommu access */
385         int             iommu_snooping; /* indicate snooping control feature*/
386         int             iommu_count;    /* reference count of iommu */
387         int             iommu_superpage;/* Level of superpages supported:
388                                            0 == 4KiB (no superpages), 1 == 2MiB,
389                                            2 == 1GiB, 3 == 512GiB, 4 == 256TiB */
390         spinlock_t      iommu_lock;     /* protect iommu set in domain */
391         u64             max_addr;       /* maximum mapped address */
392 };
393
394 /* PCI domain-device relationship */
395 struct device_domain_info {
396         struct list_head link;  /* link to domain siblings */
397         struct list_head global; /* link to global list */
398         int segment;            /* PCI domain */
399         u8 bus;                 /* PCI bus number */
400         u8 devfn;               /* PCI devfn number */
401         struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
402         struct intel_iommu *iommu; /* IOMMU used by this device */
403         struct dmar_domain *domain; /* pointer to domain */
404 };
405
406 static void flush_unmaps_timeout(unsigned long data);
407
408 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
409
410 #define HIGH_WATER_MARK 250
411 struct deferred_flush_tables {
412         int next;
413         struct iova *iova[HIGH_WATER_MARK];
414         struct dmar_domain *domain[HIGH_WATER_MARK];
415 };
416
417 static struct deferred_flush_tables *deferred_flush;
418
419 /* number of registered intel_iommus; used to size g_iommus */
420 static int g_num_of_iommus;
421
422 static DEFINE_SPINLOCK(async_umap_flush_lock);
423 static LIST_HEAD(unmaps_to_do);
424
425 static int timer_on;
426 static long list_size;
427
428 static void domain_remove_dev_info(struct dmar_domain *domain);
429
430 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
431 int dmar_disabled = 0;
432 #else
433 int dmar_disabled = 1;
434 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
435
436 int intel_iommu_enabled = 0;
437 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
438
439 static int dmar_map_gfx = 1;
440 static int dmar_forcedac;
441 static int intel_iommu_strict;
442 static int intel_iommu_superpage = 1;
443
444 int intel_iommu_gfx_mapped;
445 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
446
447 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
448 static DEFINE_SPINLOCK(device_domain_lock);
449 static LIST_HEAD(device_domain_list);
450
451 static struct iommu_ops intel_iommu_ops;
452
453 static int __init intel_iommu_setup(char *str)
454 {
455         if (!str)
456                 return -EINVAL;
457         while (*str) {
458                 if (!strncmp(str, "on", 2)) {
459                         dmar_disabled = 0;
460                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
461                 } else if (!strncmp(str, "off", 3)) {
462                         dmar_disabled = 1;
463                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
464                 } else if (!strncmp(str, "igfx_off", 8)) {
465                         dmar_map_gfx = 0;
466                         printk(KERN_INFO
467                                 "Intel-IOMMU: disable GFX device mapping\n");
468                 } else if (!strncmp(str, "forcedac", 8)) {
469                         printk(KERN_INFO
470                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
471                         dmar_forcedac = 1;
472                 } else if (!strncmp(str, "strict", 6)) {
473                         printk(KERN_INFO
474                                 "Intel-IOMMU: disable batched IOTLB flush\n");
475                         intel_iommu_strict = 1;
476                 } else if (!strncmp(str, "sp_off", 6)) {
477                         printk(KERN_INFO
478                                 "Intel-IOMMU: disable supported super page\n");
479                         intel_iommu_superpage = 0;
480                 }
481
482                 str += strcspn(str, ",");
483                 while (*str == ',')
484                         str++;
485         }
486         return 0;
487 }
488 __setup("intel_iommu=", intel_iommu_setup);
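/*
 * Usage example (illustrative): the parser above splits on commas, so
 * several options can be combined on the kernel command line, e.g.
 *
 *	intel_iommu=on,strict,igfx_off
 *
 * which enables the IOMMU, disables batched (lazy) IOTLB flushing and
 * skips mapping of the integrated graphics device.
 */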
489
490 static struct kmem_cache *iommu_domain_cache;
491 static struct kmem_cache *iommu_devinfo_cache;
492 static struct kmem_cache *iommu_iova_cache;
493
494 static inline void *alloc_pgtable_page(int node)
495 {
496         struct page *page;
497         void *vaddr = NULL;
498
499         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
500         if (page)
501                 vaddr = page_address(page);
502         return vaddr;
503 }
504
505 static inline void free_pgtable_page(void *vaddr)
506 {
507         free_page((unsigned long)vaddr);
508 }
509
510 static inline void *alloc_domain_mem(void)
511 {
512         return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
513 }
514
515 static void free_domain_mem(void *vaddr)
516 {
517         kmem_cache_free(iommu_domain_cache, vaddr);
518 }
519
520 static inline void *alloc_devinfo_mem(void)
521 {
522         return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
523 }
524
525 static inline void free_devinfo_mem(void *vaddr)
526 {
527         kmem_cache_free(iommu_devinfo_cache, vaddr);
528 }
529
530 struct iova *alloc_iova_mem(void)
531 {
532         return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
533 }
534
535 void free_iova_mem(struct iova *iova)
536 {
537         kmem_cache_free(iommu_iova_cache, iova);
538 }
539
540
541 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
542 {
543         unsigned long sagaw;
544         int agaw = -1;
545
546         sagaw = cap_sagaw(iommu->cap);
547         for (agaw = width_to_agaw(max_gaw);
548              agaw >= 0; agaw--) {
549                 if (test_bit(agaw, &sagaw))
550                         break;
551         }
552
553         return agaw;
554 }
555
556 /*
557  * Calculate max SAGAW for each iommu.
558  */
559 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
560 {
561         return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
562 }
563
564 /*
565  * calculate agaw for each iommu.
566  * "SAGAW" may be different across iommus, use a default agaw, and
567  * get a supported less agaw for iommus that don't support the default agaw.
568  */
569 int iommu_calculate_agaw(struct intel_iommu *iommu)
570 {
571         return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
572 }
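/*
 * Worked example (illustrative): an iommu whose SAGAW capability reports
 * only 3- and 4-level table support has bits 1 and 2 set.  Starting from
 * width_to_agaw(48) = 2, __iommu_calculate_agaw() finds bit 2 set and
 * returns agaw = 2 (4-level, 48-bit); if only bit 1 were set it would
 * fall back to agaw = 1 (3-level, 39-bit).
 */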
573
574 /* This function only returns a single iommu in a domain */
575 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
576 {
577         int iommu_id;
578
579         /* si_domain and vm domain should not get here. */
580         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
581         BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
582
583         iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
584         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
585                 return NULL;
586
587         return g_iommus[iommu_id];
588 }
589
590 static void domain_update_iommu_coherency(struct dmar_domain *domain)
591 {
592         int i;
593
594         i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
595
596         domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
597
598         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
599                 if (!ecap_coherent(g_iommus[i]->ecap)) {
600                         domain->iommu_coherency = 0;
601                         break;
602                 }
603         }
604 }
605
606 static void domain_update_iommu_snooping(struct dmar_domain *domain)
607 {
608         int i;
609
610         domain->iommu_snooping = 1;
611
612         for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
613                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
614                         domain->iommu_snooping = 0;
615                         break;
616                 }
617         }
618 }
619
620 static void domain_update_iommu_superpage(struct dmar_domain *domain)
621 {
622         struct dmar_drhd_unit *drhd;
623         struct intel_iommu *iommu = NULL;
624         int mask = 0xf;
625
626         if (!intel_iommu_superpage) {
627                 domain->iommu_superpage = 0;
628                 return;
629         }
630
631         /* set iommu_superpage to the smallest common denominator */
632         for_each_active_iommu(iommu, drhd) {
633                 mask &= cap_super_page_val(iommu->cap);
634                 if (!mask) {
635                         break;
636                 }
637         }
638         domain->iommu_superpage = fls(mask);
639 }
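/*
 * Worked example (illustrative): if every active iommu reports 2MiB and
 * 1GiB superpage support, cap_super_page_val() yields 0x3 for each unit,
 * the accumulated mask stays 0x3 and fls(0x3) = 2, i.e. up to 1GiB
 * superpages are usable domain-wide.  If any unit reports no superpage
 * support the mask drops to 0 and fls(0) = 0 disables superpages for the
 * whole domain.
 */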
640
641 /* Some capabilities may be different across iommus */
642 static void domain_update_iommu_cap(struct dmar_domain *domain)
643 {
644         domain_update_iommu_coherency(domain);
645         domain_update_iommu_snooping(domain);
646         domain_update_iommu_superpage(domain);
647 }
648
649 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
650 {
651         struct dmar_drhd_unit *drhd = NULL;
652         int i;
653
654         for_each_drhd_unit(drhd) {
655                 if (drhd->ignored)
656                         continue;
657                 if (segment != drhd->segment)
658                         continue;
659
660                 for (i = 0; i < drhd->devices_cnt; i++) {
661                         if (drhd->devices[i] &&
662                             drhd->devices[i]->bus->number == bus &&
663                             drhd->devices[i]->devfn == devfn)
664                                 return drhd->iommu;
665                         if (drhd->devices[i] &&
666                             drhd->devices[i]->subordinate &&
667                             drhd->devices[i]->subordinate->number <= bus &&
668                             drhd->devices[i]->subordinate->busn_res.end >= bus)
669                                 return drhd->iommu;
670                 }
671
672                 if (drhd->include_all)
673                         return drhd->iommu;
674         }
675
676         return NULL;
677 }
678
679 static void domain_flush_cache(struct dmar_domain *domain,
680                                void *addr, int size)
681 {
682         if (!domain->iommu_coherency)
683                 clflush_cache_range(addr, size);
684 }
685
686 /* Gets context entry for a given bus and devfn */
687 static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
688                 u8 bus, u8 devfn)
689 {
690         struct root_entry *root;
691         struct context_entry *context;
692         unsigned long phy_addr;
693         unsigned long flags;
694
695         spin_lock_irqsave(&iommu->lock, flags);
696         root = &iommu->root_entry[bus];
697         context = get_context_addr_from_root(root);
698         if (!context) {
699                 context = (struct context_entry *)
700                                 alloc_pgtable_page(iommu->node);
701                 if (!context) {
702                         spin_unlock_irqrestore(&iommu->lock, flags);
703                         return NULL;
704                 }
705                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
706                 phy_addr = virt_to_phys((void *)context);
707                 set_root_value(root, phy_addr);
708                 set_root_present(root);
709                 __iommu_flush_cache(iommu, root, sizeof(*root));
710         }
711         spin_unlock_irqrestore(&iommu->lock, flags);
712         return &context[devfn];
713 }
714
715 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
716 {
717         struct root_entry *root;
718         struct context_entry *context;
719         int ret;
720         unsigned long flags;
721
722         spin_lock_irqsave(&iommu->lock, flags);
723         root = &iommu->root_entry[bus];
724         context = get_context_addr_from_root(root);
725         if (!context) {
726                 ret = 0;
727                 goto out;
728         }
729         ret = context_present(&context[devfn]);
730 out:
731         spin_unlock_irqrestore(&iommu->lock, flags);
732         return ret;
733 }
734
735 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
736 {
737         struct root_entry *root;
738         struct context_entry *context;
739         unsigned long flags;
740
741         spin_lock_irqsave(&iommu->lock, flags);
742         root = &iommu->root_entry[bus];
743         context = get_context_addr_from_root(root);
744         if (context) {
745                 context_clear_entry(&context[devfn]);
746                 __iommu_flush_cache(iommu, &context[devfn], \
747                         sizeof(*context));
748         }
749         spin_unlock_irqrestore(&iommu->lock, flags);
750 }
751
752 static void free_context_table(struct intel_iommu *iommu)
753 {
754         struct root_entry *root;
755         int i;
756         unsigned long flags;
757         struct context_entry *context;
758
759         spin_lock_irqsave(&iommu->lock, flags);
760         if (!iommu->root_entry) {
761                 goto out;
762         }
763         for (i = 0; i < ROOT_ENTRY_NR; i++) {
764                 root = &iommu->root_entry[i];
765                 context = get_context_addr_from_root(root);
766                 if (context)
767                         free_pgtable_page(context);
768         }
769         free_pgtable_page(iommu->root_entry);
770         iommu->root_entry = NULL;
771 out:
772         spin_unlock_irqrestore(&iommu->lock, flags);
773 }
774
775 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
776                                       unsigned long pfn, int target_level)
777 {
778         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
779         struct dma_pte *parent, *pte = NULL;
780         int level = agaw_to_level(domain->agaw);
781         int offset;
782
783         BUG_ON(!domain->pgd);
784         BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
785         parent = domain->pgd;
786
787         while (level > 0) {
788                 void *tmp_page;
789
790                 offset = pfn_level_offset(pfn, level);
791                 pte = &parent[offset];
792                 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
793                         break;
794                 if (level == target_level)
795                         break;
796
797                 if (!dma_pte_present(pte)) {
798                         uint64_t pteval;
799
800                         tmp_page = alloc_pgtable_page(domain->nid);
801
802                         if (!tmp_page)
803                                 return NULL;
804
805                         domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
806                         pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
807                         if (cmpxchg64(&pte->val, 0ULL, pteval)) {
808                                 /* Someone else set it while we were thinking; use theirs. */
809                                 free_pgtable_page(tmp_page);
810                         } else {
811                                 dma_pte_addr(pte);
812                                 domain_flush_cache(domain, pte, sizeof(*pte));
813                         }
814                 }
815                 parent = phys_to_virt(dma_pte_addr(pte));
816                 level--;
817         }
818
819         return pte;
820 }
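/*
 * Worked example (illustrative): with agaw = 2 (a 4-level table) and
 * pfn = 0x12345, pfn_level_offset() slices 9 bits per level:
 * level 4 -> (0x12345 >> 27) & 0x1ff = 0, level 3 -> (0x12345 >> 18) & 0x1ff = 0,
 * level 2 -> (0x12345 >> 9) & 0x1ff = 0x91, level 1 -> 0x12345 & 0x1ff = 0x145,
 * so the walk above descends through those four slots, allocating any
 * missing intermediate tables on the way down.
 */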
821
822
823 /* return address's pte at specific level */
824 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
825                                          unsigned long pfn,
826                                          int level, int *large_page)
827 {
828         struct dma_pte *parent, *pte = NULL;
829         int total = agaw_to_level(domain->agaw);
830         int offset;
831
832         parent = domain->pgd;
833         while (level <= total) {
834                 offset = pfn_level_offset(pfn, total);
835                 pte = &parent[offset];
836                 if (level == total)
837                         return pte;
838
839                 if (!dma_pte_present(pte)) {
840                         *large_page = total;
841                         break;
842                 }
843
844                 if (pte->val & DMA_PTE_LARGE_PAGE) {
845                         *large_page = total;
846                         return pte;
847                 }
848
849                 parent = phys_to_virt(dma_pte_addr(pte));
850                 total--;
851         }
852         return NULL;
853 }
854
855 /* clear last level pte, a tlb flush should be followed */
856 static int dma_pte_clear_range(struct dmar_domain *domain,
857                                 unsigned long start_pfn,
858                                 unsigned long last_pfn)
859 {
860         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
861         unsigned int large_page = 1;
862         struct dma_pte *first_pte, *pte;
863         int order;
864
865         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
866         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
867         BUG_ON(start_pfn > last_pfn);
868
869         /* we don't need lock here; nobody else touches the iova range */
870         do {
871                 large_page = 1;
872                 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
873                 if (!pte) {
874                         start_pfn = align_to_level(start_pfn + 1, large_page + 1);
875                         continue;
876                 }
877                 do {
878                         dma_clear_pte(pte);
879                         start_pfn += lvl_to_nr_pages(large_page);
880                         pte++;
881                 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
882
883                 domain_flush_cache(domain, first_pte,
884                                    (void *)pte - (void *)first_pte);
885
886         } while (start_pfn && start_pfn <= last_pfn);
887
888         order = (large_page - 1) * 9;
889         return order;
890 }
891
892 /* free page table pages. last level pte should already be cleared */
893 static void dma_pte_free_pagetable(struct dmar_domain *domain,
894                                    unsigned long start_pfn,
895                                    unsigned long last_pfn)
896 {
897         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
898         struct dma_pte *first_pte, *pte;
899         int total = agaw_to_level(domain->agaw);
900         int level;
901         unsigned long tmp;
902         int large_page = 2;
903
904         BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
905         BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
906         BUG_ON(start_pfn > last_pfn);
907
908         /* We don't need lock here; nobody else touches the iova range */
909         level = 2;
910         while (level <= total) {
911                 tmp = align_to_level(start_pfn, level);
912
913                 /* If we can't even clear one PTE at this level, we're done */
914                 if (tmp + level_size(level) - 1 > last_pfn)
915                         return;
916
917                 do {
918                         large_page = level;
919                         first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
920                         if (large_page > level)
921                                 level = large_page + 1;
922                         if (!pte) {
923                                 tmp = align_to_level(tmp + 1, level + 1);
924                                 continue;
925                         }
926                         do {
927                                 if (dma_pte_present(pte)) {
928                                         free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
929                                         dma_clear_pte(pte);
930                                 }
931                                 pte++;
932                                 tmp += level_size(level);
933                         } while (!first_pte_in_page(pte) &&
934                                  tmp + level_size(level) - 1 <= last_pfn);
935
936                         domain_flush_cache(domain, first_pte,
937                                            (void *)pte - (void *)first_pte);
938
939                 } while (tmp && tmp + level_size(level) - 1 <= last_pfn);
940                 level++;
941         }
942         /* free pgd */
943         if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
944                 free_pgtable_page(domain->pgd);
945                 domain->pgd = NULL;
946         }
947 }
948
949 /* iommu handling */
950 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
951 {
952         struct root_entry *root;
953         unsigned long flags;
954
955         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
956         if (!root)
957                 return -ENOMEM;
958
959         __iommu_flush_cache(iommu, root, ROOT_SIZE);
960
961         spin_lock_irqsave(&iommu->lock, flags);
962         iommu->root_entry = root;
963         spin_unlock_irqrestore(&iommu->lock, flags);
964
965         return 0;
966 }
967
968 static void iommu_set_root_entry(struct intel_iommu *iommu)
969 {
970         void *addr;
971         u32 sts;
972         unsigned long flag;
973
974         addr = iommu->root_entry;
975
976         raw_spin_lock_irqsave(&iommu->register_lock, flag);
977         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
978
979         writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
980
981         /* Make sure hardware completes it */
982         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
983                       readl, (sts & DMA_GSTS_RTPS), sts);
984
985         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
986 }
987
988 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
989 {
990         u32 val;
991         unsigned long flag;
992
993         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
994                 return;
995
996         raw_spin_lock_irqsave(&iommu->register_lock, flag);
997         writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
998
999         /* Make sure hardware completes it */
1000         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1001                       readl, (!(val & DMA_GSTS_WBFS)), val);
1002
1003         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1004 }
1005
1006 /* invalidate the context-cache and wait for hardware to complete the flush */
1007 static void __iommu_flush_context(struct intel_iommu *iommu,
1008                                   u16 did, u16 source_id, u8 function_mask,
1009                                   u64 type)
1010 {
1011         u64 val = 0;
1012         unsigned long flag;
1013
1014         switch (type) {
1015         case DMA_CCMD_GLOBAL_INVL:
1016                 val = DMA_CCMD_GLOBAL_INVL;
1017                 break;
1018         case DMA_CCMD_DOMAIN_INVL:
1019                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1020                 break;
1021         case DMA_CCMD_DEVICE_INVL:
1022                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1023                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1024                 break;
1025         default:
1026                 BUG();
1027         }
1028         val |= DMA_CCMD_ICC;
1029
1030         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1031         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1032
1033         /* Make sure hardware completes it */
1034         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1035                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1036
1037         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1038 }
1039
1040 /* invalidate the IOTLB and wait for hardware to complete the flush */
1041 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
1042                                 u64 addr, unsigned int size_order, u64 type)
1043 {
1044         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1045         u64 val = 0, val_iva = 0;
1046         unsigned long flag;
1047
1048         switch (type) {
1049         case DMA_TLB_GLOBAL_FLUSH:
1050                 /* global flush doesn't need to set IVA_REG */
1051                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1052                 break;
1053         case DMA_TLB_DSI_FLUSH:
1054                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1055                 break;
1056         case DMA_TLB_PSI_FLUSH:
1057                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1058                 /* Note: always flush non-leaf currently */
1059                 val_iva = size_order | addr;
1060                 break;
1061         default:
1062                 BUG();
1063         }
1064         /* Note: set drain read/write */
1065 #if 0
1066         /*
1067          * This is probably just to be extra safe; it looks like we can
1068          * ignore it without any impact.
1069          */
1070         if (cap_read_drain(iommu->cap))
1071                 val |= DMA_TLB_READ_DRAIN;
1072 #endif
1073         if (cap_write_drain(iommu->cap))
1074                 val |= DMA_TLB_WRITE_DRAIN;
1075
1076         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1077         /* Note: Only uses first TLB reg currently */
1078         if (val_iva)
1079                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1080         dmar_writeq(iommu->reg + tlb_offset + 8, val);
1081
1082         /* Make sure hardware completes it */
1083         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1084                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1085
1086         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1087
1088         /* check IOTLB invalidation granularity */
1089         if (DMA_TLB_IAIG(val) == 0)
1090                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1091         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1092                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1093                         (unsigned long long)DMA_TLB_IIRG(type),
1094                         (unsigned long long)DMA_TLB_IAIG(val));
1095 }
1096
1097 static struct device_domain_info *iommu_support_dev_iotlb(
1098         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1099 {
1100         int found = 0;
1101         unsigned long flags;
1102         struct device_domain_info *info;
1103         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1104
1105         if (!ecap_dev_iotlb_support(iommu->ecap))
1106                 return NULL;
1107
1108         if (!iommu->qi)
1109                 return NULL;
1110
1111         spin_lock_irqsave(&device_domain_lock, flags);
1112         list_for_each_entry(info, &domain->devices, link)
1113                 if (info->bus == bus && info->devfn == devfn) {
1114                         found = 1;
1115                         break;
1116                 }
1117         spin_unlock_irqrestore(&device_domain_lock, flags);
1118
1119         if (!found || !info->dev)
1120                 return NULL;
1121
1122         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1123                 return NULL;
1124
1125         if (!dmar_find_matched_atsr_unit(info->dev))
1126                 return NULL;
1127
1128         info->iommu = iommu;
1129
1130         return info;
1131 }
1132
1133 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1134 {
1135         if (!info)
1136                 return;
1137
1138         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1139 }
1140
1141 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1142 {
1143         if (!info->dev || !pci_ats_enabled(info->dev))
1144                 return;
1145
1146         pci_disable_ats(info->dev);
1147 }
1148
1149 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1150                                   u64 addr, unsigned mask)
1151 {
1152         u16 sid, qdep;
1153         unsigned long flags;
1154         struct device_domain_info *info;
1155
1156         spin_lock_irqsave(&device_domain_lock, flags);
1157         list_for_each_entry(info, &domain->devices, link) {
1158                 if (!info->dev || !pci_ats_enabled(info->dev))
1159                         continue;
1160
1161                 sid = info->bus << 8 | info->devfn;
1162                 qdep = pci_ats_queue_depth(info->dev);
1163                 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1164         }
1165         spin_unlock_irqrestore(&device_domain_lock, flags);
1166 }
1167
1168 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1169                                   unsigned long pfn, unsigned int pages, int map)
1170 {
1171         unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1172         uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1173
1174         BUG_ON(pages == 0);
1175
1176         /*
1177          * Fall back to domain-selective flush if there is no PSI support or
1178          * the size is too big.
1179          * PSI requires the page size to be 2 ^ x, and the base address to be
1180          * naturally aligned to that size.
1181          */
1182         if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1183                 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1184                                                 DMA_TLB_DSI_FLUSH);
1185         else
1186                 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1187                                                 DMA_TLB_PSI_FLUSH);
1188
1189         /*
1190          * In caching mode, changes of pages from non-present to present require
1191          * flush. However, device IOTLB doesn't need to be flushed in this case.
1192          */
1193         if (!cap_caching_mode(iommu->cap) || !map)
1194                 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1195 }
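/*
 * Worked example (illustrative): flushing 5 pages gives
 * mask = ilog2(__roundup_pow_of_two(5)) = ilog2(8) = 3, so the PSI
 * invalidation covers 2^3 = 8 pages starting at the naturally aligned
 * base address; if mask exceeded cap_max_amask_val() the code above
 * would fall back to a domain-selective flush instead.
 */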
1196
1197 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1198 {
1199         u32 pmen;
1200         unsigned long flags;
1201
1202         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1203         pmen = readl(iommu->reg + DMAR_PMEN_REG);
1204         pmen &= ~DMA_PMEN_EPM;
1205         writel(pmen, iommu->reg + DMAR_PMEN_REG);
1206
1207         /* wait for the protected region status bit to clear */
1208         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1209                 readl, !(pmen & DMA_PMEN_PRS), pmen);
1210
1211         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1212 }
1213
1214 static int iommu_enable_translation(struct intel_iommu *iommu)
1215 {
1216         u32 sts;
1217         unsigned long flags;
1218
1219         raw_spin_lock_irqsave(&iommu->register_lock, flags);
1220         iommu->gcmd |= DMA_GCMD_TE;
1221         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1222
1223         /* Make sure hardware completes it */
1224         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1225                       readl, (sts & DMA_GSTS_TES), sts);
1226
1227         raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1228         return 0;
1229 }
1230
1231 static int iommu_disable_translation(struct intel_iommu *iommu)
1232 {
1233         u32 sts;
1234         unsigned long flag;
1235
1236         raw_spin_lock_irqsave(&iommu->register_lock, flag);
1237         iommu->gcmd &= ~DMA_GCMD_TE;
1238         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1239
1240         /* Make sure hardware completes it */
1241         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1242                       readl, (!(sts & DMA_GSTS_TES)), sts);
1243
1244         raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1245         return 0;
1246 }
1247
1248
1249 static int iommu_init_domains(struct intel_iommu *iommu)
1250 {
1251         unsigned long ndomains;
1252         unsigned long nlongs;
1253
1254         ndomains = cap_ndoms(iommu->cap);
1255         pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1256                         ndomains);
1257         nlongs = BITS_TO_LONGS(ndomains);
1258
1259         spin_lock_init(&iommu->lock);
1260
1261         /* TBD: there might be 64K domains,
1262          * consider other allocation for future chips
1263          */
1264         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1265         if (!iommu->domain_ids) {
1266                 printk(KERN_ERR "Allocating domain id array failed\n");
1267                 return -ENOMEM;
1268         }
1269         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1270                         GFP_KERNEL);
1271         if (!iommu->domains) {
1272                 printk(KERN_ERR "Allocating domain array failed\n");
1273                 return -ENOMEM;
1274         }
1275
1276         /*
1277          * If caching mode is set, then invalid translations are tagged
1278          * with domain id 0. Hence we need to pre-allocate it.
1279          */
1280         if (cap_caching_mode(iommu->cap))
1281                 set_bit(0, iommu->domain_ids);
1282         return 0;
1283 }
1284
1285
1286 static void domain_exit(struct dmar_domain *domain);
1287 static void vm_domain_exit(struct dmar_domain *domain);
1288
1289 void free_dmar_iommu(struct intel_iommu *iommu)
1290 {
1291         struct dmar_domain *domain;
1292         int i;
1293         unsigned long flags;
1294
1295         if ((iommu->domains) && (iommu->domain_ids)) {
1296                 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1297                         domain = iommu->domains[i];
1298                         clear_bit(i, iommu->domain_ids);
1299
1300                         spin_lock_irqsave(&domain->iommu_lock, flags);
1301                         if (--domain->iommu_count == 0) {
1302                                 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1303                                         vm_domain_exit(domain);
1304                                 else
1305                                         domain_exit(domain);
1306                         }
1307                         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1308                 }
1309         }
1310
1311         if (iommu->gcmd & DMA_GCMD_TE)
1312                 iommu_disable_translation(iommu);
1313
1314         if (iommu->irq) {
1315                 irq_set_handler_data(iommu->irq, NULL);
1316                 /* This will mask the irq */
1317                 free_irq(iommu->irq, iommu);
1318                 destroy_irq(iommu->irq);
1319         }
1320
1321         kfree(iommu->domains);
1322         kfree(iommu->domain_ids);
1323
1324         g_iommus[iommu->seq_id] = NULL;
1325
1326         /* if all iommus are freed, free g_iommus */
1327         for (i = 0; i < g_num_of_iommus; i++) {
1328                 if (g_iommus[i])
1329                         break;
1330         }
1331
1332         if (i == g_num_of_iommus)
1333                 kfree(g_iommus);
1334
1335         /* free context mapping */
1336         free_context_table(iommu);
1337 }
1338
1339 static struct dmar_domain *alloc_domain(void)
1340 {
1341         struct dmar_domain *domain;
1342
1343         domain = alloc_domain_mem();
1344         if (!domain)
1345                 return NULL;
1346
1347         domain->nid = -1;
1348         memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1349         domain->flags = 0;
1350
1351         return domain;
1352 }
1353
1354 static int iommu_attach_domain(struct dmar_domain *domain,
1355                                struct intel_iommu *iommu)
1356 {
1357         int num;
1358         unsigned long ndomains;
1359         unsigned long flags;
1360
1361         ndomains = cap_ndoms(iommu->cap);
1362
1363         spin_lock_irqsave(&iommu->lock, flags);
1364
1365         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1366         if (num >= ndomains) {
1367                 spin_unlock_irqrestore(&iommu->lock, flags);
1368                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1369                 return -ENOMEM;
1370         }
1371
1372         domain->id = num;
1373         set_bit(num, iommu->domain_ids);
1374         set_bit(iommu->seq_id, domain->iommu_bmp);
1375         iommu->domains[num] = domain;
1376         spin_unlock_irqrestore(&iommu->lock, flags);
1377
1378         return 0;
1379 }
1380
1381 static void iommu_detach_domain(struct dmar_domain *domain,
1382                                 struct intel_iommu *iommu)
1383 {
1384         unsigned long flags;
1385         int num, ndomains;
1386         int found = 0;
1387
1388         spin_lock_irqsave(&iommu->lock, flags);
1389         ndomains = cap_ndoms(iommu->cap);
1390         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1391                 if (iommu->domains[num] == domain) {
1392                         found = 1;
1393                         break;
1394                 }
1395         }
1396
1397         if (found) {
1398                 clear_bit(num, iommu->domain_ids);
1399                 clear_bit(iommu->seq_id, domain->iommu_bmp);
1400                 iommu->domains[num] = NULL;
1401         }
1402         spin_unlock_irqrestore(&iommu->lock, flags);
1403 }
1404
1405 static struct iova_domain reserved_iova_list;
1406 static struct lock_class_key reserved_rbtree_key;
1407
1408 static int dmar_init_reserved_ranges(void)
1409 {
1410         struct pci_dev *pdev = NULL;
1411         struct iova *iova;
1412         int i;
1413
1414         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1415
1416         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1417                 &reserved_rbtree_key);
1418
1419         /* IOAPIC ranges shouldn't be accessed by DMA */
1420         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1421                 IOVA_PFN(IOAPIC_RANGE_END));
1422         if (!iova) {
1423                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1424                 return -ENODEV;
1425         }
1426
1427         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1428         for_each_pci_dev(pdev) {
1429                 struct resource *r;
1430
1431                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1432                         r = &pdev->resource[i];
1433                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1434                                 continue;
1435                         iova = reserve_iova(&reserved_iova_list,
1436                                             IOVA_PFN(r->start),
1437                                             IOVA_PFN(r->end));
1438                         if (!iova) {
1439                                 printk(KERN_ERR "Reserve iova failed\n");
1440                                 return -ENODEV;
1441                         }
1442                 }
1443         }
1444         return 0;
1445 }
1446
1447 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1448 {
1449         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1450 }
1451
1452 static inline int guestwidth_to_adjustwidth(int gaw)
1453 {
1454         int agaw;
1455         int r = (gaw - 12) % 9;
1456
1457         if (r == 0)
1458                 agaw = gaw;
1459         else
1460                 agaw = gaw + 9 - r;
1461         if (agaw > 64)
1462                 agaw = 64;
1463         return agaw;
1464 }
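/*
 * Worked example (illustrative): guestwidth_to_adjustwidth(36) computes
 * r = (36 - 12) % 9 = 6 and returns 36 + 9 - 6 = 39, i.e. a 36-bit guest
 * width is rounded up to the next width of the form 12 + 9 * n that the
 * page-table format can express; 48 is already such a width and is
 * returned unchanged.
 */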
1465
1466 static int domain_init(struct dmar_domain *domain, int guest_width)
1467 {
1468         struct intel_iommu *iommu;
1469         int adjust_width, agaw;
1470         unsigned long sagaw;
1471
1472         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1473         spin_lock_init(&domain->iommu_lock);
1474
1475         domain_reserve_special_ranges(domain);
1476
1477         /* calculate AGAW */
1478         iommu = domain_get_iommu(domain);
1479         if (guest_width > cap_mgaw(iommu->cap))
1480                 guest_width = cap_mgaw(iommu->cap);
1481         domain->gaw = guest_width;
1482         adjust_width = guestwidth_to_adjustwidth(guest_width);
1483         agaw = width_to_agaw(adjust_width);
1484         sagaw = cap_sagaw(iommu->cap);
1485         if (!test_bit(agaw, &sagaw)) {
1486                 /* hardware doesn't support it, choose a bigger one */
1487                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1488                 agaw = find_next_bit(&sagaw, 5, agaw);
1489                 if (agaw >= 5)
1490                         return -ENODEV;
1491         }
1492         domain->agaw = agaw;
1493         INIT_LIST_HEAD(&domain->devices);
1494
1495         if (ecap_coherent(iommu->ecap))
1496                 domain->iommu_coherency = 1;
1497         else
1498                 domain->iommu_coherency = 0;
1499
1500         if (ecap_sc_support(iommu->ecap))
1501                 domain->iommu_snooping = 1;
1502         else
1503                 domain->iommu_snooping = 0;
1504
1505         domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1506         domain->iommu_count = 1;
1507         domain->nid = iommu->node;
1508
1509         /* always allocate the top pgd */
1510         domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1511         if (!domain->pgd)
1512                 return -ENOMEM;
1513         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1514         return 0;
1515 }
1516
1517 static void domain_exit(struct dmar_domain *domain)
1518 {
1519         struct dmar_drhd_unit *drhd;
1520         struct intel_iommu *iommu;
1521
1522         /* Domain 0 is reserved, so don't process it */
1523         if (!domain)
1524                 return;
1525
1526         /* Flush any lazy unmaps that may reference this domain */
1527         if (!intel_iommu_strict)
1528                 flush_unmaps_timeout(0);
1529
1530         domain_remove_dev_info(domain);
1531         /* destroy iovas */
1532         put_iova_domain(&domain->iovad);
1533
1534         /* clear ptes */
1535         dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1536
1537         /* free page tables */
1538         dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1539
1540         for_each_active_iommu(iommu, drhd)
1541                 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1542                         iommu_detach_domain(domain, iommu);
1543
1544         free_domain_mem(domain);
1545 }
1546
1547 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1548                                  u8 bus, u8 devfn, int translation)
1549 {
1550         struct context_entry *context;
1551         unsigned long flags;
1552         struct intel_iommu *iommu;
1553         struct dma_pte *pgd;
1554         unsigned long num;
1555         unsigned long ndomains;
1556         int id;
1557         int agaw;
1558         struct device_domain_info *info = NULL;
1559
1560         pr_debug("Set context mapping for %02x:%02x.%d\n",
1561                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1562
1563         BUG_ON(!domain->pgd);
1564         BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1565                translation != CONTEXT_TT_MULTI_LEVEL);
1566
1567         iommu = device_to_iommu(segment, bus, devfn);
1568         if (!iommu)
1569                 return -ENODEV;
1570
1571         context = device_to_context_entry(iommu, bus, devfn);
1572         if (!context)
1573                 return -ENOMEM;
1574         spin_lock_irqsave(&iommu->lock, flags);
1575         if (context_present(context)) {
1576                 spin_unlock_irqrestore(&iommu->lock, flags);
1577                 return 0;
1578         }
1579
1580         id = domain->id;
1581         pgd = domain->pgd;
1582
1583         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1584             domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1585                 int found = 0;
1586
1587                 /* find an available domain id for this device in iommu */
1588                 ndomains = cap_ndoms(iommu->cap);
1589                 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1590                         if (iommu->domains[num] == domain) {
1591                                 id = num;
1592                                 found = 1;
1593                                 break;
1594                         }
1595                 }
1596
1597                 if (found == 0) {
1598                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1599                         if (num >= ndomains) {
1600                                 spin_unlock_irqrestore(&iommu->lock, flags);
1601                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1602                                 return -EFAULT;
1603                         }
1604
1605                         set_bit(num, iommu->domain_ids);
1606                         iommu->domains[num] = domain;
1607                         id = num;
1608                 }
1609
1610                 /* Skip top levels of page tables for an
1611                  * iommu which has a smaller agaw than the default.
1612                  * Unnecessary for PT mode.
1613                  */
1614                 if (translation != CONTEXT_TT_PASS_THROUGH) {
1615                         for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1616                                 pgd = phys_to_virt(dma_pte_addr(pgd));
1617                                 if (!dma_pte_present(pgd)) {
1618                                         spin_unlock_irqrestore(&iommu->lock, flags);
1619                                         return -ENOMEM;
1620                                 }
1621                         }
1622                 }
1623         }
1624
1625         context_set_domain_id(context, id);
1626
1627         if (translation != CONTEXT_TT_PASS_THROUGH) {
1628                 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1629                 translation = info ? CONTEXT_TT_DEV_IOTLB :
1630                                      CONTEXT_TT_MULTI_LEVEL;
1631         }
1632         /*
1633          * In pass through mode, AW must be programmed to indicate the largest
1634          * AGAW value supported by hardware. And ASR is ignored by hardware.
1635          */
1636         if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1637                 context_set_address_width(context, iommu->msagaw);
1638         else {
1639                 context_set_address_root(context, virt_to_phys(pgd));
1640                 context_set_address_width(context, iommu->agaw);
1641         }
1642
1643         context_set_translation_type(context, translation);
1644         context_set_fault_enable(context);
1645         context_set_present(context);
1646         domain_flush_cache(domain, context, sizeof(*context));
1647
1648         /*
1649          * It's a non-present to present mapping. If hardware doesn't cache
1650          * non-present entries we only need to flush the write-buffer. If it
1651          * _does_ cache non-present entries, then it does so in the special
1652          * domain #0, which we have to flush:
1653          */
1654         if (cap_caching_mode(iommu->cap)) {
1655                 iommu->flush.flush_context(iommu, 0,
1656                                            (((u16)bus) << 8) | devfn,
1657                                            DMA_CCMD_MASK_NOBIT,
1658                                            DMA_CCMD_DEVICE_INVL);
1659                 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1660         } else {
1661                 iommu_flush_write_buffer(iommu);
1662         }
1663         iommu_enable_dev_iotlb(info);
1664         spin_unlock_irqrestore(&iommu->lock, flags);
1665
1666         spin_lock_irqsave(&domain->iommu_lock, flags);
1667         if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1668                 domain->iommu_count++;
1669                 if (domain->iommu_count == 1)
1670                         domain->nid = iommu->node;
1671                 domain_update_iommu_cap(domain);
1672         }
1673         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1674         return 0;
1675 }
1676
1677 static int
1678 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1679                         int translation)
1680 {
1681         int ret;
1682         struct pci_dev *tmp, *parent;
1683
1684         ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1685                                          pdev->bus->number, pdev->devfn,
1686                                          translation);
1687         if (ret)
1688                 return ret;
1689
1690         /* dependent device mapping */
1691         tmp = pci_find_upstream_pcie_bridge(pdev);
1692         if (!tmp)
1693                 return 0;
1694         /* Secondary interface's bus number and devfn 0 */
1695         parent = pdev->bus->self;
1696         while (parent != tmp) {
1697                 ret = domain_context_mapping_one(domain,
1698                                                  pci_domain_nr(parent->bus),
1699                                                  parent->bus->number,
1700                                                  parent->devfn, translation);
1701                 if (ret)
1702                         return ret;
1703                 parent = parent->bus->self;
1704         }
1705         if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1706                 return domain_context_mapping_one(domain,
1707                                         pci_domain_nr(tmp->subordinate),
1708                                         tmp->subordinate->number, 0,
1709                                         translation);
1710         else /* this is a legacy PCI bridge */
1711                 return domain_context_mapping_one(domain,
1712                                                   pci_domain_nr(tmp->bus),
1713                                                   tmp->bus->number,
1714                                                   tmp->devfn,
1715                                                   translation);
1716 }
1717
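/*
 * Check whether the device -- and, if it sits behind a PCIe-to-PCI
 * bridge, every bridge on the path up to it -- already has a context
 * entry programmed on its IOMMU.
 */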
1718 static int domain_context_mapped(struct pci_dev *pdev)
1719 {
1720         int ret;
1721         struct pci_dev *tmp, *parent;
1722         struct intel_iommu *iommu;
1723
1724         iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1725                                 pdev->devfn);
1726         if (!iommu)
1727                 return -ENODEV;
1728
1729         ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1730         if (!ret)
1731                 return ret;
1732         /* dependent device mapping */
1733         tmp = pci_find_upstream_pcie_bridge(pdev);
1734         if (!tmp)
1735                 return ret;
1736         /* Secondary interface's bus number and devfn 0 */
1737         parent = pdev->bus->self;
1738         while (parent != tmp) {
1739                 ret = device_context_mapped(iommu, parent->bus->number,
1740                                             parent->devfn);
1741                 if (!ret)
1742                         return ret;
1743                 parent = parent->bus->self;
1744         }
1745         if (pci_is_pcie(tmp))
1746                 return device_context_mapped(iommu, tmp->subordinate->number,
1747                                              0);
1748         else
1749                 return device_context_mapped(iommu, tmp->bus->number,
1750                                              tmp->devfn);
1751 }
1752
1753 /* Returns a number of VTD pages, but aligned to MM page size */
1754 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1755                                             size_t size)
1756 {
1757         host_addr &= ~PAGE_MASK;
1758         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1759 }
1760
1761 /* Return largest possible superpage level for a given mapping */
1762 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1763                                           unsigned long iov_pfn,
1764                                           unsigned long phy_pfn,
1765                                           unsigned long pages)
1766 {
1767         int support, level = 1;
1768         unsigned long pfnmerge;
1769
1770         support = domain->iommu_superpage;
1771
1772         /* To use a large page, the virtual *and* physical addresses
1773            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1774            of them will mean we have to use smaller pages. So just
1775            merge them and check both at once. */
1776         pfnmerge = iov_pfn | phy_pfn;
1777
1778         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1779                 pages >>= VTD_STRIDE_SHIFT;
1780                 if (!pages)
1781                         break;
1782                 pfnmerge >>= VTD_STRIDE_SHIFT;
1783                 level++;
1784                 support--;
1785         }
1786         return level;
1787 }
1788
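/*
 * Core mapping loop: populate PTEs for nr_pages starting at iov_pfn,
 * taking the physical target either from a scatterlist or from a
 * contiguous pfn range, and using superpages whenever alignment and
 * the remaining length allow it.
 */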
1789 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1790                             struct scatterlist *sg, unsigned long phys_pfn,
1791                             unsigned long nr_pages, int prot)
1792 {
1793         struct dma_pte *first_pte = NULL, *pte = NULL;
1794         phys_addr_t uninitialized_var(pteval);
1795         int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1796         unsigned long sg_res;
1797         unsigned int largepage_lvl = 0;
1798         unsigned long lvl_pages = 0;
1799
1800         BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1801
1802         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1803                 return -EINVAL;
1804
1805         prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1806
1807         if (sg)
1808                 sg_res = 0;
1809         else {
1810                 sg_res = nr_pages + 1;
1811                 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1812         }
1813
1814         while (nr_pages > 0) {
1815                 uint64_t tmp;
1816
1817                 if (!sg_res) {
1818                         sg_res = aligned_nrpages(sg->offset, sg->length);
1819                         sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1820                         sg->dma_length = sg->length;
1821                         pteval = page_to_phys(sg_page(sg)) | prot;
1822                         phys_pfn = pteval >> VTD_PAGE_SHIFT;
1823                 }
1824
1825                 if (!pte) {
1826                         largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1827
1828                         first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1829                         if (!pte)
1830                                 return -ENOMEM;
1831                         /* It is a large page */
1832                         if (largepage_lvl > 1) {
1833                                 pteval |= DMA_PTE_LARGE_PAGE;
1834                                 /* Ensure that old small page tables are removed to make room
1835                                    for superpage, if they exist. */
1836                                 dma_pte_clear_range(domain, iov_pfn,
1837                                                     iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1838                                 dma_pte_free_pagetable(domain, iov_pfn,
1839                                                        iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1840                         } else {
1841                                 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1842                         }
1843
1844                 }
1845                 /* We don't need a lock here, nobody else
1846                  * touches the iova range
1847                  */
1848                 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1849                 if (tmp) {
1850                         static int dumps = 5;
1851                         printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1852                                iov_pfn, tmp, (unsigned long long)pteval);
1853                         if (dumps) {
1854                                 dumps--;
1855                                 debug_dma_dump_mappings(NULL);
1856                         }
1857                         WARN_ON(1);
1858                 }
1859
1860                 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1861
1862                 BUG_ON(nr_pages < lvl_pages);
1863                 BUG_ON(sg_res < lvl_pages);
1864
1865                 nr_pages -= lvl_pages;
1866                 iov_pfn += lvl_pages;
1867                 phys_pfn += lvl_pages;
1868                 pteval += lvl_pages * VTD_PAGE_SIZE;
1869                 sg_res -= lvl_pages;
1870
1871                 /* If the next PTE would be the first in a new page, then we
1872                    need to flush the cache on the entries we've just written.
1873                    And then we'll need to recalculate 'pte', so clear it and
1874                    let it get set again in the if (!pte) block above.
1875
1876                    If we're done (!nr_pages) we need to flush the cache too.
1877
1878                    Also if we've been setting superpages, we may need to
1879                    recalculate 'pte' and switch back to smaller pages for the
1880                    end of the mapping, if the trailing size is not enough to
1881                    use another superpage (i.e. sg_res < lvl_pages). */
1882                 pte++;
1883                 if (!nr_pages || first_pte_in_page(pte) ||
1884                     (largepage_lvl > 1 && sg_res < lvl_pages)) {
1885                         domain_flush_cache(domain, first_pte,
1886                                            (void *)pte - (void *)first_pte);
1887                         pte = NULL;
1888                 }
1889
1890                 if (!sg_res && nr_pages)
1891                         sg = sg_next(sg);
1892         }
1893         return 0;
1894 }
1895
1896 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1897                                     struct scatterlist *sg, unsigned long nr_pages,
1898                                     int prot)
1899 {
1900         return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1901 }
1902
1903 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1904                                      unsigned long phys_pfn, unsigned long nr_pages,
1905                                      int prot)
1906 {
1907         return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1908 }
1909
1910 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1911 {
1912         if (!iommu)
1913                 return;
1914
1915         clear_context_table(iommu, bus, devfn);
1916         iommu->flush.flush_context(iommu, 0, 0, 0,
1917                                            DMA_CCMD_GLOBAL_INVL);
1918         iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1919 }
1920
1921 static inline void unlink_domain_info(struct device_domain_info *info)
1922 {
1923         assert_spin_locked(&device_domain_lock);
1924         list_del(&info->link);
1925         list_del(&info->global);
1926         if (info->dev)
1927                 info->dev->dev.archdata.iommu = NULL;
1928 }
1929
1930 static void domain_remove_dev_info(struct dmar_domain *domain)
1931 {
1932         struct device_domain_info *info;
1933         unsigned long flags;
1934         struct intel_iommu *iommu;
1935
1936         spin_lock_irqsave(&device_domain_lock, flags);
1937         while (!list_empty(&domain->devices)) {
1938                 info = list_entry(domain->devices.next,
1939                         struct device_domain_info, link);
1940                 unlink_domain_info(info);
1941                 spin_unlock_irqrestore(&device_domain_lock, flags);
1942
1943                 iommu_disable_dev_iotlb(info);
1944                 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1945                 iommu_detach_dev(iommu, info->bus, info->devfn);
1946                 free_devinfo_mem(info);
1947
1948                 spin_lock_irqsave(&device_domain_lock, flags);
1949         }
1950         spin_unlock_irqrestore(&device_domain_lock, flags);
1951 }
1952
1953 /*
1954  * find_domain
1955  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1956  */
1957 static struct dmar_domain *
1958 find_domain(struct pci_dev *pdev)
1959 {
1960         struct device_domain_info *info;
1961
1962         /* No lock here, assumes no domain exit in normal case */
1963         info = pdev->dev.archdata.iommu;
1964         if (info)
1965                 return info->domain;
1966         return NULL;
1967 }
1968
1969 /* Find or allocate an initialized domain for the device */
1970 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1971 {
1972         struct dmar_domain *domain, *found = NULL;
1973         struct intel_iommu *iommu;
1974         struct dmar_drhd_unit *drhd;
1975         struct device_domain_info *info, *tmp;
1976         struct pci_dev *dev_tmp;
1977         unsigned long flags;
1978         int bus = 0, devfn = 0;
1979         int segment;
1980         int ret;
1981
1982         domain = find_domain(pdev);
1983         if (domain)
1984                 return domain;
1985
1986         segment = pci_domain_nr(pdev->bus);
1987
1988         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1989         if (dev_tmp) {
1990                 if (pci_is_pcie(dev_tmp)) {
1991                         bus = dev_tmp->subordinate->number;
1992                         devfn = 0;
1993                 } else {
1994                         bus = dev_tmp->bus->number;
1995                         devfn = dev_tmp->devfn;
1996                 }
1997                 spin_lock_irqsave(&device_domain_lock, flags);
1998                 list_for_each_entry(info, &device_domain_list, global) {
1999                         if (info->segment == segment &&
2000                             info->bus == bus && info->devfn == devfn) {
2001                                 found = info->domain;
2002                                 break;
2003                         }
2004                 }
2005                 spin_unlock_irqrestore(&device_domain_lock, flags);
2006                 /* pcie-pci bridge already has a domain, use it */
2007                 if (found) {
2008                         domain = found;
2009                         goto found_domain;
2010                 }
2011         }
2012
2013         domain = alloc_domain();
2014         if (!domain)
2015                 goto error;
2016
2017         /* Allocate new domain for the device */
2018         drhd = dmar_find_matched_drhd_unit(pdev);
2019         if (!drhd) {
2020                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2021                         pci_name(pdev));
2022                 free_domain_mem(domain);
2023                 return NULL;
2024         }
2025         iommu = drhd->iommu;
2026
2027         ret = iommu_attach_domain(domain, iommu);
2028         if (ret) {
2029                 free_domain_mem(domain);
2030                 goto error;
2031         }
2032
2033         if (domain_init(domain, gaw)) {
2034                 domain_exit(domain);
2035                 goto error;
2036         }
2037
2038         /* register pcie-to-pci device */
2039         if (dev_tmp) {
2040                 info = alloc_devinfo_mem();
2041                 if (!info) {
2042                         domain_exit(domain);
2043                         goto error;
2044                 }
2045                 info->segment = segment;
2046                 info->bus = bus;
2047                 info->devfn = devfn;
2048                 info->dev = NULL;
2049                 info->domain = domain;
2050                 /* This domain is shared by devices under p2p bridge */
2051                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2052
2053                 /* pcie-to-pci bridge already has a domain, use it */
2054                 found = NULL;
2055                 spin_lock_irqsave(&device_domain_lock, flags);
2056                 list_for_each_entry(tmp, &device_domain_list, global) {
2057                         if (tmp->segment == segment &&
2058                             tmp->bus == bus && tmp->devfn == devfn) {
2059                                 found = tmp->domain;
2060                                 break;
2061                         }
2062                 }
2063                 if (found) {
2064                         spin_unlock_irqrestore(&device_domain_lock, flags);
2065                         free_devinfo_mem(info);
2066                         domain_exit(domain);
2067                         domain = found;
2068                 } else {
2069                         list_add(&info->link, &domain->devices);
2070                         list_add(&info->global, &device_domain_list);
2071                         spin_unlock_irqrestore(&device_domain_lock, flags);
2072                 }
2073         }
2074
2075 found_domain:
2076         info = alloc_devinfo_mem();
2077         if (!info)
2078                 goto error;
2079         info->segment = segment;
2080         info->bus = pdev->bus->number;
2081         info->devfn = pdev->devfn;
2082         info->dev = pdev;
2083         info->domain = domain;
2084         spin_lock_irqsave(&device_domain_lock, flags);
2085         /* somebody else raced us and already set up the domain */
2086         found = find_domain(pdev);
2087         if (found != NULL) {
2088                 spin_unlock_irqrestore(&device_domain_lock, flags);
2089                 if (found != domain) {
2090                         domain_exit(domain);
2091                         domain = found;
2092                 }
2093                 free_devinfo_mem(info);
2094                 return domain;
2095         }
2096         list_add(&info->link, &domain->devices);
2097         list_add(&info->global, &device_domain_list);
2098         pdev->dev.archdata.iommu = info;
2099         spin_unlock_irqrestore(&device_domain_lock, flags);
2100         return domain;
2101 error:
2102         /* recheck it here, maybe others set it */
2103         return find_domain(pdev);
2104 }
2105
2106 static int iommu_identity_mapping;
2107 #define IDENTMAP_ALL            1
2108 #define IDENTMAP_GFX            2
2109 #define IDENTMAP_AZALIA         4
2110
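/*
 * Reserve the iova range covering [start, end] and install a 1:1
 * (virtual == physical) mapping for it in the given domain.
 */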
2111 static int iommu_domain_identity_map(struct dmar_domain *domain,
2112                                      unsigned long long start,
2113                                      unsigned long long end)
2114 {
2115         unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2116         unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2117
2118         if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2119                           dma_to_mm_pfn(last_vpfn))) {
2120                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2121                 return -ENOMEM;
2122         }
2123
2124         pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2125                  start, end, domain->id);
2126         /*
2127          * RMRR range might have overlap with physical memory range,
2128          * clear it first
2129          */
2130         dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2131
2132         return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2133                                   last_vpfn - first_vpfn + 1,
2134                                   DMA_PTE_READ|DMA_PTE_WRITE);
2135 }
2136
2137 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2138                                       unsigned long long start,
2139                                       unsigned long long end)
2140 {
2141         struct dmar_domain *domain;
2142         int ret;
2143
2144         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2145         if (!domain)
2146                 return -ENOMEM;
2147
2148         /* For _hardware_ passthrough, don't bother. But for software
2149            passthrough, we do it anyway -- it may indicate a memory
2150            range which is reserved in E820 and so didn't get set
2151            up to start with in si_domain */
2152         if (domain == si_domain && hw_pass_through) {
2153                 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2154                        pci_name(pdev), start, end);
2155                 return 0;
2156         }
2157
2158         printk(KERN_INFO
2159                "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2160                pci_name(pdev), start, end);
2161
2162         if (end < start) {
2163                 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2164                         "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2165                         dmi_get_system_info(DMI_BIOS_VENDOR),
2166                         dmi_get_system_info(DMI_BIOS_VERSION),
2167                         dmi_get_system_info(DMI_PRODUCT_VERSION));
2168                 ret = -EIO;
2169                 goto error;
2170         }
2171
2172         if (end >> agaw_to_width(domain->agaw)) {
2173                 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2174                      "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2175                      agaw_to_width(domain->agaw),
2176                      dmi_get_system_info(DMI_BIOS_VENDOR),
2177                      dmi_get_system_info(DMI_BIOS_VERSION),
2178                      dmi_get_system_info(DMI_PRODUCT_VERSION));
2179                 ret = -EIO;
2180                 goto error;
2181         }
2182
2183         ret = iommu_domain_identity_map(domain, start, end);
2184         if (ret)
2185                 goto error;
2186
2187         /* context entry init */
2188         ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2189         if (ret)
2190                 goto error;
2191
2192         return 0;
2193
2194  error:
2195         domain_exit(domain);
2196         return ret;
2197 }
2198
2199 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2200         struct pci_dev *pdev)
2201 {
2202         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2203                 return 0;
2204         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2205                 rmrr->end_address);
2206 }
2207
2208 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2209 static inline void iommu_prepare_isa(void)
2210 {
2211         struct pci_dev *pdev;
2212         int ret;
2213
2214         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2215         if (!pdev)
2216                 return;
2217
2218         printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2219         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2220
2221         if (ret)
2222                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2223                        "floppy might not work\n");
2224
2225 }
2226 #else
2227 static inline void iommu_prepare_isa(void)
2228 {
2229         return;
2230 }
2231 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2232
2233 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2234
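/*
 * Set up the static identity domain (si_domain): attach it to every
 * active IOMMU and, unless hardware passthrough is in use, identity-map
 * every usable memory range of every online node.
 */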
2235 static int __init si_domain_init(int hw)
2236 {
2237         struct dmar_drhd_unit *drhd;
2238         struct intel_iommu *iommu;
2239         int nid, ret = 0;
2240
2241         si_domain = alloc_domain();
2242         if (!si_domain)
2243                 return -EFAULT;
2244
2245         pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2246
2247         for_each_active_iommu(iommu, drhd) {
2248                 ret = iommu_attach_domain(si_domain, iommu);
2249                 if (ret) {
2250                         domain_exit(si_domain);
2251                         return -EFAULT;
2252                 }
2253         }
2254
2255         if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2256                 domain_exit(si_domain);
2257                 return -EFAULT;
2258         }
2259
2260         si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2261
2262         if (hw)
2263                 return 0;
2264
2265         for_each_online_node(nid) {
2266                 unsigned long start_pfn, end_pfn;
2267                 int i;
2268
2269                 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2270                         ret = iommu_domain_identity_map(si_domain,
2271                                         PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2272                         if (ret)
2273                                 return ret;
2274                 }
2275         }
2276
2277         return 0;
2278 }
2279
2280 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2281                                           struct pci_dev *pdev);
2282 static int identity_mapping(struct pci_dev *pdev)
2283 {
2284         struct device_domain_info *info;
2285
2286         if (likely(!iommu_identity_mapping))
2287                 return 0;
2288
2289         info = pdev->dev.archdata.iommu;
2290         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2291                 return (info->domain == si_domain);
2292
2293         return 0;
2294 }
2295
2296 static int domain_add_dev_info(struct dmar_domain *domain,
2297                                struct pci_dev *pdev,
2298                                int translation)
2299 {
2300         struct device_domain_info *info;
2301         unsigned long flags;
2302         int ret;
2303
2304         info = alloc_devinfo_mem();
2305         if (!info)
2306                 return -ENOMEM;
2307
2308         info->segment = pci_domain_nr(pdev->bus);
2309         info->bus = pdev->bus->number;
2310         info->devfn = pdev->devfn;
2311         info->dev = pdev;
2312         info->domain = domain;
2313
2314         spin_lock_irqsave(&device_domain_lock, flags);
2315         list_add(&info->link, &domain->devices);
2316         list_add(&info->global, &device_domain_list);
2317         pdev->dev.archdata.iommu = info;
2318         spin_unlock_irqrestore(&device_domain_lock, flags);
2319
2320         ret = domain_context_mapping(domain, pdev, translation);
2321         if (ret) {
2322                 spin_lock_irqsave(&device_domain_lock, flags);
2323                 unlink_domain_info(info);
2324                 spin_unlock_irqrestore(&device_domain_lock, flags);
2325                 free_devinfo_mem(info);
2326                 return ret;
2327         }
2328
2329         return 0;
2330 }
2331
2332 static bool device_has_rmrr(struct pci_dev *dev)
2333 {
2334         struct dmar_rmrr_unit *rmrr;
2335         int i;
2336
2337         for_each_rmrr_units(rmrr) {
2338                 for (i = 0; i < rmrr->devices_cnt; i++) {
2339                         /*
2340                          * Return TRUE if this RMRR contains the device that
2341                          * is passed in.
2342                          */
2343                         if (rmrr->devices[i] == dev)
2344                                 return true;
2345                 }
2346         }
2347         return false;
2348 }
2349
2350 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2351 {
2352
2353         /*
2354          * We want to prevent any device associated with an RMRR from
2355          * getting placed into the SI Domain. This is done because
2356          * problems exist when devices are moved in and out of domains
2357          * and their respective RMRR info is lost. We exempt USB devices
2358          * from this process due to their usage of RMRRs that are known
2359          * to not be needed after BIOS hand-off to OS.
2360          */
2361         if (device_has_rmrr(pdev) &&
2362             (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2363                 return 0;
2364
2365         if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2366                 return 1;
2367
2368         if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2369                 return 1;
2370
2371         if (!(iommu_identity_mapping & IDENTMAP_ALL))
2372                 return 0;
2373
2374         /*
2375          * We want to start off with all devices in the 1:1 domain, and
2376          * take them out later if we find they can't access all of memory.
2377          *
2378          * However, we can't do this for PCI devices behind bridges,
2379          * because all PCI devices behind the same bridge will end up
2380          * with the same source-id on their transactions.
2381          *
2382          * Practically speaking, we can't change things around for these
2383          * devices at run-time, because we can't be sure there'll be no
2384          * DMA transactions in flight for any of their siblings.
2385          * 
2386          * So PCI devices (unless they're on the root bus) as well as
2387          * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2388          * the 1:1 domain, just in _case_ one of their siblings turns out
2389          * not to be able to map all of memory.
2390          */
2391         if (!pci_is_pcie(pdev)) {
2392                 if (!pci_is_root_bus(pdev->bus))
2393                         return 0;
2394                 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2395                         return 0;
2396         } else if (pci_pcie_type(pdev) == PCI_EXP_TYPE_PCI_BRIDGE)
2397                 return 0;
2398
2399         /* 
2400          * At boot time, we don't yet know if devices will be 64-bit capable.
2401          * Assume that they will -- if they turn out not to be, then we can 
2402          * take them out of the 1:1 domain later.
2403          */
2404         if (!startup) {
2405                 /*
2406                  * If the device's dma_mask is less than the system's memory
2407                  * size then this is not a candidate for identity mapping.
2408                  */
2409                 u64 dma_mask = pdev->dma_mask;
2410
2411                 if (pdev->dev.coherent_dma_mask &&
2412                     pdev->dev.coherent_dma_mask < dma_mask)
2413                         dma_mask = pdev->dev.coherent_dma_mask;
2414
2415                 return dma_mask >= dma_get_required_mask(&pdev->dev);
2416         }
2417
2418         return 1;
2419 }
2420
2421 static int __init iommu_prepare_static_identity_mapping(int hw)
2422 {
2423         struct pci_dev *pdev = NULL;
2424         int ret;
2425
2426         ret = si_domain_init(hw);
2427         if (ret)
2428                 return -EFAULT;
2429
2430         for_each_pci_dev(pdev) {
2431                 if (iommu_should_identity_map(pdev, 1)) {
2432                         ret = domain_add_dev_info(si_domain, pdev,
2433                                              hw ? CONTEXT_TT_PASS_THROUGH :
2434                                                   CONTEXT_TT_MULTI_LEVEL);
2435                         if (ret) {
2436                                 /* device not associated with an iommu */
2437                                 if (ret == -ENODEV)
2438                                         continue;
2439                                 return ret;
2440                         }
2441                         pr_info("IOMMU: %s identity mapping for device %s\n",
2442                                 hw ? "hardware" : "software", pci_name(pdev));
2443                 }
2444         }
2445
2446         return 0;
2447 }
2448
2449 static int __init init_dmars(void)
2450 {
2451         struct dmar_drhd_unit *drhd;
2452         struct dmar_rmrr_unit *rmrr;
2453         struct pci_dev *pdev;
2454         struct intel_iommu *iommu;
2455         int i, ret;
2456
2457         /*
2458          * for each drhd
2459          *    allocate root
2460          *    initialize and program root entry to not present
2461          * endfor
2462          */
2463         for_each_drhd_unit(drhd) {
2464                 /*
2465                  * lock not needed as this is only incremented in the single
2466                  * threaded kernel __init code path; all other accesses are
2467                  * read only
2468                  */
2469                 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2470                         g_num_of_iommus++;
2471                         continue;
2472                 }
2473                 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2474                           IOMMU_UNITS_SUPPORTED);
2475         }
2476
2477         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2478                         GFP_KERNEL);
2479         if (!g_iommus) {
2480                 printk(KERN_ERR "Allocating global iommu array failed\n");
2481                 ret = -ENOMEM;
2482                 goto error;
2483         }
2484
2485         deferred_flush = kzalloc(g_num_of_iommus *
2486                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2487         if (!deferred_flush) {
2488                 ret = -ENOMEM;
2489                 goto error;
2490         }
2491
2492         for_each_drhd_unit(drhd) {
2493                 if (drhd->ignored)
2494                         continue;
2495
2496                 iommu = drhd->iommu;
2497                 g_iommus[iommu->seq_id] = iommu;
2498
2499                 ret = iommu_init_domains(iommu);
2500                 if (ret)
2501                         goto error;
2502
2503                 /*
2504                  * TBD:
2505                  * we could share the same root & context tables
2506                  * among all IOMMU's. Need to Split it later.
2507                  * among all IOMMUs. Need to split it later.
2508                 ret = iommu_alloc_root_entry(iommu);
2509                 if (ret) {
2510                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2511                         goto error;
2512                 }
2513                 if (!ecap_pass_through(iommu->ecap))
2514                         hw_pass_through = 0;
2515         }
2516
2517         /*
2518          * Start from a sane iommu hardware state.
2519          */
2520         for_each_drhd_unit(drhd) {
2521                 if (drhd->ignored)
2522                         continue;
2523
2524                 iommu = drhd->iommu;
2525
2526                 /*
2527                  * If the queued invalidation is already initialized by us
2528                  * (for example, while enabling interrupt-remapping) then
2529                  * we got the things already rolling from a sane state.
2530                  */
2531                 if (iommu->qi)
2532                         continue;
2533
2534                 /*
2535                  * Clear any previous faults.
2536                  */
2537                 dmar_fault(-1, iommu);
2538                 /*
2539                  * Disable queued invalidation if supported and already enabled
2540                  * before OS handover.
2541                  */
2542                 dmar_disable_qi(iommu);
2543         }
2544
2545         for_each_drhd_unit(drhd) {
2546                 if (drhd->ignored)
2547                         continue;
2548
2549                 iommu = drhd->iommu;
2550
2551                 if (dmar_enable_qi(iommu)) {
2552                         /*
2553                          * Queued Invalidate not enabled, use Register Based
2554                          * Invalidate
2555                          */
2556                         iommu->flush.flush_context = __iommu_flush_context;
2557                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2558                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2559                                "invalidation\n",
2560                                 iommu->seq_id,
2561                                (unsigned long long)drhd->reg_base_addr);
2562                 } else {
2563                         iommu->flush.flush_context = qi_flush_context;
2564                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2565                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2566                                "invalidation\n",
2567                                 iommu->seq_id,
2568                                (unsigned long long)drhd->reg_base_addr);
2569                 }
2570         }
2571
2572         if (iommu_pass_through)
2573                 iommu_identity_mapping |= IDENTMAP_ALL;
2574
2575 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2576         iommu_identity_mapping |= IDENTMAP_GFX;
2577 #endif
2578
2579         check_tylersburg_isoch();
2580
2581         /*
2582          * If pass through is not set or not enabled, set up context entries
2583          * for identity mappings for rmrr, gfx and isa, possibly falling back
2584          * to static identity mapping if iommu_identity_mapping is set.
2585          */
2586         if (iommu_identity_mapping) {
2587                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2588                 if (ret) {
2589                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2590                         goto error;
2591                 }
2592         }
2593         /*
2594          * For each rmrr
2595          *   for each dev attached to rmrr
2596          *   do
2597          *     locate drhd for dev, alloc domain for dev
2598          *     allocate free domain
2599          *     allocate page table entries for rmrr
2600          *     if context not allocated for bus
2601          *           allocate and init context
2602          *           set present in root table for this bus
2603          *     init context with domain, translation etc
2604          *    endfor
2605          * endfor
2606          */
2607         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2608         for_each_rmrr_units(rmrr) {
2609                 for (i = 0; i < rmrr->devices_cnt; i++) {
2610                         pdev = rmrr->devices[i];
2611                         /*
2612                          * some BIOSes list non-existent devices in the
2613                          * DMAR table.
2614                          */
2615                         if (!pdev)
2616                                 continue;
2617                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2618                         if (ret)
2619                                 printk(KERN_ERR
2620                                        "IOMMU: mapping reserved region failed\n");
2621                 }
2622         }
2623
2624         iommu_prepare_isa();
2625
2626         /*
2627          * for each drhd
2628          *   enable fault log
2629          *   global invalidate context cache
2630          *   global invalidate iotlb
2631          *   enable translation
2632          */
2633         for_each_drhd_unit(drhd) {
2634                 if (drhd->ignored) {
2635                         /*
2636                          * we always have to disable PMRs or DMA may fail on
2637                          * this device
2638                          */
2639                         if (force_on)
2640                                 iommu_disable_protect_mem_regions(drhd->iommu);
2641                         continue;
2642                 }
2643                 iommu = drhd->iommu;
2644
2645                 iommu_flush_write_buffer(iommu);
2646
2647                 ret = dmar_set_interrupt(iommu);
2648                 if (ret)
2649                         goto error;
2650
2651                 iommu_set_root_entry(iommu);
2652
2653                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2654                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2655
2656                 ret = iommu_enable_translation(iommu);
2657                 if (ret)
2658                         goto error;
2659
2660                 iommu_disable_protect_mem_regions(iommu);
2661         }
2662
2663         return 0;
2664 error:
2665         for_each_drhd_unit(drhd) {
2666                 if (drhd->ignored)
2667                         continue;
2668                 iommu = drhd->iommu;
2669                 free_iommu(iommu);
2670         }
2671         kfree(g_iommus);
2672         return ret;
2673 }
2674
2675 /* This takes a number of _MM_ pages, not VTD pages */
2676 static struct iova *intel_alloc_iova(struct device *dev,
2677                                      struct dmar_domain *domain,
2678                                      unsigned long nrpages, uint64_t dma_mask)
2679 {
2680         struct pci_dev *pdev = to_pci_dev(dev);
2681         struct iova *iova = NULL;
2682
2683         /* Restrict dma_mask to the width that the iommu can handle */
2684         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2685
2686         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2687                 /*
2688                  * First try to allocate an io virtual address in
2689                  * DMA_BIT_MASK(32) and if that fails then try allocating
2690                  * from higher range
2691                  */
2692                 iova = alloc_iova(&domain->iovad, nrpages,
2693                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2694                 if (iova)
2695                         return iova;
2696         }
2697         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2698         if (unlikely(!iova)) {
2699                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2700                        nrpages, pci_name(pdev));
2701                 return NULL;
2702         }
2703
2704         return iova;
2705 }
2706
2707 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2708 {
2709         struct dmar_domain *domain;
2710         int ret;
2711
2712         domain = get_domain_for_dev(pdev,
2713                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2714         if (!domain) {
2715                 printk(KERN_ERR
2716                         "Allocating domain for %s failed", pci_name(pdev));
2717                 return NULL;
2718         }
2719
2720         /* make sure context mapping is ok */
2721         if (unlikely(!domain_context_mapped(pdev))) {
2722                 ret = domain_context_mapping(domain, pdev,
2723                                              CONTEXT_TT_MULTI_LEVEL);
2724                 if (ret) {
2725                         printk(KERN_ERR
2726                                 "Domain context map for %s failed",
2727                                 pci_name(pdev));
2728                         return NULL;
2729                 }
2730         }
2731
2732         return domain;
2733 }
2734
2735 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2736 {
2737         struct device_domain_info *info;
2738
2739         /* No lock here, assumes no domain exit in normal case */
2740         info = dev->dev.archdata.iommu;
2741         if (likely(info))
2742                 return info->domain;
2743
2744         return __get_valid_domain_for_dev(dev);
2745 }
2746
2747 static int iommu_dummy(struct pci_dev *pdev)
2748 {
2749         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2750 }
2751
2752 /* Check if the pdev needs to go through non-identity map and unmap process.*/
2753 static int iommu_no_mapping(struct device *dev)
2754 {
2755         struct pci_dev *pdev;
2756         int found;
2757
2758         if (unlikely(dev->bus != &pci_bus_type))
2759                 return 1;
2760
2761         pdev = to_pci_dev(dev);
2762         if (iommu_dummy(pdev))
2763                 return 1;
2764
2765         if (!iommu_identity_mapping)
2766                 return 0;
2767
2768         found = identity_mapping(pdev);
2769         if (found) {
2770                 if (iommu_should_identity_map(pdev, 0))
2771                         return 1;
2772                 else {
2773                         /*
2774                          * 32 bit DMA: the device is removed from si_domain and
2775                          * falls back to non-identity mapping.
2776                          */
2777                         domain_remove_one_dev_info(si_domain, pdev);
2778                         printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2779                                pci_name(pdev));
2780                         return 0;
2781                 }
2782         } else {
2783                 /*
2784                          * In case of a 64 bit DMA device detached from a VM, the
2785                          * device is put into si_domain for identity mapping.
2786                  */
2787                 if (iommu_should_identity_map(pdev, 0)) {
2788                         int ret;
2789                         ret = domain_add_dev_info(si_domain, pdev,
2790                                                   hw_pass_through ?
2791                                                   CONTEXT_TT_PASS_THROUGH :
2792                                                   CONTEXT_TT_MULTI_LEVEL);
2793                         if (!ret) {
2794                                 printk(KERN_INFO "64bit %s uses identity mapping\n",
2795                                        pci_name(pdev));
2796                                 return 1;
2797                         }
2798                 }
2799         }
2800
2801         return 0;
2802 }
2803
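/*
 * Map a single physical range for DMA: round the size up to whole VT-d
 * pages, allocate an iova below the device's DMA mask, install the PTEs
 * and flush the IOTLB (caching mode) or the write buffer.
 */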
2804 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2805                                      size_t size, int dir, u64 dma_mask)
2806 {
2807         struct pci_dev *pdev = to_pci_dev(hwdev);
2808         struct dmar_domain *domain;
2809         phys_addr_t start_paddr;
2810         struct iova *iova;
2811         int prot = 0;
2812         int ret;
2813         struct intel_iommu *iommu;
2814         unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2815
2816         BUG_ON(dir == DMA_NONE);
2817
2818         if (iommu_no_mapping(hwdev))
2819                 return paddr;
2820
2821         domain = get_valid_domain_for_dev(pdev);
2822         if (!domain)
2823                 return 0;
2824
2825         iommu = domain_get_iommu(domain);
2826         size = aligned_nrpages(paddr, size);
2827
2828         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2829         if (!iova)
2830                 goto error;
2831
2832         /*
2833          * Check if DMAR supports zero-length reads on write only
2834          * mappings..
2835          */
2836         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2837                         !cap_zlr(iommu->cap))
2838                 prot |= DMA_PTE_READ;
2839         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2840                 prot |= DMA_PTE_WRITE;
2841         /*
2842          * paddr - (paddr + size) might be a partial page, we should map the
2843          * whole page.  Note: if two parts of one page are separately mapped, we
2844          * might have two guest_addr mappings to the same host paddr, but this
2845          * is not a big problem
2846          */
2847         ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2848                                  mm_to_dma_pfn(paddr_pfn), size, prot);
2849         if (ret)
2850                 goto error;
2851
2852         /* it's a non-present to present mapping. Only flush if caching mode */
2853         if (cap_caching_mode(iommu->cap))
2854                 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2855         else
2856                 iommu_flush_write_buffer(iommu);
2857
2858         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2859         start_paddr += paddr & ~PAGE_MASK;
2860         return start_paddr;
2861
2862 error:
2863         if (iova)
2864                 __free_iova(&domain->iovad, iova);
2865         printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2866                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2867         return 0;
2868 }
2869
2870 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2871                                  unsigned long offset, size_t size,
2872                                  enum dma_data_direction dir,
2873                                  struct dma_attrs *attrs)
2874 {
2875         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2876                                   dir, to_pci_dev(dev)->dma_mask);
2877 }
2878
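/*
 * Drain the deferred-unmap queues: invalidate the IOTLB on each IOMMU
 * (globally, or per-range when caching mode makes global flushes costly)
 * and free the queued iovas.
 */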
2879 static void flush_unmaps(void)
2880 {
2881         int i, j;
2882
2883         timer_on = 0;
2884
2885         /* just flush them all */
2886         for (i = 0; i < g_num_of_iommus; i++) {
2887                 struct intel_iommu *iommu = g_iommus[i];
2888                 if (!iommu)
2889                         continue;
2890
2891                 if (!deferred_flush[i].next)
2892                         continue;
2893
2894                 /* In caching mode, global flushes make emulation expensive */
2895                 if (!cap_caching_mode(iommu->cap))
2896                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2897                                          DMA_TLB_GLOBAL_FLUSH);
2898                 for (j = 0; j < deferred_flush[i].next; j++) {
2899                         unsigned long mask;
2900                         struct iova *iova = deferred_flush[i].iova[j];
2901                         struct dmar_domain *domain = deferred_flush[i].domain[j];
2902
2903                         /* On real hardware multiple invalidations are expensive */
2904                         if (cap_caching_mode(iommu->cap))
2905                                 iommu_flush_iotlb_psi(iommu, domain->id,
2906                                 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2907                         else {
2908                                 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2909                                 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2910                                                 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2911                         }
2912                         __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2913                 }
2914                 deferred_flush[i].next = 0;
2915         }
2916
2917         list_size = 0;
2918 }
2919
2920 static void flush_unmaps_timeout(unsigned long data)
2921 {
2922         unsigned long flags;
2923
2924         spin_lock_irqsave(&async_umap_flush_lock, flags);
2925         flush_unmaps();
2926         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2927 }
2928
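/*
 * Queue an iova for lazy freeing on its IOMMU's deferred-flush list and
 * arm the flush timer; once HIGH_WATER_MARK entries accumulate, flush
 * synchronously instead.
 */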
2929 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2930 {
2931         unsigned long flags;
2932         int next, iommu_id;
2933         struct intel_iommu *iommu;
2934
2935         spin_lock_irqsave(&async_umap_flush_lock, flags);
2936         if (list_size == HIGH_WATER_MARK)
2937                 flush_unmaps();
2938
2939         iommu = domain_get_iommu(dom);
2940         iommu_id = iommu->seq_id;
2941
2942         next = deferred_flush[iommu_id].next;
2943         deferred_flush[iommu_id].domain[next] = dom;
2944         deferred_flush[iommu_id].iova[next] = iova;
2945         deferred_flush[iommu_id].next++;
2946
2947         if (!timer_on) {
2948                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2949                 timer_on = 1;
2950         }
2951         list_size++;
2952         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2953 }
2954
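/*
 * Undo a DMA mapping: clear the PTEs and free the page tables for the
 * iova range, then either flush immediately (strict mode) or defer the
 * IOTLB flush and iova release via add_unmap().
 */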
2955 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2956                              size_t size, enum dma_data_direction dir,
2957                              struct dma_attrs *attrs)
2958 {
2959         struct pci_dev *pdev = to_pci_dev(dev);
2960         struct dmar_domain *domain;
2961         unsigned long start_pfn, last_pfn;
2962         struct iova *iova;
2963         struct intel_iommu *iommu;
2964
2965         if (iommu_no_mapping(dev))
2966                 return;
2967
2968         domain = find_domain(pdev);
2969         BUG_ON(!domain);
2970
2971         iommu = domain_get_iommu(domain);
2972
2973         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2974         if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2975                       (unsigned long long)dev_addr))
2976                 return;
2977
2978         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2979         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2980
2981         pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2982                  pci_name(pdev), start_pfn, last_pfn);
2983
2984         /* clear the PTEs covering the whole IOVA range */
2985         dma_pte_clear_range(domain, start_pfn, last_pfn);
2986
2987         /* free page tables */
2988         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2989
2990         if (intel_iommu_strict) {
2991                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2992                                       last_pfn - start_pfn + 1, 0);
2993                 /* free iova */
2994                 __free_iova(&domain->iovad, iova);
2995         } else {
2996                 add_unmap(domain, iova);
2997                 /*
2998                  * Queue the IOVA for lazy release; batching IOTLB flushes
2999                  * saves roughly 1/6th of the CPU time spent flushing per unmap.
3000                  */
3001         }
3002 }
3003
3004 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
3005                                   dma_addr_t *dma_handle, gfp_t flags,
3006                                   struct dma_attrs *attrs)
3007 {
3008         void *vaddr;
3009         int order;
3010
3011         size = PAGE_ALIGN(size);
3012         order = get_order(size);
3013
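             /*
              * If the IOMMU will provide a mapping for this device, the zone
              * restriction can be dropped: any page can be remapped into the
              * device's reachable range.  Without a mapping (identity /
              * passthrough), the allocation itself has to satisfy the
              * device's coherent DMA mask, hence GFP_DMA/GFP_DMA32 below.
              */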
3014         if (!iommu_no_mapping(hwdev))
3015                 flags &= ~(GFP_DMA | GFP_DMA32);
3016         else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3017                 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3018                         flags |= GFP_DMA;
3019                 else
3020                         flags |= GFP_DMA32;
3021         }
3022
3023         vaddr = (void *)__get_free_pages(flags, order);
3024         if (!vaddr)
3025                 return NULL;
3026         memset(vaddr, 0, size);
3027
3028         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3029                                          DMA_BIDIRECTIONAL,
3030                                          hwdev->coherent_dma_mask);
3031         if (*dma_handle)
3032                 return vaddr;
3033         free_pages((unsigned long)vaddr, order);
3034         return NULL;
3035 }
3036
3037 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3038                                 dma_addr_t dma_handle, struct dma_attrs *attrs)
3039 {
3040         int order;
3041
3042         size = PAGE_ALIGN(size);
3043         order = get_order(size);
3044
3045         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3046         free_pages((unsigned long)vaddr, order);
3047 }
3048
3049 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3050                            int nelems, enum dma_data_direction dir,
3051                            struct dma_attrs *attrs)
3052 {
3053         struct pci_dev *pdev = to_pci_dev(hwdev);
3054         struct dmar_domain *domain;
3055         unsigned long start_pfn, last_pfn;
3056         struct iova *iova;
3057         struct intel_iommu *iommu;
3058
3059         if (iommu_no_mapping(hwdev))
3060                 return;
3061
3062         domain = find_domain(pdev);
3063         BUG_ON(!domain);
3064
3065         iommu = domain_get_iommu(domain);
3066
3067         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3068         if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3069                       (unsigned long long)sglist[0].dma_address))
3070                 return;
3071
3072         start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3073         last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3074
3075         /* clear the PTEs covering the whole IOVA range */
3076         dma_pte_clear_range(domain, start_pfn, last_pfn);
3077
3078         /* free page tables */
3079         dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3080
3081         if (intel_iommu_strict) {
3082                 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3083                                       last_pfn - start_pfn + 1, 0);
3084                 /* free iova */
3085                 __free_iova(&domain->iovad, iova);
3086         } else {
3087                 add_unmap(domain, iova);
3088                 /*
3089                  * Queue the IOVA for lazy release; batching IOTLB flushes
3090                  * saves roughly 1/6th of the CPU time spent flushing per unmap.
3091                  */
3092         }
3093 }
3094
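     /*
      * Fallback for devices that bypass translation (iommu_no_mapping()):
      * with no IOMMU mapping in play, the bus address the device uses is
      * simply the physical address of each segment.
      */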
3095 static int intel_nontranslate_map_sg(struct device *hwdev,
3096         struct scatterlist *sglist, int nelems, int dir)
3097 {
3098         int i;
3099         struct scatterlist *sg;
3100
3101         for_each_sg(sglist, sg, nelems, i) {
3102                 BUG_ON(!sg_page(sg));
3103                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3104                 sg->dma_length = sg->length;
3105         }
3106         return nelems;
3107 }
3108
3109 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3110                         enum dma_data_direction dir, struct dma_attrs *attrs)
3111 {
3112         int i;
3113         struct pci_dev *pdev = to_pci_dev(hwdev);
3114         struct dmar_domain *domain;
3115         size_t size = 0;
3116         int prot = 0;
3117         struct iova *iova = NULL;
3118         int ret;
3119         struct scatterlist *sg;
3120         unsigned long start_vpfn;
3121         struct intel_iommu *iommu;
3122
3123         BUG_ON(dir == DMA_NONE);
3124         if (iommu_no_mapping(hwdev))
3125                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3126
3127         domain = get_valid_domain_for_dev(pdev);
3128         if (!domain)
3129                 return 0;
3130
3131         iommu = domain_get_iommu(domain);
3132
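             /*
              * Sum the pages of every segment first so that one contiguous
              * IOVA range covering the whole scatterlist can be allocated;
              * domain_sg_mapping() below then lays the segments out
              * back-to-back starting at start_vpfn.
              */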
3133         for_each_sg(sglist, sg, nelems, i)
3134                 size += aligned_nrpages(sg->offset, sg->length);
3135
3136         iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3137                                 pdev->dma_mask);
3138         if (!iova) {
3139                 sglist->dma_length = 0;
3140                 return 0;
3141         }
3142
3143         /*
3144          * Check if DMAR supports zero-length reads on write-only
3145          * mappings; if not, the mapping must be readable as well.
3146          */
3147         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
3148                         !cap_zlr(iommu->cap))
3149                 prot |= DMA_PTE_READ;
3150         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3151                 prot |= DMA_PTE_WRITE;
3152
3153         start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3154
3155         ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3156         if (unlikely(ret)) {
3157                 /* clear any PTEs that were partially set up */
3158                 dma_pte_clear_range(domain, start_vpfn,
3159                                     start_vpfn + size - 1);
3160                 /* free page tables */
3161                 dma_pte_free_pagetable(domain, start_vpfn,
3162                                        start_vpfn + size - 1);
3163                 /* free iova */
3164                 __free_iova(&domain->iovad, iova);
3165                 return 0;
3166         }
3167
3168         /* It's a non-present to present mapping; only flush if in caching mode */
3169         if (cap_caching_mode(iommu->cap))
3170                 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3171         else
3172                 iommu_flush_write_buffer(iommu);
3173
3174         return nelems;
3175 }
3176
3177 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3178 {
3179         return !dma_addr;
3180 }
3181
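     /*
      * DMA API callbacks backed by the IOMMU.  These are installed as the
      * platform's dma_ops when the IOMMU is initialised, so drivers using
      * the generic DMA API go through VT-d translation transparently.
      * Driver-side sketch (pdev, buf and len are placeholders; the calls
      * are the standard DMA API):
      *
      *      dma_addr_t handle;
      *
      *      handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
      *      if (dma_mapping_error(&pdev->dev, handle))
      *              return -ENOMEM;
      *      ...
      *      dma_unmap_single(&pdev->dev, handle, len, DMA_TO_DEVICE);
      */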
3182 struct dma_map_ops intel_dma_ops = {
3183         .alloc = intel_alloc_coherent,
3184         .free = intel_free_coherent,
3185         .map_sg = intel_map_sg,
3186         .unmap_sg = intel_unmap_sg,
3187         .map_page = intel_map_page,
3188         .unmap_page = intel_unmap_page,
3189         .mapping_error = intel_mapping_error,
3190 };
3191
3192 static inline int iommu_domain_cache_init(void)
3193 {
3194         int ret = 0;
3195
3196         iommu_domain_cache = kmem_cache_create("iommu_domain",
3197                                          sizeof(struct dmar_domain),
3198                                          0,
3199                                          SLAB_HWCACHE_ALIGN,
3201                                          NULL);
3202         if (!iommu_domain_cache) {
3203                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3204                 ret = -ENOMEM;
3205         }
3206
3207         return ret;
3208 }
3209
3210 static inline int iommu_devinfo_cache_init(void)
3211 {
3212         int ret = 0;
3213
3214         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3215                                          sizeof(struct device_domain_info),
3216                                          0,
3217                                          SLAB_HWCACHE_ALIGN,
3218                                          NULL);
3219         if (!iommu_devinfo_cache) {
3220                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3221                 ret = -ENOMEM;
3222         }
3223
3224         return ret;
3225 }
3226
3227 static inline int iommu_iova_cache_init(void)
3228 {
3229         int ret = 0;
3230
3231         iommu_iova_cache = kmem_cache_create("iommu_iova",
3232                                          sizeof(struct iova),
3233                                          0,
3234                                          SLAB_HWCACHE_ALIGN,
3235                                          NULL);
3236         if (!iommu_iova_cache) {
3237                 printk(KERN_ERR "Couldn't create iova cache\n");
3238                 ret = -ENOMEM;
3239         }
3240
3241         return ret;
3242 }
3243
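     /*
      * Create the three slab caches this driver allocates from (iova,
      * domain and device_domain_info).  On failure, any cache that was
      * already created is destroyed again, in reverse order, before
      * returning -ENOMEM.
      */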
3244 static int __init iommu_init_mempool(void)
3245 {
3246         int ret;
3247         ret = iommu_iova_cache_init();
3248         if (ret)
3249                 return ret;
3250
3251         ret = iommu_domain_cache_init();
3252         if (ret)
3253                 goto domain_error;
3254
3255         ret = iommu_devinfo_cache_init();
3256         if (!ret)
3257                 return ret;
3258
3259         kmem_cache_destroy(iommu_domain_cache);
3260 domain_error:
3261         kmem_cache_destroy(iommu_iova_cache);
3262
3263         return -ENOMEM;
3264 }
3265
3266 static void __init iommu_exit_mempool(void)
3267 {
3268         kmem_cache_destroy(iommu_devinfo_cache);
3269         kmem_cache_destroy(iommu_domain_cache);
3270         kmem_cache_destroy(iommu_iova_cache);
3272 }
3273
3274 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3275 {
3276         struct dmar_drhd_unit *drhd;
3277         u32 vtbar;
3278         int rc;
3279
3280         /* We know that this device on this chipset has its own IOMMU.
3281          * If we find it under a different IOMMU, then the BIOS is lying
3282          * to us. Hope that the IOMMU for this device is actually
3283          * disabled, and it needs no translation...
3284          */
3285         rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3286         if (rc) {
3287                 /* "can't" happen */
3288                 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3289                 return;
3290         }
3291         vtbar &= 0xffff0000;
3292
3293         /* we know that this IOMMU should be at offset 0xa000 from vtbar */
3294         drhd = dmar_find_matched_drhd_unit(pdev);
3295         if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3296                             TAINT_FIRMWARE_WORKAROUND,
3297                             "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3298                 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3299 }
3300 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3301
3302 static void __init init_no_remapping_devices(void)
3303 {
3304         struct dmar_drhd_unit *drhd;
3305
3306         for_each_drhd_unit(drhd) {