]> git.openfabrics.org - ~shefty/rdma-dev.git/blob - drivers/iommu/intel-iommu.c
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu
[~shefty/rdma-dev.git] / drivers / iommu / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #define ROOT_SIZE               VTD_PAGE_SIZE
49 #define CONTEXT_SIZE            VTD_PAGE_SIZE
50
51 #define IS_BRIDGE_HOST_DEVICE(pdev) \
52                             ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST)
53 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
54 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
55 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
56
57 #define IOAPIC_RANGE_START      (0xfee00000)
58 #define IOAPIC_RANGE_END        (0xfeefffff)
59 #define IOVA_START_ADDR         (0x1000)
60
61 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
62
63 #define MAX_AGAW_WIDTH 64
64
65 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
66 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
67
68 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
69    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
70 #define DOMAIN_MAX_PFN(gaw)     ((unsigned long) min_t(uint64_t, \
71                                 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
72 #define DOMAIN_MAX_ADDR(gaw)    (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
73
74 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
75 #define DMA_32BIT_PFN           IOVA_PFN(DMA_BIT_MASK(32))
76 #define DMA_64BIT_PFN           IOVA_PFN(DMA_BIT_MASK(64))
77
78 /* page table handling */
79 #define LEVEL_STRIDE            (9)
80 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
81
82 /*
83  * This bitmap is used to advertise the page sizes our hardware support
84  * to the IOMMU core, which will then use this information to split
85  * physically contiguous memory regions it is mapping into page sizes
86  * that we support.
87  *
88  * Traditionally the IOMMU core just handed us the mappings directly,
89  * after making sure the size is an order of a 4KiB page and that the
90  * mapping has natural alignment.
91  *
92  * To retain this behavior, we currently advertise that we support
93  * all page sizes that are an order of 4KiB.
94  *
95  * If at some point we'd like to utilize the IOMMU core's new behavior,
96  * we could change this to advertise the real page sizes we support.
97  */
98 #define INTEL_IOMMU_PGSIZES     (~0xFFFUL)
99
/* Number of page-table levels for a given adjusted guest address
 * width: AGAW 0 is a 2-level (30-bit) table, each step adds a level. */
static inline int agaw_to_level(int agaw)
{
	return 2 + agaw;
}
104
105 static inline int agaw_to_width(int agaw)
106 {
107         return 30 + agaw * LEVEL_STRIDE;
108 }
109
110 static inline int width_to_agaw(int width)
111 {
112         return (width - 30) / LEVEL_STRIDE;
113 }
114
115 static inline unsigned int level_to_offset_bits(int level)
116 {
117         return (level - 1) * LEVEL_STRIDE;
118 }
119
120 static inline int pfn_level_offset(unsigned long pfn, int level)
121 {
122         return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
123 }
124
/* Mask selecting the pfn bits at or above @level's boundary. */
static inline unsigned long level_mask(int level)
{
	return ~0UL << level_to_offset_bits(level);
}
129
/* Number of base pfns spanned by one entry at @level. */
static inline unsigned long level_size(int level)
{
	unsigned int bits = level_to_offset_bits(level);

	return 1UL << bits;
}
134
/* Round @pfn up to the next @level boundary. */
static inline unsigned long align_to_level(unsigned long pfn, int level)
{
	unsigned long span = level_size(level);

	return (pfn + span - 1) & level_mask(level);
}
139
140 static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
141 {
142         return  1 << ((lvl - 1) * LEVEL_STRIDE);
143 }
144
/* VT-d pages must always be _smaller_ than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
{
	/* Convert a VT-d (4KiB-granule) pfn to an MM pfn. */
	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
}

static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	/* Convert an MM pfn to the VT-d pfn of its first 4KiB unit. */
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
/* VT-d pfn of the start of @pg */
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
/* VT-d pfn of the page backing kernel virtual address @p */
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
164
165 /* global iommu list, set NULL for ignored DMAR units */
166 static struct intel_iommu **g_iommus;
167
168 static void __init check_tylersburg_isoch(void);
169 static int rwbf_quirk;
170
171 /*
172  * set to 1 to panic kernel if can't successfully enable VT-d
173  * (used when kernel is launched w/ TXT)
174  */
175 static int force_on = 0;
176
/*
 * Root-table entry, one per bus number; the low qword holds the
 * present bit and the physical address of that bus's context table.
 * 0: Present
 * 1-11: Reserved
 * 12-63: Context Ptr (12 - (haw-1))
 * 64-127: Reserved
 */
struct root_entry {
	u64	val;
	u64	rsvd1;
};
#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
/* Is the present bit (bit 0) set? */
static inline bool root_present(struct root_entry *root)
{
	return (root->val & 1);
}
/* Mark the entry present (context pointer must already be valid). */
static inline void set_root_present(struct root_entry *root)
{
	root->val |= 1;
}
/*
 * Install the page-aligned physical address of the context table.
 * NOTE(review): this ORs the address in rather than replacing the
 * field — callers must only use it on a fresh/zeroed entry.
 */
static inline void set_root_value(struct root_entry *root, unsigned long value)
{
	root->val |= value & VTD_PAGE_MASK;
}

/* Virtual address of the bus's context table, or NULL if not present. */
static inline struct context_entry *
get_context_addr_from_root(struct root_entry *root)
{
	return (struct context_entry *)
		(root_present(root)?phys_to_virt(
		root->val & VTD_PAGE_MASK) :
		NULL);
}
209
/*
 * Context-table entry, one per (bus, devfn): points at the
 * second-level page table used to translate that device's DMA.
 * low 64 bits:
 * 0: present
 * 1: fault processing disable
 * 2-3: translation type
 * 12-63: address space root
 * high 64 bits:
 * 0-2: address width
 * 3-6: aval
 * 8-23: domain id
 */
struct context_entry {
	u64 lo;
	u64 hi;
};

/* Present bit (lo bit 0) set? */
static inline bool context_present(struct context_entry *context)
{
	return (context->lo & 1);
}
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}

/* Clear bit 1 (fault processing disable), i.e. enable fault reporting. */
static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}

/* Set the 2-bit translation-type field (lo bits 2-3), clearing it first. */
static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}

/* OR in the page-aligned physical address of the page-table root.
 * NOTE(review): assumes the field is currently zero — not cleared first. */
static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo |= value & VTD_PAGE_MASK;
}

/* Address-width field (hi bits 0-2); assumes the field is zero. */
static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}

/* 16-bit domain id (hi bits 8-23); assumes the field is zero. */
static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}

/* Wipe both halves of the entry, marking it not-present. */
static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
270
/*
 * Second-level page-table entry (same layout at every level):
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
struct dma_pte {
	u64 val;
};

static inline void dma_clear_pte(struct dma_pte *pte)
{
	pte->val = 0;
}

static inline void dma_set_pte_readable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_READ;
}

static inline void dma_set_pte_writable(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_WRITE;
}

static inline void dma_set_pte_snp(struct dma_pte *pte)
{
	pte->val |= DMA_PTE_SNP;
}

/* Replace the read/write bits with the low two bits of @prot. */
static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
{
	pte->val = (pte->val & ~3) | (prot & 3);
}

/* Physical address held in the PTE. */
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
	return pte->val & VTD_PAGE_MASK;
#else
	/* Must have a full atomic 64-bit read */
	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
#endif
}

/* OR @pfn into the address field; assumes the field is zero. */
static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
{
	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
}

/* Present == readable and/or writable (either of the low two bits). */
static inline bool dma_pte_present(struct dma_pte *pte)
{
	return (pte->val & 3) != 0;
}

/* Superpage bit (bit 7) set? */
static inline bool dma_pte_superpage(struct dma_pte *pte)
{
	return (pte->val & (1 << 7));
}

/* True if @pte is the first entry of its 4KiB page-table page. */
static inline int first_pte_in_page(struct dma_pte *pte)
{
	return !((unsigned long)pte & ~VTD_PAGE_MASK);
}
338
339 /*
340  * This domain is a statically identity mapping domain.
341  *      1. This domain creats a static 1:1 mapping to all usable memory.
342  *      2. It maps to each iommu if successful.
343  *      3. Each iommu mapps to this domain if successful.
344  */
345 static struct dmar_domain *si_domain;
346 static int hw_pass_through = 1;
347
348 /* devices under the same p2p bridge are owned in one domain */
349 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
350
351 /* domain represents a virtual machine, more than one devices
352  * across iommus may be owned in one domain, e.g. kvm guest.
353  */
354 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
355
356 /* si_domain contains mulitple devices */
357 #define DOMAIN_FLAG_STATIC_IDENTITY     (1 << 2)
358
/*
 * One DMA-remapping domain: an IOVA address space, its page table,
 * and the set of IOMMUs/devices attached to it.
 * NOTE(review): iommu_bmp is a single unsigned long, so at most
 * BITS_PER_LONG IOMMU units can be tracked — confirm g_num_of_iommus
 * can never exceed that on supported platforms.
 */
struct dmar_domain {
	int	id;			/* domain id */
	int	nid;			/* node id */
	unsigned long iommu_bmp;	/* bitmap of iommus this domain uses*/

	struct list_head devices;	/* all devices' list */
	struct iova_domain iovad;	/* iova's that belong to this domain */

	struct dma_pte	*pgd;		/* virtual address */
	int		gaw;		/* max guest address width */

	/* adjusted guest address width, 0 is level 2 30-bit */
	int		agaw;

	int		flags;		/* flags to find out type of domain */

	int		iommu_coherency;/* indicate coherency of iommu access */
	int		iommu_snooping; /* indicate snooping control feature*/
	int		iommu_count;	/* reference count of iommu */
	int		iommu_superpage;/* Level of superpages supported:
					   0 == 4KiB (no superpages), 1 == 2MiB,
					   2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
	spinlock_t	iommu_lock;	/* protect iommu set in domain */
	u64		max_addr;	/* maximum mapped address */
};
384
/* PCI domain-device relationship.
 * NOTE(review): list membership presumably protected by
 * device_domain_lock — confirm before relying on it. */
struct device_domain_info {
	struct list_head link;	/* link to domain siblings */
	struct list_head global; /* link to global list */
	int segment;		/* PCI domain */
	u8 bus;			/* PCI bus number */
	u8 devfn;		/* PCI devfn number */
	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
	struct intel_iommu *iommu; /* IOMMU used by this device */
	struct dmar_domain *domain; /* pointer to domain */
};
396
397 static void flush_unmaps_timeout(unsigned long data);
398
399 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
400
#define HIGH_WATER_MARK 250
/* Batch of IOVAs whose unmap/IOTLB flush has been deferred
 * (presumably one table per IOMMU — confirm against the code that
 * allocates deferred_flush).  Drained when full or by unmap_timer. */
struct deferred_flush_tables {
	int next;	/* first unused slot in the arrays below */
	struct iova *iova[HIGH_WATER_MARK];
	struct dmar_domain *domain[HIGH_WATER_MARK];
};
407
408 static struct deferred_flush_tables *deferred_flush;
409
410 /* bitmap for indexing intel_iommus */
411 static int g_num_of_iommus;
412
413 static DEFINE_SPINLOCK(async_umap_flush_lock);
414 static LIST_HEAD(unmaps_to_do);
415
416 static int timer_on;
417 static long list_size;
418
419 static void domain_remove_dev_info(struct dmar_domain *domain);
420
421 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
422 int dmar_disabled = 0;
423 #else
424 int dmar_disabled = 1;
425 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
426
427 int intel_iommu_enabled = 0;
428 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
429
430 static int dmar_map_gfx = 1;
431 static int dmar_forcedac;
432 static int intel_iommu_strict;
433 static int intel_iommu_superpage = 1;
434
435 int intel_iommu_gfx_mapped;
436 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
437
438 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
439 static DEFINE_SPINLOCK(device_domain_lock);
440 static LIST_HEAD(device_domain_list);
441
442 static struct iommu_ops intel_iommu_ops;
443
/*
 * Parse the "intel_iommu=" boot parameter: a comma-separated list of
 * keywords (on, off, igfx_off, forcedac, strict, sp_off).  Unknown
 * tokens are silently ignored.  Returns -EINVAL only for a NULL
 * string, otherwise 0.
 */
static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "on", 2)) {
			dmar_disabled = 0;
			printk(KERN_INFO "Intel-IOMMU: enabled\n");
		} else if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO "Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		} else if (!strncmp(str, "strict", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable batched IOTLB flush\n");
			intel_iommu_strict = 1;
		} else if (!strncmp(str, "sp_off", 6)) {
			printk(KERN_INFO
				"Intel-IOMMU: disable supported super page\n");
			intel_iommu_superpage = 0;
		}

		/* skip to the character after the next comma (if any) */
		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
479 __setup("intel_iommu=", intel_iommu_setup);
480
481 static struct kmem_cache *iommu_domain_cache;
482 static struct kmem_cache *iommu_devinfo_cache;
483 static struct kmem_cache *iommu_iova_cache;
484
485 static inline void *alloc_pgtable_page(int node)
486 {
487         struct page *page;
488         void *vaddr = NULL;
489
490         page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
491         if (page)
492                 vaddr = page_address(page);
493         return vaddr;
494 }
495
/* Release a page obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

/* Slab wrappers; GFP_ATOMIC because callers may hold spinlocks. */
static inline void *alloc_domain_mem(void)
{
	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
}

static void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}

static inline void * alloc_devinfo_mem(void)
{
	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

/* Not static: presumably called by the shared iova allocator — confirm. */
struct iova *alloc_iova_mem(void)
{
	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}
530
531
532 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
533 {
534         unsigned long sagaw;
535         int agaw = -1;
536
537         sagaw = cap_sagaw(iommu->cap);
538         for (agaw = width_to_agaw(max_gaw);
539              agaw >= 0; agaw--) {
540                 if (test_bit(agaw, &sagaw))
541                         break;
542         }
543
544         return agaw;
545 }
546
/*
 * Calculate max SAGAW for each iommu, i.e. the widest AGAW the
 * hardware advertises (bounded by MAX_AGAW_WIDTH).
 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}

/*
 * calculate agaw for each iommu.
 * "SAGAW" may be different across iommus, use a default agaw, and
 * get a supported less agaw for iommus that don't support the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
564
/* This function only returns a single iommu in a domain: the one at
 * the first set bit of iommu_bmp, or NULL if the bitmap is empty.
 * Only valid for ordinary domains — VM and static-identity domains
 * may span several IOMMUs. */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
{
	int iommu_id;

	/* si_domain and vm domain should not get here. */
	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);

	iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
		return NULL;

	return g_iommus[iommu_id];
}
580
581 static void domain_update_iommu_coherency(struct dmar_domain *domain)
582 {
583         int i;
584
585         domain->iommu_coherency = 1;
586
587         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
588                 if (!ecap_coherent(g_iommus[i]->ecap)) {
589                         domain->iommu_coherency = 0;
590                         break;
591                 }
592         }
593 }
594
595 static void domain_update_iommu_snooping(struct dmar_domain *domain)
596 {
597         int i;
598
599         domain->iommu_snooping = 1;
600
601         for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
602                 if (!ecap_sc_support(g_iommus[i]->ecap)) {
603                         domain->iommu_snooping = 0;
604                         break;
605                 }
606         }
607 }
608
/*
 * Recompute domain->iommu_superpage: the highest superpage level that
 * *every* active IOMMU supports (0 == 4KiB pages only).  Forced to 0
 * when superpages are disabled via intel_iommu=sp_off.
 */
static void domain_update_iommu_superpage(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu = NULL;
	int mask = 0xf;

	if (!intel_iommu_superpage) {
		domain->iommu_superpage = 0;
		return;
	}

	/* set iommu_superpage to the smallest common denominator */
	for_each_active_iommu(iommu, drhd) {
		mask &= cap_super_page_val(iommu->cap);
		if (!mask) {
			break;
		}
	}
	/* fls() of the common capability mask == highest shared level;
	 * fls(0) == 0, i.e. no superpage support at all. */
	domain->iommu_superpage = fls(mask);
}
629
/* Some capabilities may be different across iommus — recompute the
 * domain-wide minima after an IOMMU joins or leaves the domain. */
static void domain_update_iommu_cap(struct dmar_domain *domain)
{
	domain_update_iommu_coherency(domain);
	domain_update_iommu_snooping(domain);
	domain_update_iommu_superpage(domain);
}
637
/*
 * Find the IOMMU (DRHD unit) responsible for the PCI device
 * @segment:@bus:@devfn.  A unit matches if it explicitly lists the
 * device, if the device sits behind a listed bridge whose
 * secondary..subordinate bus range contains @bus, or if the unit is
 * marked include_all.  Ignored units are skipped.  Returns NULL when
 * no unit claims the device.
 */
static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
{
	struct dmar_drhd_unit *drhd = NULL;
	int i;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		if (segment != drhd->segment)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++) {
			/* exact device match */
			if (drhd->devices[i] &&
			    drhd->devices[i]->bus->number == bus &&
			    drhd->devices[i]->devfn == devfn)
				return drhd->iommu;
			/* device behind a listed bridge */
			if (drhd->devices[i] &&
			    drhd->devices[i]->subordinate &&
			    drhd->devices[i]->subordinate->number <= bus &&
			    drhd->devices[i]->subordinate->subordinate >= bus)
				return drhd->iommu;
		}

		if (drhd->include_all)
			return drhd->iommu;
	}

	return NULL;
}
667
668 static void domain_flush_cache(struct dmar_domain *domain,
669                                void *addr, int size)
670 {
671         if (!domain->iommu_coherency)
672                 clflush_cache_range(addr, size);
673 }
674
/* Gets context entry for a given bus and devfn.  Lazily allocates the
 * bus's context table (and installs it in the root entry) on first
 * use; both table and root entry are cache-flushed before the root is
 * marked present.  Takes iommu->lock.  Returns NULL only if the
 * allocation fails. */
static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)
				alloc_pgtable_page(iommu->node);
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		/* flush the (zeroed) table before making it reachable */
		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}
703
704 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
705 {
706         struct root_entry *root;
707         struct context_entry *context;
708         int ret;
709         unsigned long flags;
710
711         spin_lock_irqsave(&iommu->lock, flags);
712         root = &iommu->root_entry[bus];
713         context = get_context_addr_from_root(root);
714         if (!context) {
715                 ret = 0;
716                 goto out;
717         }
718         ret = context_present(&context[devfn]);
719 out:
720         spin_unlock_irqrestore(&iommu->lock, flags);
721         return ret;
722 }
723
/* Clear (mark not-present) the context entry for @bus:@devfn and
 * flush it from the cache; a no-op if the bus has no context table.
 * Takes iommu->lock. */
static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context) {
		context_clear_entry(&context[devfn]);
		__iommu_flush_cache(iommu, &context[devfn], \
			sizeof(*context));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}
740
741 static void free_context_table(struct intel_iommu *iommu)
742 {
743         struct root_entry *root;
744         int i;
745         unsigned long flags;
746         struct context_entry *context;
747
748         spin_lock_irqsave(&iommu->lock, flags);
749         if (!iommu->root_entry) {
750                 goto out;
751         }
752         for (i = 0; i < ROOT_ENTRY_NR; i++) {
753                 root = &iommu->root_entry[i];
754                 context = get_context_addr_from_root(root);
755                 if (context)
756                         free_pgtable_page(context);
757         }
758         free_pgtable_page(iommu->root_entry);
759         iommu->root_entry = NULL;
760 out:
761         spin_unlock_irqrestore(&iommu->lock, flags);
762 }
763
/*
 * Return the PTE for @pfn at @target_level, growing the page table
 * downward from the domain's pgd as needed.  Newly allocated levels
 * are linked in with cmpxchg64 so concurrent walkers race safely.
 * target_level == 0 means "lowest existing level": the walk stops at
 * the first superpage or non-present entry.  Returns NULL only when a
 * page-table page allocation fails.
 */
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int target_level)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;

	BUG_ON(!domain->pgd);
	/* pfn must fit within the domain's address width */
	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
	parent = domain->pgd;

	while (level > 0) {
		void *tmp_page;

		offset = pfn_level_offset(pfn, level);
		pte = &parent[offset];
		if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
			break;
		if (level == target_level)
			break;

		if (!dma_pte_present(pte)) {
			uint64_t pteval;

			tmp_page = alloc_pgtable_page(domain->nid);

			if (!tmp_page)
				return NULL;

			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
			if (cmpxchg64(&pte->val, 0ULL, pteval)) {
				/* Someone else set it while we were thinking; use theirs. */
				free_pgtable_page(tmp_page);
			} else {
				/* NOTE(review): return value discarded — looks
				 * like a leftover read; the cache flush below is
				 * what publishes the new entry. */
				dma_pte_addr(pte);
				domain_flush_cache(domain, pte, sizeof(*pte));
			}
		}
		/* descend into the (now present) next level */
		parent = phys_to_virt(dma_pte_addr(pte));
		level--;
	}

	return pte;
}
810
811
/* return address's pte at specific level.
 *
 * Walks down from the top level toward @level for @pfn.  Returns the
 * PTE at @level; if a large-page PTE is hit above @level, that PTE is
 * returned instead and *large_page is set to its level.  Returns NULL
 * (with *large_page set to the level reached) when a non-present
 * entry ends the walk early.  'total' tracks the current level. */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = pfn_level_offset(pfn, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(pte)) {
			*large_page = total;
			break;
		}

		if (pte->val & DMA_PTE_LARGE_PAGE) {
			*large_page = total;
			return pte;
		}

		parent = phys_to_virt(dma_pte_addr(pte));
		total--;
	}
	return NULL;
}
843
/* clear last level pte, a tlb flush should be followed.
 *
 * Clears every leaf (possibly large-page) PTE mapping the range
 * [start_pfn, last_pfn], flushing each modified PTE run from the CPU
 * cache.  Returns a flush "order" (9 bits per level) derived from the
 * large-page level of the range processed.
 * NOTE(review): the returned order reflects only the final
 * iteration's large_page value — confirm callers only use it for
 * IOTLB-flush sizing. */
static int dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned int large_page = 1;
	struct dma_pte *first_pte, *pte;
	int order;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* we don't need lock here; nobody else touches the iova range */
	do {
		large_page = 1;
		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
		if (!pte) {
			/* nothing mapped here; skip to the next aligned chunk */
			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
			continue;
		}
		/* clear consecutive PTEs within this page-table page */
		do {
			dma_clear_pte(pte);
			start_pfn += lvl_to_nr_pages(large_page);
			pte++;
		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));

		domain_flush_cache(domain, first_pte,
				   (void *)pte - (void *)first_pte);

	} while (start_pfn && start_pfn <= last_pfn);

	order = (large_page - 1) * 9;
	return order;
}
880
/* free page table pages. last level pte should already be cleared.
 *
 * Walks the intermediate levels bottom-up over [start_pfn, last_pfn],
 * freeing page-table pages whose entries are wholly contained in the
 * range, and finally frees the pgd itself when the range covers the
 * domain's entire address space. */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	struct dma_pte *first_pte, *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
	unsigned long tmp;
	int large_page = 2;

	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
	BUG_ON(start_pfn > last_pfn);

	/* We don't need lock here; nobody else touches the iova range */
	level = 2;
	while (level <= total) {
		tmp = align_to_level(start_pfn, level);

		/* If we can't even clear one PTE at this level, we're done */
		if (tmp + level_size(level) - 1 > last_pfn)
			return;

		do {
			large_page = level;
			first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page);
			/* a large page above us means skip ahead a level */
			if (large_page > level)
				level = large_page + 1;
			if (!pte) {
				tmp = align_to_level(tmp + 1, level + 1);
				continue;
			}
			/* free each child table referenced from this page */
			do {
				if (dma_pte_present(pte)) {
					free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
					dma_clear_pte(pte);
				}
				pte++;
				tmp += level_size(level);
			} while (!first_pte_in_page(pte) &&
				 tmp + level_size(level) - 1 <= last_pfn);

			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);

		} while (tmp && tmp + level_size(level) - 1 <= last_pfn);
		level++;
	}
	/* free pgd */
	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}
937
938 /* iommu handling */
939 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
940 {
941         struct root_entry *root;
942         unsigned long flags;
943
944         root = (struct root_entry *)alloc_pgtable_page(iommu->node);
945         if (!root)
946                 return -ENOMEM;
947
948         __iommu_flush_cache(iommu, root, ROOT_SIZE);
949
950         spin_lock_irqsave(&iommu->lock, flags);
951         iommu->root_entry = root;
952         spin_unlock_irqrestore(&iommu->lock, flags);
953
954         return 0;
955 }
956
/*
 * Program the root table address register with the previously allocated
 * root entry table and wait for the hardware to latch it (RTPS bit).
 */
static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 sts;
	unsigned long flag;

	addr = iommu->root_entry;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	/* Set Root Table Pointer command; keep other gcmd bits unchanged */
	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_RTPS), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
976
/*
 * Flush the IOMMU's internal write buffer so that queued translation
 * structure updates become visible.  No-op unless the hardware requires
 * it (RWBF capability) or the rwbf quirk is active.
 */
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
		return;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(val & DMA_GSTS_WBFS)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
994
/*
 * Issue a context-cache invalidation of the requested granularity
 * (global, domain-selective or device-selective) via the CCMD register
 * and spin until the hardware clears the ICC bit.
 */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
{
	u64 val = 0;
	unsigned long flag;

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	/* ICC kicks off the invalidation; hardware clears it when done */
	val |= DMA_CCMD_ICC;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
}
1028
/*
 * Issue a register-based IOTLB invalidation (global, domain-selective or
 * page-selective) and wait for completion.  For PSI, @addr must be
 * naturally aligned to 2^@size_order pages.
 */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
{
	/* the IOTLB registers live at an ecap-defined offset */
	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
	u64 val = 0, val_iva = 0;
	unsigned long flag;

	switch (type) {
	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need set IVA_REG */
		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
		break;
	case DMA_TLB_DSI_FLUSH:
		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		break;
	case DMA_TLB_PSI_FLUSH:
		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
		/* Note: always flush non-leaf currently */
		val_iva = size_order | addr;
		break;
	default:
		BUG();
	}
	/* Note: set drain read/write */
#if 0
	/*
	 * This is probably to be super secure.. Looks like we can
	 * ignore it without any impact.
	 */
	if (cap_read_drain(iommu->cap))
		val |= DMA_TLB_READ_DRAIN;
#endif
	if (cap_write_drain(iommu->cap))
		val |= DMA_TLB_WRITE_DRAIN;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	/* Note: Only uses first TLB reg currently */
	if (val_iva)
		dmar_writeq(iommu->reg + tlb_offset, val_iva);
	dmar_writeq(iommu->reg + tlb_offset + 8, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
		dmar_readq, (!(val & DMA_TLB_IVT)), val);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* check IOTLB invalidation granularity */
	if (DMA_TLB_IAIG(val) == 0)
		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
	/* hardware may have flushed at a coarser granularity than asked */
	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
			(unsigned long long)DMA_TLB_IIRG(type),
			(unsigned long long)DMA_TLB_IAIG(val));
}
1085
1086 static struct device_domain_info *iommu_support_dev_iotlb(
1087         struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1088 {
1089         int found = 0;
1090         unsigned long flags;
1091         struct device_domain_info *info;
1092         struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1093
1094         if (!ecap_dev_iotlb_support(iommu->ecap))
1095                 return NULL;
1096
1097         if (!iommu->qi)
1098                 return NULL;
1099
1100         spin_lock_irqsave(&device_domain_lock, flags);
1101         list_for_each_entry(info, &domain->devices, link)
1102                 if (info->bus == bus && info->devfn == devfn) {
1103                         found = 1;
1104                         break;
1105                 }
1106         spin_unlock_irqrestore(&device_domain_lock, flags);
1107
1108         if (!found || !info->dev)
1109                 return NULL;
1110
1111         if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1112                 return NULL;
1113
1114         if (!dmar_find_matched_atsr_unit(info->dev))
1115                 return NULL;
1116
1117         info->iommu = iommu;
1118
1119         return info;
1120 }
1121
1122 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1123 {
1124         if (!info)
1125                 return;
1126
1127         pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1128 }
1129
1130 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1131 {
1132         if (!info->dev || !pci_ats_enabled(info->dev))
1133                 return;
1134
1135         pci_disable_ats(info->dev);
1136 }
1137
/*
 * Invalidate the device IOTLB of every ATS-enabled device in @domain
 * for the range [addr, addr + 2^mask pages) via queued invalidation.
 */
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
{
	u16 sid, qdep;
	unsigned long flags;
	struct device_domain_info *info;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_entry(info, &domain->devices, link) {
		if (!info->dev || !pci_ats_enabled(info->dev))
			continue;

		/* source-id is bus:devfn of the requester */
		sid = info->bus << 8 | info->devfn;
		qdep = pci_ats_queue_depth(info->dev);
		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1156
/*
 * Flush the IOTLB for @pages pages starting at @pfn in domain @did,
 * using page-selective invalidation when possible and falling back to
 * domain-selective otherwise.  @map is non-zero for non-present to
 * present mapping changes (only needed in caching mode).
 */
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				  unsigned long pfn, unsigned int pages, int map)
{
	/* PSI takes the range as addr + log2(page count) */
	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;

	BUG_ON(pages == 0);

	/*
	 * Fallback to domain selective flush if no PSI support or the size is
	 * too big.
	 * PSI requires page size to be 2 ^ x, and the base address is naturally
	 * aligned to the size
	 */
	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
		iommu->flush.flush_iotlb(iommu, did, 0, 0,
						DMA_TLB_DSI_FLUSH);
	else
		iommu->flush.flush_iotlb(iommu, did, addr, mask,
						DMA_TLB_PSI_FLUSH);

	/*
	 * In caching mode, changes of pages from non-present to present require
	 * flush. However, device IOTLB doesn't need to be flushed in this case.
	 */
	if (!cap_caching_mode(iommu->cap) || !map)
		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
}
1185
/*
 * Disable the protected memory regions feature (set up by firmware/tboot)
 * so DMA to those regions is no longer blocked, and wait for the
 * protected region status bit to clear.
 */
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
{
	u32 pmen;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	pmen = readl(iommu->reg + DMAR_PMEN_REG);
	pmen &= ~DMA_PMEN_EPM;
	writel(pmen, iommu->reg + DMAR_PMEN_REG);

	/* wait for the protected region status bit to clear */
	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
		readl, !(pmen & DMA_PMEN_PRS), pmen);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
}
1202
/*
 * Turn on DMA remapping for this IOMMU (TE bit) and wait until the
 * hardware reports translation enabled.  Always returns 0.
 */
static int iommu_enable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flags;

	raw_spin_lock_irqsave(&iommu->register_lock, flags);
	/* cache the TE bit in gcmd so later command writes preserve it */
	iommu->gcmd |= DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (sts & DMA_GSTS_TES), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
	return 0;
}
1219
/*
 * Turn off DMA remapping for this IOMMU (clear TE bit) and wait until
 * the hardware reports translation disabled.  Always returns 0.
 */
static int iommu_disable_translation(struct intel_iommu *iommu)
{
	u32 sts;
	unsigned long flag;

	raw_spin_lock_irqsave(&iommu->register_lock, flag);
	iommu->gcmd &= ~DMA_GCMD_TE;
	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		      readl, (!(sts & DMA_GSTS_TES)), sts);

	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
	return 0;
}
1236
1237
1238 static int iommu_init_domains(struct intel_iommu *iommu)
1239 {
1240         unsigned long ndomains;
1241         unsigned long nlongs;
1242
1243         ndomains = cap_ndoms(iommu->cap);
1244         pr_debug("IOMMU %d: Number of Domains supportd <%ld>\n", iommu->seq_id,
1245                         ndomains);
1246         nlongs = BITS_TO_LONGS(ndomains);
1247
1248         spin_lock_init(&iommu->lock);
1249
1250         /* TBD: there might be 64K domains,
1251          * consider other allocation for future chip
1252          */
1253         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1254         if (!iommu->domain_ids) {
1255                 printk(KERN_ERR "Allocating domain id array failed\n");
1256                 return -ENOMEM;
1257         }
1258         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1259                         GFP_KERNEL);
1260         if (!iommu->domains) {
1261                 printk(KERN_ERR "Allocating domain array failed\n");
1262                 return -ENOMEM;
1263         }
1264
1265         /*
1266          * if Caching mode is set, then invalid translations are tagged
1267          * with domainid 0. Hence we need to pre-allocate it.
1268          */
1269         if (cap_caching_mode(iommu->cap))
1270                 set_bit(0, iommu->domain_ids);
1271         return 0;
1272 }
1273
1274
1275 static void domain_exit(struct dmar_domain *domain);
1276 static void vm_domain_exit(struct dmar_domain *domain);
1277
/*
 * Tear down one IOMMU: drop its reference on every attached domain
 * (destroying domains whose last IOMMU this was), disable translation,
 * release its interrupt, free the domain bookkeeping and context table,
 * and free the global g_iommus array once the last IOMMU is gone.
 */
void free_dmar_iommu(struct intel_iommu *iommu)
{
	struct dmar_domain *domain;
	int i;
	unsigned long flags;

	if ((iommu->domains) && (iommu->domain_ids)) {
		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
			domain = iommu->domains[i];
			clear_bit(i, iommu->domain_ids);

			spin_lock_irqsave(&domain->iommu_lock, flags);
			/* last IOMMU referencing this domain destroys it */
			if (--domain->iommu_count == 0) {
				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
					vm_domain_exit(domain);
				else
					domain_exit(domain);
			}
			spin_unlock_irqrestore(&domain->iommu_lock, flags);
		}
	}

	if (iommu->gcmd & DMA_GCMD_TE)
		iommu_disable_translation(iommu);

	if (iommu->irq) {
		irq_set_handler_data(iommu->irq, NULL);
		/* This will mask the irq */
		free_irq(iommu->irq, iommu);
		destroy_irq(iommu->irq);
	}

	kfree(iommu->domains);
	kfree(iommu->domain_ids);

	g_iommus[iommu->seq_id] = NULL;

	/* if all iommus are freed, free g_iommus */
	for (i = 0; i < g_num_of_iommus; i++) {
		if (g_iommus[i])
			break;
	}

	if (i == g_num_of_iommus)
		kfree(g_iommus);

	/* free context mapping */
	free_context_table(iommu);
}
1327
1328 static struct dmar_domain *alloc_domain(void)
1329 {
1330         struct dmar_domain *domain;
1331
1332         domain = alloc_domain_mem();
1333         if (!domain)
1334                 return NULL;
1335
1336         domain->nid = -1;
1337         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1338         domain->flags = 0;
1339
1340         return domain;
1341 }
1342
/*
 * Reserve a free hardware domain id on @iommu for @domain, record the
 * attachment in both directions (domain id bitmap / iommu_bmp), and set
 * domain->id.  Returns 0 or -ENOMEM if the IOMMU is out of domain ids.
 */
static int iommu_attach_domain(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
{
	int num;
	unsigned long ndomains;
	unsigned long flags;

	ndomains = cap_ndoms(iommu->cap);

	spin_lock_irqsave(&iommu->lock, flags);

	num = find_first_zero_bit(iommu->domain_ids, ndomains);
	if (num >= ndomains) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		printk(KERN_ERR "IOMMU: no free domain ids\n");
		return -ENOMEM;
	}

	domain->id = num;
	set_bit(num, iommu->domain_ids);
	set_bit(iommu->seq_id, &domain->iommu_bmp);
	iommu->domains[num] = domain;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}
1369
1370 static void iommu_detach_domain(struct dmar_domain *domain,
1371                                 struct intel_iommu *iommu)
1372 {
1373         unsigned long flags;
1374         int num, ndomains;
1375         int found = 0;
1376
1377         spin_lock_irqsave(&iommu->lock, flags);
1378         ndomains = cap_ndoms(iommu->cap);
1379         for_each_set_bit(num, iommu->domain_ids, ndomains) {
1380                 if (iommu->domains[num] == domain) {
1381                         found = 1;
1382                         break;
1383                 }
1384         }
1385
1386         if (found) {
1387                 clear_bit(num, iommu->domain_ids);
1388                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
1389                 iommu->domains[num] = NULL;
1390         }
1391         spin_unlock_irqrestore(&iommu->lock, flags);
1392 }
1393
1394 static struct iova_domain reserved_iova_list;
1395 static struct lock_class_key reserved_rbtree_key;
1396
1397 static int dmar_init_reserved_ranges(void)
1398 {
1399         struct pci_dev *pdev = NULL;
1400         struct iova *iova;
1401         int i;
1402
1403         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1404
1405         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1406                 &reserved_rbtree_key);
1407
1408         /* IOAPIC ranges shouldn't be accessed by DMA */
1409         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1410                 IOVA_PFN(IOAPIC_RANGE_END));
1411         if (!iova) {
1412                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1413                 return -ENODEV;
1414         }
1415
1416         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1417         for_each_pci_dev(pdev) {
1418                 struct resource *r;
1419
1420                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1421                         r = &pdev->resource[i];
1422                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1423                                 continue;
1424                         iova = reserve_iova(&reserved_iova_list,
1425                                             IOVA_PFN(r->start),
1426                                             IOVA_PFN(r->end));
1427                         if (!iova) {
1428                                 printk(KERN_ERR "Reserve iova failed\n");
1429                                 return -ENODEV;
1430                         }
1431                 }
1432         }
1433         return 0;
1434 }
1435
/* Copy the globally reserved IOVA ranges into @domain's iova allocator. */
static void domain_reserve_special_ranges(struct dmar_domain *domain)
{
	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
}
1440
/*
 * Round a guest address width up to the next value of the form
 * 12 + 9*n (a whole number of page-table levels), capped at 64 bits.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = (rem == 0) ? gaw : gaw + 9 - rem;

	return (agaw > 64) ? 64 : agaw;
}
1454
/*
 * Initialise a newly attached domain: set up its iova allocator and
 * reserved ranges, compute its (adjusted) address width from the
 * hardware-supported AGAW values, cache coherency/snooping/superpage
 * capabilities, and allocate the top-level page directory.
 * Returns 0, -ENODEV if no supported AGAW fits, or -ENOMEM.
 */
static int domain_init(struct dmar_domain *domain, int guest_width)
{
	struct intel_iommu *iommu;
	int adjust_width, agaw;
	unsigned long sagaw;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	iommu = domain_get_iommu(domain);
	/* clamp guest width to what the hardware can address */
	if (guest_width > cap_mgaw(iommu->cap))
		guest_width = cap_mgaw(iommu->cap);
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	agaw = width_to_agaw(adjust_width);
	sagaw = cap_sagaw(iommu->cap);
	if (!test_bit(agaw, &sagaw)) {
		/* hardware doesn't support it, choose a bigger one */
		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
		agaw = find_next_bit(&sagaw, 5, agaw);
		if (agaw >= 5)
			return -ENODEV;
	}
	domain->agaw = agaw;
	INIT_LIST_HEAD(&domain->devices);

	if (ecap_coherent(iommu->ecap))
		domain->iommu_coherency = 1;
	else
		domain->iommu_coherency = 0;

	if (ecap_sc_support(iommu->ecap))
		domain->iommu_snooping = 1;
	else
		domain->iommu_snooping = 0;

	/* highest superpage level (0 = none) supported by the hardware */
	domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
	domain->iommu_count = 1;
	domain->nid = iommu->node;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
	return 0;
}
1505
/*
 * Destroy a domain: detach its devices, release its iova allocator,
 * clear and free its page tables, detach it from every IOMMU it is
 * attached to, and free the domain itself.
 */
static void domain_exit(struct dmar_domain *domain)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;

	/* Domain 0 is reserved, so dont process it */
	if (!domain)
		return;

	/* Flush any lazy unmaps that may reference this domain */
	if (!intel_iommu_strict)
		flush_unmaps_timeout(0);

	domain_remove_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	for_each_active_iommu(iommu, drhd)
		if (test_bit(iommu->seq_id, &domain->iommu_bmp))
			iommu_detach_domain(domain, iommu);

	free_domain_mem(domain);
}
1535
/*
 * Install a context entry mapping device (segment:bus:devfn) to @domain
 * on its IOMMU, with translation type @translation (multi-level or
 * pass-through; may be upgraded to device-IOTLB).  For VM / static
 * identity domains a per-IOMMU domain id is chosen here.  Flushes the
 * context and IOTLB caches as required and updates the domain's
 * per-IOMMU reference bookkeeping.  Returns 0 on success (including the
 * already-mapped case) or a negative errno.
 */
static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
				 u8 bus, u8 devfn, int translation)
{
	struct context_entry *context;
	unsigned long flags;
	struct intel_iommu *iommu;
	struct dma_pte *pgd;
	unsigned long num;
	unsigned long ndomains;
	int id;
	int agaw;
	struct device_domain_info *info = NULL;

	pr_debug("Set context mapping for %02x:%02x.%d\n",
		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));

	BUG_ON(!domain->pgd);
	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
	       translation != CONTEXT_TT_MULTI_LEVEL);

	iommu = device_to_iommu(segment, bus, devfn);
	if (!iommu)
		return -ENODEV;

	context = device_to_context_entry(iommu, bus, devfn);
	if (!context)
		return -ENOMEM;
	spin_lock_irqsave(&iommu->lock, flags);
	/* already mapped by someone else: nothing to do */
	if (context_present(context)) {
		spin_unlock_irqrestore(&iommu->lock, flags);
		return 0;
	}

	id = domain->id;
	pgd = domain->pgd;

	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
		int found = 0;

		/* find an available domain id for this device in iommu */
		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(num, iommu->domain_ids, ndomains) {
			if (iommu->domains[num] == domain) {
				id = num;
				found = 1;
				break;
			}
		}

		if (found == 0) {
			num = find_first_zero_bit(iommu->domain_ids, ndomains);
			if (num >= ndomains) {
				spin_unlock_irqrestore(&iommu->lock, flags);
				printk(KERN_ERR "IOMMU: no free domain ids\n");
				return -EFAULT;
			}

			set_bit(num, iommu->domain_ids);
			iommu->domains[num] = domain;
			id = num;
		}

		/* Skip top levels of page tables for
		 * iommu which has less agaw than default.
		 * Unnecessary for PT mode.
		 */
		if (translation != CONTEXT_TT_PASS_THROUGH) {
			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
				pgd = phys_to_virt(dma_pte_addr(pgd));
				if (!dma_pte_present(pgd)) {
					spin_unlock_irqrestore(&iommu->lock, flags);
					return -ENOMEM;
				}
			}
		}
	}

	context_set_domain_id(context, id);

	/* upgrade to device-IOTLB translation if the device supports ATS */
	if (translation != CONTEXT_TT_PASS_THROUGH) {
		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
		translation = info ? CONTEXT_TT_DEV_IOTLB :
				     CONTEXT_TT_MULTI_LEVEL;
	}
	/*
	 * In pass through mode, AW must be programmed to indicate the largest
	 * AGAW value supported by hardware. And ASR is ignored by hardware.
	 */
	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
		context_set_address_width(context, iommu->msagaw);
	else {
		context_set_address_root(context, virt_to_phys(pgd));
		context_set_address_width(context, iommu->agaw);
	}

	context_set_translation_type(context, translation);
	context_set_fault_enable(context);
	context_set_present(context);
	domain_flush_cache(domain, context, sizeof(*context));

	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entry we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
	if (cap_caching_mode(iommu->cap)) {
		iommu->flush.flush_context(iommu, 0,
					   (((u16)bus) << 8) | devfn,
					   DMA_CCMD_MASK_NOBIT,
					   DMA_CCMD_DEVICE_INVL);
		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
	} else {
		iommu_flush_write_buffer(iommu);
	}
	iommu_enable_dev_iotlb(info);
	spin_unlock_irqrestore(&iommu->lock, flags);

	/* first context on this IOMMU takes a reference for the domain */
	spin_lock_irqsave(&domain->iommu_lock, flags);
	if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
		domain->iommu_count++;
		if (domain->iommu_count == 1)
			domain->nid = iommu->node;
		domain_update_iommu_cap(domain);
	}
	spin_unlock_irqrestore(&domain->iommu_lock, flags);
	return 0;
}
1665
/*
 * Map @pdev (and, if it sits behind a PCIe-to-PCI(-X) bridge, every
 * bridge on the path plus the bridge's secondary bus/devfn alias) into
 * @domain, since requests from legacy buses carry the bridge's source-id.
 * Returns 0 on success or the first failing errno.
 */
static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
			int translation)
{
	int ret;
	struct pci_dev *tmp, *parent;

	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
					 pdev->bus->number, pdev->devfn,
					 translation);
	if (ret)
		return ret;

	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return 0;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = domain_context_mapping_one(domain,
						 pci_domain_nr(parent->bus),
						 parent->bus->number,
						 parent->devfn, translation);
		if (ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
		return domain_context_mapping_one(domain,
					pci_domain_nr(tmp->subordinate),
					tmp->subordinate->number, 0,
					translation);
	else /* this is a legacy PCI bridge */
		return domain_context_mapping_one(domain,
						  pci_domain_nr(tmp->bus),
						  tmp->bus->number,
						  tmp->devfn,
						  translation);
}
1706
/*
 * Check whether @pdev — and every bridge on the path to its upstream
 * PCIe bridge, mirroring domain_context_mapping() — already has a
 * present context entry.  Returns non-zero if fully mapped, 0 if any
 * entry is missing, or -ENODEV if no IOMMU covers the device.
 */
static int domain_context_mapped(struct pci_dev *pdev)
{
	int ret;
	struct pci_dev *tmp, *parent;
	struct intel_iommu *iommu;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
	if (!ret)
		return ret;
	/* dependent device mapping */
	tmp = pci_find_upstream_pcie_bridge(pdev);
	if (!tmp)
		return ret;
	/* Secondary interface's bus number and devfn 0 */
	parent = pdev->bus->self;
	while (parent != tmp) {
		ret = device_context_mapped(iommu, parent->bus->number,
					    parent->devfn);
		if (!ret)
			return ret;
		parent = parent->bus->self;
	}
	if (pci_is_pcie(tmp))
		return device_context_mapped(iommu, tmp->subordinate->number,
					     0);
	else
		return device_context_mapped(iommu, tmp->bus->number,
					     tmp->devfn);
}
1741
1742 /* Returns a number of VTD pages, but aligned to MM page size */
1743 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1744                                             size_t size)
1745 {
1746         host_addr &= ~PAGE_MASK;
1747         return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1748 }
1749
1750 /* Return largest possible superpage level for a given mapping */
1751 static inline int hardware_largepage_caps(struct dmar_domain *domain,
1752                                           unsigned long iov_pfn,
1753                                           unsigned long phy_pfn,
1754                                           unsigned long pages)
1755 {
1756         int support, level = 1;
1757         unsigned long pfnmerge;
1758
1759         support = domain->iommu_superpage;
1760
1761         /* To use a large page, the virtual *and* physical addresses
1762            must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1763            of them will mean we have to use smaller pages. So just
1764            merge them and check both at once. */
1765         pfnmerge = iov_pfn | phy_pfn;
1766
1767         while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1768                 pages >>= VTD_STRIDE_SHIFT;
1769                 if (!pages)
1770                         break;
1771                 pfnmerge >>= VTD_STRIDE_SHIFT;
1772                 level++;
1773                 support--;
1774         }
1775         return level;
1776 }
1777
/*
 * Populate page-table entries for @nr_pages VT-d pages starting at
 * @iov_pfn.  Exactly one of @sg / @phys_pfn supplies the physical side:
 * with @sg the scatterlist entries are consumed in turn (and their
 * dma_address/dma_length fields are filled in here); otherwise a single
 * contiguous run starting at @phys_pfn is mapped.  @prot is masked down
 * to DMA_PTE_READ/WRITE/SNP.  Uses superpages when
 * hardware_largepage_caps() says the alignment and残 run length allow it.
 * Returns 0 on success, -EINVAL for a protection mask with neither read
 * nor write, -ENOMEM if a page-table page cannot be allocated.
 */
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
{
	struct dma_pte *first_pte = NULL, *pte = NULL;
	phys_addr_t uninitialized_var(pteval);
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
	unsigned long sg_res;
	unsigned int largepage_lvl = 0;
	unsigned long lvl_pages = 0;

	/* The whole range must fit inside the domain's address width */
	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);

	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
		return -EINVAL;

	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;

	/* sg_res counts VT-d pages remaining in the current physical run.
	   For the non-sg case it is primed to cover the whole request (+1
	   so it never hits zero and triggers an sg dereference). */
	if (sg)
		sg_res = 0;
	else {
		sg_res = nr_pages + 1;
		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
	}

	while (nr_pages > 0) {
		uint64_t tmp;

		if (!sg_res) {
			/* Advance to the next scatterlist entry and record
			   its DMA address/length for the caller. */
			sg_res = aligned_nrpages(sg->offset, sg->length);
			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
			sg->dma_length = sg->length;
			pteval = page_to_phys(sg_page(sg)) | prot;
			phys_pfn = pteval >> VTD_PAGE_SHIFT;
		}

		if (!pte) {
			largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);

			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
			if (!pte)
				return -ENOMEM;
			/* Mark (or clear) the superpage bit to match the
			   level chosen above */
			if (largepage_lvl > 1)
				pteval |= DMA_PTE_LARGE_PAGE;
			else
				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;

		}
		/* We don't need lock here, nobody else
		 * touches the iova range
		 */
		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
		if (tmp) {
			/* PTE was already non-zero: the IOVA range was not
			   free.  Report it (rate-limited dumps) but carry on. */
			static int dumps = 5;
			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
			       iov_pfn, tmp, (unsigned long long)pteval);
			if (dumps) {
				dumps--;
				debug_dma_dump_mappings(NULL);
			}
			WARN_ON(1);
		}

		/* One PTE at largepage_lvl covers this many VT-d pages */
		lvl_pages = lvl_to_nr_pages(largepage_lvl);

		BUG_ON(nr_pages < lvl_pages);
		BUG_ON(sg_res < lvl_pages);

		nr_pages -= lvl_pages;
		iov_pfn += lvl_pages;
		phys_pfn += lvl_pages;
		pteval += lvl_pages * VTD_PAGE_SIZE;
		sg_res -= lvl_pages;

		/* If the next PTE would be the first in a new page, then we
		   need to flush the cache on the entries we've just written.
		   And then we'll need to recalculate 'pte', so clear it and
		   let it get set again in the if (!pte) block above.

		   If we're done (!nr_pages) we need to flush the cache too.

		   Also if we've been setting superpages, we may need to
		   recalculate 'pte' and switch back to smaller pages for the
		   end of the mapping, if the trailing size is not enough to
		   use another superpage (i.e. sg_res < lvl_pages). */
		pte++;
		if (!nr_pages || first_pte_in_page(pte) ||
		    (largepage_lvl > 1 && sg_res < lvl_pages)) {
			domain_flush_cache(domain, first_pte,
					   (void *)pte - (void *)first_pte);
			pte = NULL;
		}

		if (!sg_res && nr_pages)
			sg = sg_next(sg);
	}
	return 0;
}
1877
/* Map a scatterlist into @domain at @iov_pfn; thin wrapper around
 * __domain_mapping() with the contiguous-range argument unused. */
static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				    struct scatterlist *sg, unsigned long nr_pages,
				    int prot)
{
	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
}
1884
/* Map a physically contiguous run of @nr_pages VT-d pages starting at
 * @phys_pfn to @iov_pfn; thin wrapper around __domain_mapping() with no
 * scatterlist. */
static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
				     unsigned long phys_pfn, unsigned long nr_pages,
				     int prot)
{
	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
}
1891
/*
 * Tear down the context entry for (@bus, @devfn) on @iommu, then issue a
 * global context-cache and IOTLB flush so the hardware drops any cached
 * translations for it.  A NULL @iommu is tolerated as a no-op.
 */
static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu)
		return;

	clear_context_table(iommu, bus, devfn);
	iommu->flush.flush_context(iommu, 0, 0, 0,
					   DMA_CCMD_GLOBAL_INVL);
	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
}
1902
/*
 * Unlink every device_domain_info attached to @domain, detaching each
 * device from its IOMMU and freeing the info structure.
 *
 * The device_domain_lock is dropped for each iteration's detach/free work
 * (presumably because those steps must not run under the spinlock — TODO
 * confirm) and reacquired before re-checking the list head, so the loop
 * is safe against concurrent list mutation in between.
 */
static void domain_remove_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	unsigned long flags;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&device_domain_lock, flags);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		/* Clear the device's back-pointer before dropping the lock */
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;
		spin_unlock_irqrestore(&device_domain_lock, flags);

		iommu_disable_dev_iotlb(info);
		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		free_devinfo_mem(info);

		spin_lock_irqsave(&device_domain_lock, flags);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags);
}
1928
1929 /*
1930  * find_domain
1931  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1932  */
1933 static struct dmar_domain *
1934 find_domain(struct pci_dev *pdev)
1935 {
1936         struct device_domain_info *info;
1937
1938         /* No lock here, assumes no domain exit in normal case */
1939         info = pdev->dev.archdata.iommu;
1940         if (info)
1941                 return info->domain;
1942         return NULL;
1943 }
1944
1945 /* domain is initialized */
1946 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1947 {
1948         struct dmar_domain *domain, *found = NULL;
1949         struct intel_iommu *iommu;
1950         struct dmar_drhd_unit *drhd;
1951         struct device_domain_info *info, *tmp;
1952         struct pci_dev *dev_tmp;
1953         unsigned long flags;
1954         int bus = 0, devfn = 0;
1955         int segment;
1956         int ret;
1957
1958         domain = find_domain(pdev);
1959         if (domain)
1960                 return domain;
1961
1962         segment = pci_domain_nr(pdev->bus);
1963
1964         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1965         if (dev_tmp) {
1966                 if (pci_is_pcie(dev_tmp)) {
1967                         bus = dev_tmp->subordinate->number;
1968                         devfn = 0;
1969                 } else {
1970                         bus = dev_tmp->bus->number;
1971                         devfn = dev_tmp->devfn;
1972                 }
1973                 spin_lock_irqsave(&device_domain_lock, flags);
1974                 list_for_each_entry(info, &device_domain_list, global) {
1975                         if (info->segment == segment &&
1976                             info->bus == bus && info->devfn == devfn) {
1977                                 found = info->domain;
1978                                 break;
1979                         }
1980                 }
1981                 spin_unlock_irqrestore(&device_domain_lock, flags);
1982                 /* pcie-pci bridge already has a domain, uses it */
1983                 if (found) {
1984                         domain = found;
1985                         goto found_domain;
1986                 }
1987         }
1988
1989         domain = alloc_domain();
1990         if (!domain)
1991                 goto error;
1992
1993         /* Allocate new domain for the device */
1994         drhd = dmar_find_matched_drhd_unit(pdev);
1995         if (!drhd) {
1996                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1997                         pci_name(pdev));
1998                 return NULL;
1999         }
2000         iommu = drhd->iommu;
2001
2002         ret = iommu_attach_domain(domain, iommu);
2003         if (ret) {
2004                 free_domain_mem(domain);
2005                 goto error;
2006         }
2007
2008         if (domain_init(domain, gaw)) {
2009                 domain_exit(domain);
2010                 goto error;
2011         }
2012
2013         /* register pcie-to-pci device */
2014         if (dev_tmp) {
2015                 info = alloc_devinfo_mem();
2016                 if (!info) {
2017                         domain_exit(domain);
2018                         goto error;
2019                 }
2020                 info->segment = segment;
2021                 info->bus = bus;
2022                 info->devfn = devfn;
2023                 info->dev = NULL;
2024                 info->domain = domain;
2025                 /* This domain is shared by devices under p2p bridge */
2026                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2027
2028                 /* pcie-to-pci bridge already has a domain, uses it */
2029                 found = NULL;
2030                 spin_lock_irqsave(&device_domain_lock, flags);
2031                 list_for_each_entry(tmp, &device_domain_list, global) {
2032                         if (tmp->segment == segment &&
2033                             tmp->bus == bus && tmp->devfn == devfn) {
2034                                 found = tmp->domain;
2035                                 break;
2036                         }
2037                 }
2038                 if (found) {
2039                         spin_unlock_irqrestore(&device_domain_lock, flags);
2040                         free_devinfo_mem(info);
2041                         domain_exit(domain);
2042                         domain = found;
2043                 } else {
2044                         list_add(&info->link, &domain->devices);
2045                         list_add(&info->global, &device_domain_list);
2046                         spin_unlock_irqrestore(&device_domain_lock, flags);
2047                 }
2048         }
2049
2050 found_domain:
2051         info = alloc_devinfo_mem();
2052         if (!info)
2053                 goto error;
2054         info->segment = segment;
2055         info->bus = pdev->bus->number;
2056         info->devfn = pdev->devfn;
2057         info->dev = pdev;
2058         info->domain = domain;
2059         spin_lock_irqsave(&device_domain_lock, flags);
2060         /* somebody is fast */
2061         found = find_domain(pdev);
2062         if (found != NULL) {
2063                 spin_unlock_irqrestore(&device_domain_lock, flags);
2064                 if (found != domain) {
2065                         domain_exit(domain);
2066                         domain = found;
2067                 }
2068                 free_devinfo_mem(info);
2069                 return domain;
2070         }
2071         list_add(&info->link, &domain->devices);
2072         list_add(&info->global, &device_domain_list);
2073         pdev->dev.archdata.iommu = info;
2074         spin_unlock_irqrestore(&device_domain_lock, flags);
2075         return domain;
2076 error:
2077         /* recheck it here, maybe others set it */
2078         return find_domain(pdev);
2079 }
2080
/* Bitmask selecting which devices get static identity (1:1) mappings. */
static int iommu_identity_mapping;
#define IDENTMAP_ALL            1	/* identity-map all suitable devices */
#define IDENTMAP_GFX            2	/* identity-map graphics devices */
#define IDENTMAP_AZALIA         4	/* identity-map Azalia (HD audio) devices */
2085
/*
 * Create a 1:1 (virtual == physical) mapping in @domain for the byte
 * range [@start, @end].  Reserves the corresponding IOVA range, clears
 * any existing PTEs there, then installs read/write identity mappings.
 * Returns 0 on success or a negative errno.
 */
static int iommu_domain_identity_map(struct dmar_domain *domain,
				     unsigned long long start,
				     unsigned long long end)
{
	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;

	/* Claim the IOVA range so the allocator never hands it out */
	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
			  dma_to_mm_pfn(last_vpfn))) {
		printk(KERN_ERR "IOMMU: reserve iova failed\n");
		return -ENOMEM;
	}

	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
		 start, end, domain->id);
	/*
	 * RMRR range might have overlap with physical memory range,
	 * clear it first
	 */
	dma_pte_clear_range(domain, first_vpfn, last_vpfn);

	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
				  last_vpfn - first_vpfn + 1,
				  DMA_PTE_READ|DMA_PTE_WRITE);
}
2111
/*
 * Set up an identity mapping of [@start, @end] for @pdev (used for RMRR
 * and ISA/floppy regions), including the context entry for the device.
 * Sanity-checks the range against BIOS-provided garbage (end < start, or
 * beyond the domain's address width) and warns loudly when found.
 * Returns 0 on success or a negative errno.
 *
 * NOTE(review): the error path calls domain_exit() on whatever domain
 * get_domain_for_dev() returned — if that is si_domain (shared static
 * identity domain), tearing it down here looks suspicious; confirm.
 */
static int iommu_prepare_identity_map(struct pci_dev *pdev,
				      unsigned long long start,
				      unsigned long long end)
{
	struct dmar_domain *domain;
	int ret;

	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
	if (!domain)
		return -ENOMEM;

	/* For _hardware_ passthrough, don't bother. But for software
	   passthrough, we do it anyway -- it may indicate a memory
	   range which is reserved in E820, so which didn't get set
	   up to start with in si_domain */
	if (domain == si_domain && hw_pass_through) {
		printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
		       pci_name(pdev), start, end);
		return 0;
	}

	printk(KERN_INFO
	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
	       pci_name(pdev), start, end);

	/* Reject inverted ranges reported by broken BIOS RMRR entries */
	if (end < start) {
		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
			dmi_get_system_info(DMI_BIOS_VENDOR),
			dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		ret = -EIO;
		goto error;
	}

	/* Reject ranges beyond what the domain's page tables can address */
	if (end >> agaw_to_width(domain->agaw)) {
		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     agaw_to_width(domain->agaw),
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		ret = -EIO;
		goto error;
	}

	ret = iommu_domain_identity_map(domain, start, end);
	if (ret)
		goto error;

	/* context entry init */
	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
	if (ret)
		goto error;

	return 0;

 error:
	domain_exit(domain);
	return ret;
}
2173
2174 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2175         struct pci_dev *pdev)
2176 {
2177         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2178                 return 0;
2179         return iommu_prepare_identity_map(pdev, rmrr->base_address,
2180                 rmrr->end_address);
2181 }
2182
#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
/*
 * Workaround for legacy floppy DMA: identity-map the first 16MiB for the
 * ISA/LPC bridge so ISA DMA (which cannot be remapped) keeps working.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	/* Only needed if an ISA bridge is actually present */
	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
		       "floppy might not work\n");

}
#else
/* No floppy workaround configured: nothing to do. */
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2207
2208 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2209
/*
 * Allocate and initialize si_domain, the single shared domain used for
 * static identity (1:1) mappings, attaching it to every active IOMMU.
 * With hardware passthrough (@hw != 0) no page tables are needed; for
 * software passthrough, every online node's memory ranges are
 * identity-mapped into the domain.  Returns 0 on success, -EFAULT on
 * setup failure, or the error from iommu_domain_identity_map().
 */
static int __init si_domain_init(int hw)
{
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	int nid, ret = 0;

	si_domain = alloc_domain();
	if (!si_domain)
		return -EFAULT;

	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);

	/* The identity domain must be usable from every IOMMU */
	for_each_active_iommu(iommu, drhd) {
		ret = iommu_attach_domain(si_domain, iommu);
		if (ret) {
			domain_exit(si_domain);
			return -EFAULT;
		}
	}

	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(si_domain);
		return -EFAULT;
	}

	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;

	/* Hardware passthrough needs no page tables at all */
	if (hw)
		return 0;

	/* Software passthrough: identity-map all present memory */
	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		int i;

		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
			ret = iommu_domain_identity_map(si_domain,
					PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
			if (ret)
				return ret;
		}
	}

	return 0;
}
2254
2255 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2256                                           struct pci_dev *pdev);
2257 static int identity_mapping(struct pci_dev *pdev)
2258 {
2259         struct device_domain_info *info;
2260
2261         if (likely(!iommu_identity_mapping))
2262                 return 0;
2263
2264         info = pdev->dev.archdata.iommu;
2265         if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2266                 return (info->domain == si_domain);
2267
2268         return 0;
2269 }
2270
2271 static int domain_add_dev_info(struct dmar_domain *domain,
2272                                struct pci_dev *pdev,
2273                                int translation)
2274 {
2275         struct device_domain_info *info;
2276         unsigned long flags;
2277         int ret;
2278
2279         info = alloc_devinfo_mem();
2280         if (!info)
2281                 return -ENOMEM;
2282
2283         ret = domain_context_mapping(domain, pdev, translation);
2284         if (ret) {
2285                 free_devinfo_mem(info);
2286                 return ret;
2287         }
2288
2289         info->segment = pci_domain_nr(pdev->bus);
2290         info->bus = pdev->bus->number;
2291         info->devfn = pdev->devfn;
2292         info->dev = pdev;
2293         info->domain = domain;
2294
2295         spin_lock_irqsave(&device_domain_lock, flags);
2296         list_add(&info->link, &domain->devices);
2297         list_add(&info->global, &device_domain_list);
2298         pdev->dev.archdata.iommu = info;
2299         spin_unlock_irqrestore(&device_domain_lock, flags);
2300
2301         return 0;
2302 }
2303
/*
 * Decide whether @pdev should get a static identity mapping.
 * @startup: non-zero during boot-time setup, when DMA masks are not yet
 * known and candidates are optimistically accepted.
 * Returns 1 to identity-map, 0 to use DMA remapping.
 */
static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
{
	/* Per-class quirk modes take precedence over IDENTMAP_ALL */
	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
		return 1;

	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
		return 1;

	if (!(iommu_identity_mapping & IDENTMAP_ALL))
		return 0;

	/*
	 * We want to start off with all devices in the 1:1 domain, and
	 * take them out later if we find they can't access all of memory.
	 *
	 * However, we can't do this for PCI devices behind bridges,
	 * because all PCI devices behind the same bridge will end up
	 * with the same source-id on their transactions.
	 *
	 * Practically speaking, we can't change things around for these
	 * devices at run-time, because we can't be sure there'll be no
	 * DMA transactions in flight for any of their siblings.
	 *
	 * So PCI devices (unless they're on the root bus) as well as
	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
	 * the 1:1 domain, just in _case_ one of their siblings turns out
	 * not to be able to map all of memory.
	 */
	if (!pci_is_pcie(pdev)) {
		if (!pci_is_root_bus(pdev->bus))
			return 0;
		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
			return 0;
	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
		return 0;

	/*
	 * At boot time, we don't yet know if devices will be 64-bit capable.
	 * Assume that they will -- if they turn out not to be, then we can
	 * take them out of the 1:1 domain later.
	 */
	if (!startup) {
		/*
		 * If the device's dma_mask is less than the system's memory
		 * size then this is not a candidate for identity mapping.
		 */
		u64 dma_mask = pdev->dma_mask;

		/* Use the stricter of streaming and coherent masks */
		if (pdev->dev.coherent_dma_mask &&
		    pdev->dev.coherent_dma_mask < dma_mask)
			dma_mask = pdev->dev.coherent_dma_mask;

		return dma_mask >= dma_get_required_mask(&pdev->dev);
	}

	return 1;
}
2361
/*
 * Boot-time setup of the static identity mapping: initialize si_domain,
 * then attach every eligible PCI device to it.  @hw selects hardware
 * passthrough context entries versus software multi-level page tables.
 * Returns 0 on success, -EFAULT if si_domain setup fails, or the error
 * from domain_add_dev_info().
 */
static int __init iommu_prepare_static_identity_mapping(int hw)
{
	struct pci_dev *pdev = NULL;
	int ret;

	ret = si_domain_init(hw);
	if (ret)
		return -EFAULT;

	for_each_pci_dev(pdev) {
		/* Skip Host/PCI Bridge devices */
		if (IS_BRIDGE_HOST_DEVICE(pdev))
			continue;
		if (iommu_should_identity_map(pdev, 1)) {
			printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
			       hw ? "hardware" : "software", pci_name(pdev));

			/* Hardware passthrough bypasses page tables entirely;
			   software passthrough uses real translations */
			ret = domain_add_dev_info(si_domain, pdev,
						     hw ? CONTEXT_TT_PASS_THROUGH :
						     CONTEXT_TT_MULTI_LEVEL);
			if (ret)
				return ret;
		}
	}

	return 0;
}
2389
2390 static int __init init_dmars(void)
2391 {
2392         struct dmar_drhd_unit *drhd;
2393         struct dmar_rmrr_unit *rmrr;
2394         struct pci_dev *pdev;
2395         struct intel_iommu *iommu;
2396         int i, ret;
2397
2398         /*
2399          * for each drhd
2400          *    allocate root
2401          *    initialize and program root entry to not present
2402          * endfor
2403          */
2404         for_each_drhd_unit(drhd) {
2405                 g_num_of_iommus++;
2406                 /*
2407                  * lock not needed as this is only incremented in the single
2408                  * threaded kernel __init code path all other access are read
2409                  * only
2410                  */
2411         }
2412
2413         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2414                         GFP_KERNEL);
2415         if (!g_iommus) {
2416                 printk(KERN_ERR "Allocating global iommu array failed\n");
2417                 ret = -ENOMEM;
2418                 goto error;
2419         }
2420
2421         deferred_flush = kzalloc(g_num_of_iommus *
2422                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2423         if (!deferred_flush) {
2424                 ret = -ENOMEM;
2425                 goto error;
2426         }
2427
2428         for_each_drhd_unit(drhd) {
2429                 if (drhd->ignored)
2430                         continue;
2431
2432                 iommu = drhd->iommu;
2433                 g_iommus[iommu->seq_id] = iommu;
2434
2435                 ret = iommu_init_domains(iommu);
2436                 if (ret)
2437                         goto error;
2438
2439                 /*
2440                  * TBD:
2441                  * we could share the same root & context tables
2442                  * among all IOMMU's. Need to Split it later.
2443                  */
2444                 ret = iommu_alloc_root_entry(iommu);
2445                 if (ret) {
2446                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2447                         goto error;
2448                 }
2449                 if (!ecap_pass_through(iommu->ecap))
2450                         hw_pass_through = 0;
2451         }
2452
2453         /*
2454          * Start from the sane iommu hardware state.
2455          */
2456         for_each_drhd_unit(drhd) {
2457                 if (drhd->ignored)
2458                         continue;
2459
2460                 iommu = drhd->iommu;
2461
2462                 /*
2463                  * If the queued invalidation is already initialized by us
2464                  * (for example, while enabling interrupt-remapping) then
2465                  * we got the things already rolling from a sane state.
2466                  */
2467                 if (iommu->qi)
2468                         continue;
2469
2470                 /*
2471                  * Clear any previous faults.
2472                  */
2473                 dmar_fault(-1, iommu);
2474                 /*
2475                  * Disable queued invalidation if supported and already enabled
2476                  * before OS handover.
2477                  */
2478                 dmar_disable_qi(iommu);
2479         }
2480
2481         for_each_drhd_unit(drhd) {
2482                 if (drhd->ignored)
2483                         continue;
2484
2485                 iommu = drhd->iommu;
2486
2487                 if (dmar_enable_qi(iommu)) {
2488                         /*
2489                          * Queued Invalidate not enabled, use Register Based
2490                          * Invalidate
2491                          */
2492                         iommu->flush.flush_context = __iommu_flush_context;
2493                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2494                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2495                                "invalidation\n",
2496                                 iommu->seq_id,
2497                                (unsigned long long)drhd->reg_base_addr);
2498                 } else {
2499                         iommu->flush.flush_context = qi_flush_context;
2500                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2501                         printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2502                                "invalidation\n",
2503                                 iommu->seq_id,
2504                                (unsigned long long)drhd->reg_base_addr);
2505                 }
2506         }
2507
2508         if (iommu_pass_through)
2509                 iommu_identity_mapping |= IDENTMAP_ALL;
2510
2511 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2512         iommu_identity_mapping |= IDENTMAP_GFX;
2513 #endif
2514
2515         check_tylersburg_isoch();
2516
2517         /*
2518          * If pass through is not set or not enabled, setup context entries for
2519          * identity mappings for rmrr, gfx, and isa and may fall back to static
2520          * identity mapping if iommu_identity_mapping is set.
2521          */
2522         if (iommu_identity_mapping) {
2523                 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2524                 if (ret) {
2525                         printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2526                         goto error;
2527                 }
2528         }
2529         /*
2530          * For each rmrr
2531          *   for each dev attached to rmrr
2532          *   do
2533          *     locate drhd for dev, alloc domain for dev
2534          *     allocate free domain
2535          *     allocate page table entries for rmrr
2536          *     if context not allocated for bus
2537          *           allocate and init context
2538          *           set present in root table for this bus
2539          *     init context with domain, translation etc
2540          *    endfor
2541          * endfor
2542          */
2543         printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2544         for_each_rmrr_units(rmrr) {
2545                 for (i = 0; i < rmrr->devices_cnt; i++) {
2546                         pdev = rmrr->devices[i];
2547                         /*
2548                          * some BIOS lists non-exist devices in DMAR
2549                          * table.
2550                          */
2551                         if (!pdev)
2552                                 continue;
2553                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2554                         if (ret)
2555                                 printk(KERN_ERR
2556                                        "IOMMU: mapping reserved region failed\n");
2557                 }
2558         }
2559
2560         iommu_prepare_isa();
2561
2562         /*
2563          * for each drhd
2564          *   enable fault log
2565          *   global invalidate context cache
2566          *   global invalidate iotlb
2567          *   enable translation
2568          */
2569         for_each_drhd_unit(drhd) {
2570                 if (drhd->ignored) {
2571                         /*
2572                          * we always have to disable PMRs or DMA may fail on
2573                          * this device
2574                          */
2575                         if (force_on)
2576                                 iommu_disable_protect_mem_regions(drhd->iommu);
2577                         continue;
2578                 }
2579                 iommu = drhd->iommu;
2580
2581                 iommu_flush_write_buffer(iommu);
2582
2583                 ret = dmar_set_interrupt(iommu);
2584                 if (ret)
2585                         goto error;
2586
2587                 iommu_set_root_entry(iommu);
2588
2589                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2590                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2591
2592                 ret = iommu_enable_translation(iommu);
2593                 if (ret)
2594                         goto error;
2595
2596                 iommu_disable_protect_mem_regions(iommu);
2597         }
2598
2599         return 0;
2600 error:
2601         for_each_drhd_unit(drhd) {
2602                 if (drhd->ignored)
2603                         continue;
2604                 iommu = drhd->iommu;
2605                 free_iommu(iommu);
2606         }
2607         kfree(g_iommus);
2608         return ret;
2609 }
2610
2611 /* This takes a number of _MM_ pages, not VTD pages */
2612 static struct iova *intel_alloc_iova(struct device *dev,
2613                                      struct dmar_domain *domain,
2614                                      unsigned long nrpages, uint64_t dma_mask)
2615 {
2616         struct pci_dev *pdev = to_pci_dev(dev);
2617         struct iova *iova = NULL;
2618
2619         /* Restrict dma_mask to the width that the iommu can handle */
2620         dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2621
2622         if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2623                 /*
2624                  * First try to allocate an io virtual address in
2625                  * DMA_BIT_MASK(32) and if that fails then try allocating
2626                  * from higher range
2627                  */
2628                 iova = alloc_iova(&domain->iovad, nrpages,
2629                                   IOVA_PFN(DMA_BIT_MASK(32)), 1);
2630                 if (iova)
2631                         return iova;
2632         }
2633         iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2634         if (unlikely(!iova)) {
2635                 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2636                        nrpages, pci_name(pdev));
2637                 return NULL;
2638         }
2639
2640         return iova;
2641 }
2642
2643 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2644 {
2645         struct dmar_domain *domain;
2646         int ret;
2647
2648         domain = get_domain_for_dev(pdev,
2649                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2650         if (!domain) {
2651                 printk(KERN_ERR
2652                         "Allocating domain for %s failed", pci_name(pdev));
2653                 return NULL;
2654         }
2655
2656         /* make sure context mapping is ok */
2657         if (unlikely(!domain_context_mapped(pdev))) {
2658                 ret = domain_context_mapping(domain, pdev,
2659                                              CONTEXT_TT_MULTI_LEVEL);
2660                 if (ret) {
2661                         printk(KERN_ERR
2662                                 "Domain context map for %s failed",
2663                                 pci_name(pdev));
2664                         return NULL;
2665                 }
2666         }
2667
2668         return domain;
2669 }
2670
2671 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2672 {
2673         struct device_domain_info *info;
2674
2675         /* No lock here, assumes no domain exit in normal case */
2676         info = dev->dev.archdata.iommu;
2677         if (likely(info))
2678                 return info->domain;
2679
2680         return __get_valid_domain_for_dev(dev);
2681 }
2682
2683 static int iommu_dummy(struct pci_dev *pdev)
2684 {
2685         return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2686 }
2687
/* Check if the pdev needs to go through non-identity map and unmap process.*/
/*
 * Returns 1 when the device can DMA with raw physical addresses (no IOMMU
 * mapping needed), 0 when it must go through translated map/unmap.  May
 * move the device into or out of si_domain as a side effect.
 */
static int iommu_no_mapping(struct device *dev)
{
	struct pci_dev *pdev;
	int found;

	/* Only PCI devices are handled by this driver. */
	if (unlikely(dev->bus != &pci_bus_type))
		return 1;

	pdev = to_pci_dev(dev);
	/* Quirked devices bypass translation entirely. */
	if (iommu_dummy(pdev))
		return 1;

	/* Without identity mapping every device uses translated DMA. */
	if (!iommu_identity_mapping)
		return 0;

	found = identity_mapping(pdev);
	if (found) {
		if (iommu_should_identity_map(pdev, 0))
			return 1;
		else {
			/*
			 * 32 bit DMA is removed from si_domain and fall back
			 * to non-identity mapping.
			 */
			domain_remove_one_dev_info(si_domain, pdev);
			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
			       pci_name(pdev));
			return 0;
		}
	} else {
		/*
		 * In case of a detached 64 bit DMA device from vm, the device
		 * is put into si_domain for identity mapping.
		 */
		if (iommu_should_identity_map(pdev, 0)) {
			int ret;
			ret = domain_add_dev_info(si_domain, pdev,
						  hw_pass_through ?
						  CONTEXT_TT_PASS_THROUGH :
						  CONTEXT_TT_MULTI_LEVEL);
			if (!ret) {
				printk(KERN_INFO "64bit %s uses identity mapping\n",
				       pci_name(pdev));
				return 1;
			}
		}
	}

	return 0;
}
2739
/*
 * Map @size bytes at physical address @paddr for DMA by @hwdev and return
 * the DMA address the device should use.  Returns @paddr unchanged for
 * devices that bypass translation, or 0 on failure.
 */
static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
				     size_t size, int dir, u64 dma_mask)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	phys_addr_t start_paddr;
	struct iova *iova;
	int prot = 0;
	int ret;
	struct intel_iommu *iommu;
	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;

	BUG_ON(dir == DMA_NONE);

	/* Identity-mapped / dummy devices DMA with the physical address. */
	if (iommu_no_mapping(hwdev))
		return paddr;

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);
	/* From here on, 'size' is a page count, not a byte count. */
	size = aligned_nrpages(paddr, size);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
	if (!iova)
		goto error;

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;
	/*
	 * paddr - (paddr + size) might be partial page, we should map the whole
	 * page.  Note: if two part of one page are separately mapped, we
	 * might have two guest_addr mapping to the same host paddr, but this
	 * is not a big problem
	 */
	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
				 mm_to_dma_pfn(paddr_pfn), size, prot);
	if (ret)
		goto error;

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
	else
		iommu_flush_write_buffer(iommu);

	/* Returned handle preserves the sub-page offset of @paddr. */
	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
	start_paddr += paddr & ~PAGE_MASK;
	return start_paddr;

error:
	if (iova)
		__free_iova(&domain->iovad, iova);
	printk(KERN_ERR"Device %s request: %zx@%llx dir %d --- failed\n",
		pci_name(pdev), size, (unsigned long long)paddr, dir);
	return 0;
}
2805
2806 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2807                                  unsigned long offset, size_t size,
2808                                  enum dma_data_direction dir,
2809                                  struct dma_attrs *attrs)
2810 {
2811         return __intel_map_single(dev, page_to_phys(page) + offset, size,
2812                                   dir, to_pci_dev(dev)->dma_mask);
2813 }
2814
/*
 * Drain all pending deferred unmaps: invalidate the IOTLB (globally, or
 * per-range in caching mode) and free the queued IOVAs.  Called with
 * async_umap_flush_lock held — see flush_unmaps_timeout() and add_unmap().
 */
static void flush_unmaps(void)
{
	int i, j;

	timer_on = 0;

	/* just flush them all */
	for (i = 0; i < g_num_of_iommus; i++) {
		struct intel_iommu *iommu = g_iommus[i];
		if (!iommu)
			continue;

		/* Nothing queued for this iommu. */
		if (!deferred_flush[i].next)
			continue;

		/* In caching mode, global flushes turn emulation expensive */
		if (!cap_caching_mode(iommu->cap))
			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
					 DMA_TLB_GLOBAL_FLUSH);
		for (j = 0; j < deferred_flush[i].next; j++) {
			unsigned long mask;
			struct iova *iova = deferred_flush[i].iova[j];
			struct dmar_domain *domain = deferred_flush[i].domain[j];

			/* On real hardware multiple invalidations are expensive */
			if (cap_caching_mode(iommu->cap))
				iommu_flush_iotlb_psi(iommu, domain->id,
				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
			else {
				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
			}
			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
		}
		deferred_flush[i].next = 0;
	}

	list_size = 0;
}
2855
/* Timer callback: drain the deferred-unmap queues under the async lock. */
static void flush_unmaps_timeout(unsigned long data)
{
	unsigned long flags;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	flush_unmaps();
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
2864
/*
 * Queue @iova for deferred freeing/IOTLB flushing rather than flushing
 * synchronously.  Arms a 10ms timer so the batch drains even when DMA
 * traffic stops, and drains immediately once HIGH_WATER_MARK entries
 * are pending.
 */
static void add_unmap(struct dmar_domain *dom, struct iova *iova)
{
	unsigned long flags;
	int next, iommu_id;
	struct intel_iommu *iommu;

	spin_lock_irqsave(&async_umap_flush_lock, flags);
	/* Keep the per-iommu deferred_flush arrays from overflowing. */
	if (list_size == HIGH_WATER_MARK)
		flush_unmaps();

	iommu = domain_get_iommu(dom);
	iommu_id = iommu->seq_id;

	next = deferred_flush[iommu_id].next;
	deferred_flush[iommu_id].domain[next] = dom;
	deferred_flush[iommu_id].iova[next] = iova;
	deferred_flush[iommu_id].next++;

	if (!timer_on) {
		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
		timer_on = 1;
	}
	list_size++;
	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
}
2890
/*
 * dma_map_ops ->unmap_page hook: tear down the translation for @dev_addr,
 * free the page tables, and either flush the IOTLB now (strict mode) or
 * queue the IOVA for batched release.
 */
static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
			     size_t size, enum dma_data_direction dir,
			     struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	/* Nothing was mapped for bypassed devices. */
	if (iommu_no_mapping(dev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
		      (unsigned long long)dev_addr))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
		 pci_name(pdev), start_pfn, last_pfn);

	/*  clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
2939
/*
 * dma_map_ops ->alloc_coherent hook: allocate @size bytes of zeroed,
 * DMA-mapped memory.  Returns the kernel virtual address and stores the
 * DMA handle in *@dma_handle, or NULL on failure.
 */
static void *intel_alloc_coherent(struct device *hwdev, size_t size,
				  dma_addr_t *dma_handle, gfp_t flags)
{
	void *vaddr;
	int order;

	size = PAGE_ALIGN(size);
	order = get_order(size);

	/*
	 * With translation active the IOMMU can remap any page, so the
	 * GFP_DMA* zone restrictions are dropped; for bypassed devices,
	 * pick a zone that satisfies the coherent mask.
	 */
	if (!iommu_no_mapping(hwdev))
		flags &= ~(GFP_DMA | GFP_DMA32);
	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
			flags |= GFP_DMA;
		else
			flags |= GFP_DMA32;
	}

	vaddr = (void *)__get_free_pages(flags, order);
	if (!vaddr)
		return NULL;
	memset(vaddr, 0, size);

	/* NOTE(review): virt_to_bus() result is passed where a phys_addr_t
	 * is expected — equivalent on x86, but worth confirming if this is
	 * ever built for another architecture. */
	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
					 DMA_BIDIRECTIONAL,
					 hwdev->coherent_dma_mask);
	if (*dma_handle)
		return vaddr;
	free_pages((unsigned long)vaddr, order);
	return NULL;
}
2971
2972 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2973                                 dma_addr_t dma_handle)
2974 {
2975         int order;
2976
2977         size = PAGE_ALIGN(size);
2978         order = get_order(size);
2979
2980         intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2981         free_pages((unsigned long)vaddr, order);
2982 }
2983
/*
 * dma_map_ops ->unmap_sg hook: tear down the single contiguous IOVA range
 * that intel_map_sg() created for the whole scatterlist (its base is
 * recorded in sglist[0].dma_address), then flush strictly or defer.
 */
static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
			   int nelems, enum dma_data_direction dir,
			   struct dma_attrs *attrs)
{
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	unsigned long start_pfn, last_pfn;
	struct iova *iova;
	struct intel_iommu *iommu;

	/* Nothing was mapped for bypassed devices. */
	if (iommu_no_mapping(hwdev))
		return;

	domain = find_domain(pdev);
	BUG_ON(!domain);

	iommu = domain_get_iommu(domain);

	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
		      (unsigned long long)sglist[0].dma_address))
		return;

	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;

	/*  clear the whole page */
	dma_pte_clear_range(domain, start_pfn, last_pfn);

	/* free page tables */
	dma_pte_free_pagetable(domain, start_pfn, last_pfn);

	if (intel_iommu_strict) {
		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
				      last_pfn - start_pfn + 1, 0);
		/* free iova */
		__free_iova(&domain->iovad, iova);
	} else {
		add_unmap(domain, iova);
		/*
		 * queue up the release of the unmap to save the 1/6th of the
		 * cpu used up by the iotlb flush operation...
		 */
	}
}
3029
3030 static int intel_nontranslate_map_sg(struct device *hddev,
3031         struct scatterlist *sglist, int nelems, int dir)
3032 {
3033         int i;
3034         struct scatterlist *sg;
3035
3036         for_each_sg(sglist, sg, nelems, i) {
3037                 BUG_ON(!sg_page(sg));
3038                 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3039                 sg->dma_length = sg->length;
3040         }
3041         return nelems;
3042 }
3043
/*
 * dma_map_ops ->map_sg hook: allocate one IOVA range covering the whole
 * scatterlist and map every segment into it.  Returns the number of
 * mapped segments, or 0 on failure.
 */
static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
			enum dma_data_direction dir, struct dma_attrs *attrs)
{
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_vpfn;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	/* Bypassed devices get physical addresses straight into the sglist. */
	if (iommu_no_mapping(hwdev))
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	/* Total page count for all segments, each rounded to page bounds. */
	for_each_sg(sglist, sg, nelems, i)
		size += aligned_nrpages(sg->offset, sg->length);

	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
				pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
			!cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);

	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
	if (unlikely(ret)) {
		/*  clear the page */
		dma_pte_clear_range(domain, start_vpfn,
				    start_vpfn + size - 1);
		/* free page tables */
		dma_pte_free_pagetable(domain, start_vpfn,
				       start_vpfn + size - 1);
		/* free iova */
		__free_iova(&domain->iovad, iova);
		return 0;
	}

	/* it's a non-present to present mapping. Only flush if caching mode */
	if (cap_caching_mode(iommu->cap))
		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
	else
		iommu_flush_write_buffer(iommu);

	return nelems;
}
3111
3112 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3113 {
3114         return !dma_addr;
3115 }
3116
/* DMA operations exported to the DMA API for devices behind a VT-d unit. */
struct dma_map_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
	.map_page = intel_map_page,
	.unmap_page = intel_unmap_page,
	.mapping_error = intel_mapping_error,
};
3126
3127 static inline int iommu_domain_cache_init(void)
3128 {
3129         int ret = 0;
3130
3131         iommu_domain_cache = kmem_cache_create("iommu_domain",
3132                                          sizeof(struct dmar_domain),
3133                                          0,
3134                                          SLAB_HWCACHE_ALIGN,
3135
3136                                          NULL);
3137         if (!iommu_domain_cache) {
3138                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3139                 ret = -ENOMEM;
3140         }
3141
3142         return ret;
3143 }
3144
3145 static inline int iommu_devinfo_cache_init(void)
3146 {
3147         int ret = 0;
3148
3149         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3150                                          sizeof(struct device_domain_info),
3151                                          0,
3152                                          SLAB_HWCACHE_ALIGN,
3153                                          NULL);
3154         if (!iommu_devinfo_cache) {
3155                 printk(KERN_ERR "Couldn't create devinfo cache\n");
3156                 ret = -ENOMEM;
3157         }
3158
3159         return ret;
3160 }
3161
3162 static inline int iommu_iova_cache_init(void)
3163 {
3164         int ret = 0;
3165
3166         iommu_iova_cache = kmem_cache_create("iommu_iova",
3167                                          sizeof(struct iova),
3168                                          0,
3169                                          SLAB_HWCACHE_ALIGN,
3170                                          NULL);
3171         if (!iommu_iova_cache) {
3172                 printk(KERN_ERR "Couldn't create iova cache\n");
3173                 ret = -ENOMEM;
3174         }
3175
3176         return ret;
3177 }
3178
3179 static int __init iommu_init_mempool(void)
3180 {
3181         int ret;
3182         ret = iommu_iova_cache_init();
3183         if (ret)
3184                 return ret;
3185
3186         ret = iommu_domain_cache_init();
3187         if (ret)
3188                 goto domain_error;
3189
3190         ret = iommu_devinfo_cache_init();
3191         if (!ret)
3192                 return ret;
3193
3194         kmem_cache_destroy(iommu_domain_cache);
3195 domain_error:
3196         kmem_cache_destroy(iommu_iova_cache);
3197
3198         return -ENOMEM;
3199 }
3200
3201 static void __init iommu_exit_mempool(void)
3202 {
3203         kmem_cache_destroy(iommu_devinfo_cache);
3204         kmem_cache_destroy(iommu_domain_cache);
3205         kmem_cache_destroy(iommu_iova_cache);
3206
3207 }
3208
/*
 * Quirk for the Intel IOAT (QuickData) device on Sandy Bridge: it has its
 * own local IOMMU, but some BIOSes report it under a different DRHD unit.
 * If the DRHD assignment looks wrong, taint the kernel and mark the device
 * to bypass translation entirely.
 */
static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
{
	struct dmar_drhd_unit *drhd;
	u32 vtbar;
	int rc;

	/* We know that this device on this chipset has its own IOMMU.
	 * If we find it under a different IOMMU, then the BIOS is lying
	 * to us. Hope that the IOMMU for this device is actually
	 * disabled, and it needs no translation...
	 */
	/* VTBAR lives at config offset 0xb0 of the host bridge (devfn 0:0). */
	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
	if (rc) {
		/* "can't" happen */
		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
		return;
	}
	vtbar &= 0xffff0000;

	/* we know that the this iommu should be at offset 0xa000 from vtbar */
	drhd = dmar_find_matched_drhd_unit(pdev);
	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
			    TAINT_FIRMWARE_WORKAROUND,
			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
}
DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3236
3237 static void __init init_no_remapping_devices(void)
3238 {
3239         struct dmar_drhd_unit *drhd;
3240
3241         for_each_drhd_unit(drhd) {
3242                 if (!drhd->include_all) {
3243                         int i;
3244                         for (i = 0; i < drhd->devices_cnt; i++)
3245                                 if (drhd->devices[i] != NULL)
3246                                         break;
3247                         /* ignore DMAR unit if no pci devices exist */
3248                         if (i == drhd->devices_cnt)
3249                                 drhd->ignored = 1;
3250                 }
3251         }
3252
3253         for_each_drhd_unit(drhd) {
3254                 int i;
3255                 if (drhd->ignored || drhd->include_all)
3256                         continue;
3257
3258                 for (i = 0; i < drhd->devices_cnt; i++)
3259                         if (drhd->devices[i] &&
3260                             !IS_GFX_DEVICE(drhd->devices[i]))
3261                                 break;
3262
3263                 if (i < drhd->devices_cnt)
3264                         continue;
3265
3266                 /* This IOMMU has *only* gfx devices. Either bypass it or
3267                    set the gfx_mapped flag, as appropriate */
3268                 if (dmar_map_gfx) {
3269                         intel_iommu_gfx_mapped = 1;
3270                 } else {
3271                         drhd->ignored = 1;
3272                         for (i = 0; i < drhd->devices_cnt; i++) {
3273                                 if (!drhd->devices[i])
3274                                         continue;
3275                                 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3276                         }
3277                 }
3278         }
3279 }
3280
3281 #ifdef CONFIG_SUSPEND
3282 static int init_iommu_hw(void)
3283 {
3284         struct dmar_drhd_unit *drhd;
3285         struct intel_iommu *iommu = NULL;
3286
3287         for_each_active_iommu(iommu, drhd)
3288                 if (iommu->qi)
3289                         dmar_reenable_qi(iommu);
3290
3291         for_each_iommu(iommu, drhd) {
3292                 if (drhd->ignored) {
3293                         /*
3294                          * we always have to disable PMRs or DMA may fail on
3295                          * this device
3296                          */
3297                         if (force_on)
3298                                 iommu_disable_protect_mem_regions(iommu);
3299                         continue;
3300                 }
3301         
3302