1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright © 2006-2014 Intel Corporation.
4 *
5 * Authors: David Woodhouse <dwmw2@infradead.org>,
6 * Ashok Raj <ashok.raj@intel.com>,
7 * Shaohua Li <shaohua.li@intel.com>,
8 * Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9 * Fenghua Yu <fenghua.yu@intel.com>
10 * Joerg Roedel <jroedel@suse.de>
11 */
12
13#define pr_fmt(fmt) "DMAR: " fmt
14#define dev_fmt(fmt) pr_fmt(fmt)
15
16#include <linux/crash_dump.h>
17#include <linux/dma-direct.h>
18#include <linux/dmi.h>
19#include <linux/memory.h>
20#include <linux/pci.h>
21#include <linux/pci-ats.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24#include <linux/tboot.h>
25#include <uapi/linux/iommufd.h>
26
27#include "iommu.h"
28#include "../dma-iommu.h"
29#include "../irq_remapping.h"
30#include "../iommu-pages.h"
31#include "pasid.h"
32#include "perfmon.h"
33
34#define ROOT_SIZE VTD_PAGE_SIZE
35#define CONTEXT_SIZE VTD_PAGE_SIZE
36
37#define IS_GFX_DEVICE(pdev) pci_is_display(pdev)
38#define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40#define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41
42#define IOAPIC_RANGE_START (0xfee00000)
43#define IOAPIC_RANGE_END (0xfeefffff)
44#define IOVA_START_ADDR (0x1000)
45
46#define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47
48#define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
49#define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
50
51/* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
52 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
53#define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
54 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
55#define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
56
57static void __init check_tylersburg_isoch(void);
58static int rwbf_quirk;
59
60#define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap))
61
62/*
63 * set to 1 to panic kernel if can't successfully enable VT-d
64 * (used when kernel is launched w/ TXT)
65 */
66static int force_on = 0;
67static int intel_iommu_tboot_noforce;
68static int no_platform_optin;
69
70#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
71
72/*
73 * Take a root_entry and return the Lower Context Table Pointer (LCTP)
74 * if marked present.
75 */
76static phys_addr_t root_entry_lctp(struct root_entry *re)
77{
78 if (!(re->lo & 1))
79 return 0;
80
81 return re->lo & VTD_PAGE_MASK;
82}
83
84/*
85 * Take a root_entry and return the Upper Context Table Pointer (UCTP)
86 * if marked present.
87 */
88static phys_addr_t root_entry_uctp(struct root_entry *re)
89{
90 if (!(re->hi & 1))
91 return 0;
92
93 return re->hi & VTD_PAGE_MASK;
94}
95
96static int device_rid_cmp_key(const void *key, const struct rb_node *node)
97{
98 struct device_domain_info *info =
99 rb_entry(node, struct device_domain_info, node);
100 const u16 *rid_lhs = key;
101
102 if (*rid_lhs < PCI_DEVID(info->bus, info->devfn))
103 return -1;
104
105 if (*rid_lhs > PCI_DEVID(info->bus, info->devfn))
106 return 1;
107
108 return 0;
109}
110
111static int device_rid_cmp(struct rb_node *lhs, const struct rb_node *rhs)
112{
113 struct device_domain_info *info =
114 rb_entry(lhs, struct device_domain_info, node);
115 u16 key = PCI_DEVID(info->bus, info->devfn);
116
117 return device_rid_cmp_key(key: &key, node: rhs);
118}
119
120/*
121 * Looks up an IOMMU-probed device using its source ID.
122 *
123 * Returns the pointer to the device if there is a match. Otherwise,
124 * returns NULL.
125 *
126 * Note that this helper doesn't guarantee that the device won't be
127 * released by the iommu subsystem after being returned. The caller
128 * should use its own synchronization mechanism to avoid the device
129 * being released during its use if its possibly the case.
130 */
131struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid)
132{
133 struct device_domain_info *info = NULL;
134 struct rb_node *node;
135 unsigned long flags;
136
137 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
138 node = rb_find(key: &rid, tree: &iommu->device_rbtree, cmp: device_rid_cmp_key);
139 if (node)
140 info = rb_entry(node, struct device_domain_info, node);
141 spin_unlock_irqrestore(lock: &iommu->device_rbtree_lock, flags);
142
143 return info ? info->dev : NULL;
144}
145
146static int device_rbtree_insert(struct intel_iommu *iommu,
147 struct device_domain_info *info)
148{
149 struct rb_node *curr;
150 unsigned long flags;
151
152 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
153 curr = rb_find_add(node: &info->node, tree: &iommu->device_rbtree, cmp: device_rid_cmp);
154 spin_unlock_irqrestore(lock: &iommu->device_rbtree_lock, flags);
155 if (WARN_ON(curr))
156 return -EEXIST;
157
158 return 0;
159}
160
161static void device_rbtree_remove(struct device_domain_info *info)
162{
163 struct intel_iommu *iommu = info->iommu;
164 unsigned long flags;
165
166 spin_lock_irqsave(&iommu->device_rbtree_lock, flags);
167 rb_erase(&info->node, &iommu->device_rbtree);
168 spin_unlock_irqrestore(lock: &iommu->device_rbtree_lock, flags);
169}
170
171struct dmar_rmrr_unit {
172 struct list_head list; /* list of rmrr units */
173 struct acpi_dmar_header *hdr; /* ACPI header */
174 u64 base_address; /* reserved base address*/
175 u64 end_address; /* reserved end address */
176 struct dmar_dev_scope *devices; /* target devices */
177 int devices_cnt; /* target device count */
178};
179
180struct dmar_atsr_unit {
181 struct list_head list; /* list of ATSR units */
182 struct acpi_dmar_header *hdr; /* ACPI header */
183 struct dmar_dev_scope *devices; /* target devices */
184 int devices_cnt; /* target device count */
185 u8 include_all:1; /* include all ports */
186};
187
188struct dmar_satc_unit {
189 struct list_head list; /* list of SATC units */
190 struct acpi_dmar_header *hdr; /* ACPI header */
191 struct dmar_dev_scope *devices; /* target devices */
192 struct intel_iommu *iommu; /* the corresponding iommu */
193 int devices_cnt; /* target device count */
194 u8 atc_required:1; /* ATS is required */
195};
196
197static LIST_HEAD(dmar_atsr_units);
198static LIST_HEAD(dmar_rmrr_units);
199static LIST_HEAD(dmar_satc_units);
200
201#define for_each_rmrr_units(rmrr) \
202 list_for_each_entry(rmrr, &dmar_rmrr_units, list)
203
204static void intel_iommu_domain_free(struct iommu_domain *domain);
205
206int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
207int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
208
209int intel_iommu_enabled = 0;
210EXPORT_SYMBOL_GPL(intel_iommu_enabled);
211
212static int intel_iommu_superpage = 1;
213static int iommu_identity_mapping;
214static int iommu_skip_te_disable;
215static int disable_igfx_iommu;
216
217#define IDENTMAP_AZALIA 4
218
219const struct iommu_ops intel_iommu_ops;
220static const struct iommu_dirty_ops intel_dirty_ops;
221
222static bool translation_pre_enabled(struct intel_iommu *iommu)
223{
224 return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
225}
226
227static void clear_translation_pre_enabled(struct intel_iommu *iommu)
228{
229 iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
230}
231
232static void init_translation_status(struct intel_iommu *iommu)
233{
234 u32 gsts;
235
236 gsts = readl(addr: iommu->reg + DMAR_GSTS_REG);
237 if (gsts & DMA_GSTS_TES)
238 iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
239}
240
241static int __init intel_iommu_setup(char *str)
242{
243 if (!str)
244 return -EINVAL;
245
246 while (*str) {
247 if (!strncmp(str, "on", 2)) {
248 dmar_disabled = 0;
249 pr_info("IOMMU enabled\n");
250 } else if (!strncmp(str, "off", 3)) {
251 dmar_disabled = 1;
252 no_platform_optin = 1;
253 pr_info("IOMMU disabled\n");
254 } else if (!strncmp(str, "igfx_off", 8)) {
255 disable_igfx_iommu = 1;
256 pr_info("Disable GFX device mapping\n");
257 } else if (!strncmp(str, "forcedac", 8)) {
258 pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
259 iommu_dma_forcedac = true;
260 } else if (!strncmp(str, "strict", 6)) {
261 pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
262 iommu_set_dma_strict();
263 } else if (!strncmp(str, "sp_off", 6)) {
264 pr_info("Disable supported super page\n");
265 intel_iommu_superpage = 0;
266 } else if (!strncmp(str, "sm_on", 5)) {
267 pr_info("Enable scalable mode if hardware supports\n");
268 intel_iommu_sm = 1;
269 } else if (!strncmp(str, "sm_off", 6)) {
270 pr_info("Scalable mode is disallowed\n");
271 intel_iommu_sm = 0;
272 } else if (!strncmp(str, "tboot_noforce", 13)) {
273 pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
274 intel_iommu_tboot_noforce = 1;
275 } else {
276 pr_notice("Unknown option - '%s'\n", str);
277 }
278
279 str += strcspn(str, ",");
280 while (*str == ',')
281 str++;
282 }
283
284 return 1;
285}
286__setup("intel_iommu=", intel_iommu_setup);
287
288static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn)
289{
290 int addr_width = agaw_to_width(agaw: domain->agaw) - VTD_PAGE_SHIFT;
291
292 return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
293}
294
295/*
296 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
297 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
298 * the returned SAGAW.
299 */
300static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
301{
302 unsigned long fl_sagaw, sl_sagaw;
303
304 fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
305 sl_sagaw = cap_sagaw(iommu->cap);
306
307 /* Second level only. */
308 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
309 return sl_sagaw;
310
311 /* First level only. */
312 if (!ecap_slts(iommu->ecap))
313 return fl_sagaw;
314
315 return fl_sagaw & sl_sagaw;
316}
317
318static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
319{
320 unsigned long sagaw;
321 int agaw;
322
323 sagaw = __iommu_calculate_sagaw(iommu);
324 for (agaw = width_to_agaw(width: max_gaw); agaw >= 0; agaw--) {
325 if (test_bit(agaw, &sagaw))
326 break;
327 }
328
329 return agaw;
330}
331
332/*
333 * Calculate max SAGAW for each iommu.
334 */
335int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
336{
337 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
338}
339
340/*
341 * calculate agaw for each iommu.
342 * "SAGAW" may be different across iommus, use a default agaw, and
343 * get a supported less agaw for iommus that don't support the default agaw.
344 */
345int iommu_calculate_agaw(struct intel_iommu *iommu)
346{
347 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
348}
349
350static bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
351{
352 return sm_supported(iommu) ?
353 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
354}
355
356/* Return the super pagesize bitmap if supported. */
357static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
358{
359 unsigned long bitmap = 0;
360
361 /*
362 * 1-level super page supports page size of 2MiB, 2-level super page
363 * supports page size of both 2MiB and 1GiB.
364 */
365 if (domain->iommu_superpage == 1)
366 bitmap |= SZ_2M;
367 else if (domain->iommu_superpage == 2)
368 bitmap |= SZ_2M | SZ_1G;
369
370 return bitmap;
371}
372
373struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
374 u8 devfn, int alloc)
375{
376 struct root_entry *root = &iommu->root_entry[bus];
377 struct context_entry *context;
378 u64 *entry;
379
380 /*
381 * Except that the caller requested to allocate a new entry,
382 * returning a copied context entry makes no sense.
383 */
384 if (!alloc && context_copied(iommu, bus, devfn))
385 return NULL;
386
387 entry = &root->lo;
388 if (sm_supported(iommu)) {
389 if (devfn >= 0x80) {
390 devfn -= 0x80;
391 entry = &root->hi;
392 }
393 devfn *= 2;
394 }
395 if (*entry & 1)
396 context = phys_to_virt(address: *entry & VTD_PAGE_MASK);
397 else {
398 unsigned long phy_addr;
399 if (!alloc)
400 return NULL;
401
402 context = iommu_alloc_pages_node_sz(nid: iommu->node, GFP_ATOMIC,
403 SZ_4K);
404 if (!context)
405 return NULL;
406
407 __iommu_flush_cache(iommu, addr: (void *)context, CONTEXT_SIZE);
408 phy_addr = virt_to_phys(address: (void *)context);
409 *entry = phy_addr | 1;
410 __iommu_flush_cache(iommu, addr: entry, size: sizeof(*entry));
411 }
412 return &context[devfn];
413}
414
415/**
416 * is_downstream_to_pci_bridge - test if a device belongs to the PCI
417 * sub-hierarchy of a candidate PCI-PCI bridge
418 * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
419 * @bridge: the candidate PCI-PCI bridge
420 *
421 * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
422 */
423static bool
424is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
425{
426 struct pci_dev *pdev, *pbridge;
427
428 if (!dev_is_pci(dev) || !dev_is_pci(bridge))
429 return false;
430
431 pdev = to_pci_dev(dev);
432 pbridge = to_pci_dev(bridge);
433
434 if (pbridge->subordinate &&
435 pbridge->subordinate->number <= pdev->bus->number &&
436 pbridge->subordinate->busn_res.end >= pdev->bus->number)
437 return true;
438
439 return false;
440}
441
442static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
443{
444 struct dmar_drhd_unit *drhd;
445 u32 vtbar;
446 int rc;
447
448 /* We know that this device on this chipset has its own IOMMU.
449 * If we find it under a different IOMMU, then the BIOS is lying
450 * to us. Hope that the IOMMU for this device is actually
451 * disabled, and it needs no translation...
452 */
453 rc = pci_bus_read_config_dword(bus: pdev->bus, PCI_DEVFN(0, 0), where: 0xb0, val: &vtbar);
454 if (rc) {
455 /* "can't" happen */
456 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
457 return false;
458 }
459 vtbar &= 0xffff0000;
460
461 /* we know that the this iommu should be at offset 0xa000 from vtbar */
462 drhd = dmar_find_matched_drhd_unit(dev: pdev);
463 if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
464 pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
465 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
466 return true;
467 }
468
469 return false;
470}
471
472static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
473{
474 if (!iommu || iommu->drhd->ignored)
475 return true;
476
477 if (dev_is_pci(dev)) {
478 struct pci_dev *pdev = to_pci_dev(dev);
479
480 if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
481 pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
482 quirk_ioat_snb_local_iommu(pdev))
483 return true;
484 }
485
486 return false;
487}
488
489static struct intel_iommu *device_lookup_iommu(struct device *dev, u8 *bus, u8 *devfn)
490{
491 struct dmar_drhd_unit *drhd = NULL;
492 struct pci_dev *pdev = NULL;
493 struct intel_iommu *iommu;
494 struct device *tmp;
495 u16 segment = 0;
496 int i;
497
498 if (!dev)
499 return NULL;
500
501 if (dev_is_pci(dev)) {
502 struct pci_dev *pf_pdev;
503
504 pdev = pci_real_dma_dev(to_pci_dev(dev));
505
506 /* VFs aren't listed in scope tables; we need to look up
507 * the PF instead to find the IOMMU. */
508 pf_pdev = pci_physfn(dev: pdev);
509 dev = &pf_pdev->dev;
510 segment = pci_domain_nr(bus: pdev->bus);
511 } else if (has_acpi_companion(dev))
512 dev = &ACPI_COMPANION(dev)->dev;
513
514 rcu_read_lock();
515 for_each_iommu(iommu, drhd) {
516 if (pdev && segment != drhd->segment)
517 continue;
518
519 for_each_active_dev_scope(drhd->devices,
520 drhd->devices_cnt, i, tmp) {
521 if (tmp == dev) {
522 /* For a VF use its original BDF# not that of the PF
523 * which we used for the IOMMU lookup. Strictly speaking
524 * we could do this for all PCI devices; we only need to
525 * get the BDF# from the scope table for ACPI matches. */
526 if (pdev && pdev->is_virtfn)
527 goto got_pdev;
528
529 if (bus && devfn) {
530 *bus = drhd->devices[i].bus;
531 *devfn = drhd->devices[i].devfn;
532 }
533 goto out;
534 }
535
536 if (is_downstream_to_pci_bridge(dev, bridge: tmp))
537 goto got_pdev;
538 }
539
540 if (pdev && drhd->include_all) {
541got_pdev:
542 if (bus && devfn) {
543 *bus = pdev->bus->number;
544 *devfn = pdev->devfn;
545 }
546 goto out;
547 }
548 }
549 iommu = NULL;
550out:
551 if (iommu_is_dummy(iommu, dev))
552 iommu = NULL;
553
554 rcu_read_unlock();
555
556 return iommu;
557}
558
559static void domain_flush_cache(struct dmar_domain *domain,
560 void *addr, int size)
561{
562 if (!domain->iommu_coherency)
563 clflush_cache_range(addr, size);
564}
565
566static void free_context_table(struct intel_iommu *iommu)
567{
568 struct context_entry *context;
569 int i;
570
571 if (!iommu->root_entry)
572 return;
573
574 for (i = 0; i < ROOT_ENTRY_NR; i++) {
575 context = iommu_context_addr(iommu, bus: i, devfn: 0, alloc: 0);
576 if (context)
577 iommu_free_pages(virt: context);
578
579 if (!sm_supported(iommu))
580 continue;
581
582 context = iommu_context_addr(iommu, bus: i, devfn: 0x80, alloc: 0);
583 if (context)
584 iommu_free_pages(virt: context);
585 }
586
587 iommu_free_pages(virt: iommu->root_entry);
588 iommu->root_entry = NULL;
589}
590
591#ifdef CONFIG_DMAR_DEBUG
592static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
593 u8 bus, u8 devfn, struct dma_pte *parent, int level)
594{
595 struct dma_pte *pte;
596 int offset;
597
598 while (1) {
599 offset = pfn_level_offset(pfn, level);
600 pte = &parent[offset];
601
602 pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
603
604 if (!dma_pte_present(pte)) {
605 pr_info("page table not present at level %d\n", level - 1);
606 break;
607 }
608
609 if (level == 1 || dma_pte_superpage(pte))
610 break;
611
612 parent = phys_to_virt(dma_pte_addr(pte));
613 level--;
614 }
615}
616
617void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
618 unsigned long long addr, u32 pasid)
619{
620 struct pasid_dir_entry *dir, *pde;
621 struct pasid_entry *entries, *pte;
622 struct context_entry *ctx_entry;
623 struct root_entry *rt_entry;
624 int i, dir_index, index, level;
625 u8 devfn = source_id & 0xff;
626 u8 bus = source_id >> 8;
627 struct dma_pte *pgtable;
628
629 pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
630
631 /* root entry dump */
632 if (!iommu->root_entry) {
633 pr_info("root table is not present\n");
634 return;
635 }
636 rt_entry = &iommu->root_entry[bus];
637
638 if (sm_supported(iommu))
639 pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
640 rt_entry->hi, rt_entry->lo);
641 else
642 pr_info("root entry: 0x%016llx", rt_entry->lo);
643
644 /* context entry dump */
645 ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
646 if (!ctx_entry) {
647 pr_info("context table is not present\n");
648 return;
649 }
650
651 pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
652 ctx_entry->hi, ctx_entry->lo);
653
654 /* legacy mode does not require PASID entries */
655 if (!sm_supported(iommu)) {
656 if (!context_present(ctx_entry)) {
657 pr_info("legacy mode page table is not present\n");
658 return;
659 }
660 level = agaw_to_level(ctx_entry->hi & 7);
661 pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
662 goto pgtable_walk;
663 }
664
665 if (!context_present(ctx_entry)) {
666 pr_info("pasid directory table is not present\n");
667 return;
668 }
669
670 /* get the pointer to pasid directory entry */
671 dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
672
673 /* For request-without-pasid, get the pasid from context entry */
674 if (intel_iommu_sm && pasid == IOMMU_PASID_INVALID)
675 pasid = IOMMU_NO_PASID;
676
677 dir_index = pasid >> PASID_PDE_SHIFT;
678 pde = &dir[dir_index];
679 pr_info("pasid dir entry: 0x%016llx\n", pde->val);
680
681 /* get the pointer to the pasid table entry */
682 entries = get_pasid_table_from_pde(pde);
683 if (!entries) {
684 pr_info("pasid table is not present\n");
685 return;
686 }
687 index = pasid & PASID_PTE_MASK;
688 pte = &entries[index];
689 for (i = 0; i < ARRAY_SIZE(pte->val); i++)
690 pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
691
692 if (!pasid_pte_is_present(pte)) {
693 pr_info("scalable mode page table is not present\n");
694 return;
695 }
696
697 if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
698 level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
699 pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
700 } else {
701 level = agaw_to_level((pte->val[0] >> 2) & 0x7);
702 pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
703 }
704
705pgtable_walk:
706 pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
707}
708#endif
709
710static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
711 unsigned long pfn, int *target_level,
712 gfp_t gfp)
713{
714 struct dma_pte *parent, *pte;
715 int level = agaw_to_level(agaw: domain->agaw);
716 int offset;
717
718 if (!domain_pfn_supported(domain, pfn))
719 /* Address beyond IOMMU's addressing capabilities. */
720 return NULL;
721
722 parent = domain->pgd;
723
724 while (1) {
725 void *tmp_page;
726
727 offset = pfn_level_offset(pfn, level);
728 pte = &parent[offset];
729 if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
730 break;
731 if (level == *target_level)
732 break;
733
734 if (!dma_pte_present(pte)) {
735 uint64_t pteval, tmp;
736
737 tmp_page = iommu_alloc_pages_node_sz(nid: domain->nid, gfp,
738 SZ_4K);
739
740 if (!tmp_page)
741 return NULL;
742
743 domain_flush_cache(domain, addr: tmp_page, VTD_PAGE_SIZE);
744 pteval = virt_to_phys(address: tmp_page) | DMA_PTE_READ |
745 DMA_PTE_WRITE;
746 if (domain->use_first_level)
747 pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
748
749 tmp = 0ULL;
750 if (!try_cmpxchg64(&pte->val, &tmp, pteval))
751 /* Someone else set it while we were thinking; use theirs. */
752 iommu_free_pages(virt: tmp_page);
753 else
754 domain_flush_cache(domain, addr: pte, size: sizeof(*pte));
755 }
756 if (level == 1)
757 break;
758
759 parent = phys_to_virt(address: dma_pte_addr(pte));
760 level--;
761 }
762
763 if (!*target_level)
764 *target_level = level;
765
766 return pte;
767}
768
769/* return address's pte at specific level */
770static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
771 unsigned long pfn,
772 int level, int *large_page)
773{
774 struct dma_pte *parent, *pte;
775 int total = agaw_to_level(agaw: domain->agaw);
776 int offset;
777
778 parent = domain->pgd;
779 while (level <= total) {
780 offset = pfn_level_offset(pfn, level: total);
781 pte = &parent[offset];
782 if (level == total)
783 return pte;
784
785 if (!dma_pte_present(pte)) {
786 *large_page = total;
787 break;
788 }
789
790 if (dma_pte_superpage(pte)) {
791 *large_page = total;
792 return pte;
793 }
794
795 parent = phys_to_virt(address: dma_pte_addr(pte));
796 total--;
797 }
798 return NULL;
799}
800
801/* clear last level pte, a tlb flush should be followed */
802static void dma_pte_clear_range(struct dmar_domain *domain,
803 unsigned long start_pfn,
804 unsigned long last_pfn)
805{
806 unsigned int large_page;
807 struct dma_pte *first_pte, *pte;
808
809 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
810 WARN_ON(start_pfn > last_pfn))
811 return;
812
813 /* we don't need lock here; nobody else touches the iova range */
814 do {
815 large_page = 1;
816 first_pte = pte = dma_pfn_level_pte(domain, pfn: start_pfn, level: 1, large_page: &large_page);
817 if (!pte) {
818 start_pfn = align_to_level(pfn: start_pfn + 1, level: large_page + 1);
819 continue;
820 }
821 do {
822 dma_clear_pte(pte);
823 start_pfn += lvl_to_nr_pages(lvl: large_page);
824 pte++;
825 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
826
827 domain_flush_cache(domain, addr: first_pte,
828 size: (void *)pte - (void *)first_pte);
829
830 } while (start_pfn && start_pfn <= last_pfn);
831}
832
833static void dma_pte_free_level(struct dmar_domain *domain, int level,
834 int retain_level, struct dma_pte *pte,
835 unsigned long pfn, unsigned long start_pfn,
836 unsigned long last_pfn)
837{
838 pfn = max(start_pfn, pfn);
839 pte = &pte[pfn_level_offset(pfn, level)];
840
841 do {
842 unsigned long level_pfn;
843 struct dma_pte *level_pte;
844
845 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
846 goto next;
847
848 level_pfn = pfn & level_mask(level);
849 level_pte = phys_to_virt(address: dma_pte_addr(pte));
850
851 if (level > 2) {
852 dma_pte_free_level(domain, level: level - 1, retain_level,
853 pte: level_pte, pfn: level_pfn, start_pfn,
854 last_pfn);
855 }
856
857 /*
858 * Free the page table if we're below the level we want to
859 * retain and the range covers the entire table.
860 */
861 if (level < retain_level && !(start_pfn > level_pfn ||
862 last_pfn < level_pfn + level_size(level) - 1)) {
863 dma_clear_pte(pte);
864 domain_flush_cache(domain, addr: pte, size: sizeof(*pte));
865 iommu_free_pages(virt: level_pte);
866 }
867next:
868 pfn += level_size(level);
869 } while (!first_pte_in_page(pte: ++pte) && pfn <= last_pfn);
870}
871
872/*
873 * clear last level (leaf) ptes and free page table pages below the
874 * level we wish to keep intact.
875 */
876static void dma_pte_free_pagetable(struct dmar_domain *domain,
877 unsigned long start_pfn,
878 unsigned long last_pfn,
879 int retain_level)
880{
881 dma_pte_clear_range(domain, start_pfn, last_pfn);
882
883 /* We don't need lock here; nobody else touches the iova range */
884 dma_pte_free_level(domain, level: agaw_to_level(agaw: domain->agaw), retain_level,
885 pte: domain->pgd, pfn: 0, start_pfn, last_pfn);
886
887 /* free pgd */
888 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
889 iommu_free_pages(virt: domain->pgd);
890 domain->pgd = NULL;
891 }
892}
893
894/* When a page at a given level is being unlinked from its parent, we don't
895 need to *modify* it at all. All we need to do is make a list of all the
896 pages which can be freed just as soon as we've flushed the IOTLB and we
897 know the hardware page-walk will no longer touch them.
898 The 'pte' argument is the *parent* PTE, pointing to the page that is to
899 be freed. */
900static void dma_pte_list_pagetables(struct dmar_domain *domain,
901 int level, struct dma_pte *parent_pte,
902 struct iommu_pages_list *freelist)
903{
904 struct dma_pte *pte = phys_to_virt(address: dma_pte_addr(pte: parent_pte));
905
906 iommu_pages_list_add(list: freelist, virt: pte);
907
908 if (level == 1)
909 return;
910
911 do {
912 if (dma_pte_present(pte) && !dma_pte_superpage(pte))
913 dma_pte_list_pagetables(domain, level: level - 1, parent_pte: pte, freelist);
914 pte++;
915 } while (!first_pte_in_page(pte));
916}
917
918static void dma_pte_clear_level(struct dmar_domain *domain, int level,
919 struct dma_pte *pte, unsigned long pfn,
920 unsigned long start_pfn, unsigned long last_pfn,
921 struct iommu_pages_list *freelist)
922{
923 struct dma_pte *first_pte = NULL, *last_pte = NULL;
924
925 pfn = max(start_pfn, pfn);
926 pte = &pte[pfn_level_offset(pfn, level)];
927
928 do {
929 unsigned long level_pfn = pfn & level_mask(level);
930
931 if (!dma_pte_present(pte))
932 goto next;
933
934 /* If range covers entire pagetable, free it */
935 if (start_pfn <= level_pfn &&
936 last_pfn >= level_pfn + level_size(level) - 1) {
937 /* These suborbinate page tables are going away entirely. Don't
938 bother to clear them; we're just going to *free* them. */
939 if (level > 1 && !dma_pte_superpage(pte))
940 dma_pte_list_pagetables(domain, level: level - 1, parent_pte: pte, freelist);
941
942 dma_clear_pte(pte);
943 if (!first_pte)
944 first_pte = pte;
945 last_pte = pte;
946 } else if (level > 1) {
947 /* Recurse down into a level that isn't *entirely* obsolete */
948 dma_pte_clear_level(domain, level: level - 1,
949 phys_to_virt(address: dma_pte_addr(pte)),
950 pfn: level_pfn, start_pfn, last_pfn,
951 freelist);
952 }
953next:
954 pfn = level_pfn + level_size(level);
955 } while (!first_pte_in_page(pte: ++pte) && pfn <= last_pfn);
956
957 if (first_pte)
958 domain_flush_cache(domain, addr: first_pte,
959 size: (void *)++last_pte - (void *)first_pte);
960}
961
962/* We can't just free the pages because the IOMMU may still be walking
963 the page tables, and may have cached the intermediate levels. The
964 pages can only be freed after the IOTLB flush has been done. */
965static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
966 unsigned long last_pfn,
967 struct iommu_pages_list *freelist)
968{
969 if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
970 WARN_ON(start_pfn > last_pfn))
971 return;
972
973 /* we don't need lock here; nobody else touches the iova range */
974 dma_pte_clear_level(domain, level: agaw_to_level(agaw: domain->agaw),
975 pte: domain->pgd, pfn: 0, start_pfn, last_pfn, freelist);
976
977 /* free pgd */
978 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
979 iommu_pages_list_add(list: freelist, virt: domain->pgd);
980 domain->pgd = NULL;
981 }
982}
983
984/* iommu handling */
985static int iommu_alloc_root_entry(struct intel_iommu *iommu)
986{
987 struct root_entry *root;
988
989 root = iommu_alloc_pages_node_sz(nid: iommu->node, GFP_ATOMIC, SZ_4K);
990 if (!root) {
991 pr_err("Allocating root entry for %s failed\n",
992 iommu->name);
993 return -ENOMEM;
994 }
995
996 __iommu_flush_cache(iommu, addr: root, ROOT_SIZE);
997 iommu->root_entry = root;
998
999 return 0;
1000}
1001
1002static void iommu_set_root_entry(struct intel_iommu *iommu)
1003{
1004 u64 addr;
1005 u32 sts;
1006 unsigned long flag;
1007
1008 addr = virt_to_phys(address: iommu->root_entry);
1009 if (sm_supported(iommu))
1010 addr |= DMA_RTADDR_SMT;
1011
1012 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1013 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1014
1015 writel(val: iommu->gcmd | DMA_GCMD_SRTP, addr: iommu->reg + DMAR_GCMD_REG);
1016
1017 /* Make sure hardware complete it */
1018 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1019 readl, (sts & DMA_GSTS_RTPS), sts);
1020
1021 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1022
1023 /*
1024 * Hardware invalidates all DMA remapping hardware translation
1025 * caches as part of SRTP flow.
1026 */
1027 if (cap_esrtps(iommu->cap))
1028 return;
1029
1030 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1031 if (sm_supported(iommu))
1032 qi_flush_pasid_cache(iommu, did: 0, QI_PC_GLOBAL, pasid: 0);
1033 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1034}
1035
1036void iommu_flush_write_buffer(struct intel_iommu *iommu)
1037{
1038 u32 val;
1039 unsigned long flag;
1040
1041 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1042 return;
1043
1044 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1045 writel(val: iommu->gcmd | DMA_GCMD_WBF, addr: iommu->reg + DMAR_GCMD_REG);
1046
1047 /* Make sure hardware complete it */
1048 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1049 readl, (!(val & DMA_GSTS_WBFS)), val);
1050
1051 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1052}
1053
1054/* return value determine if we need a write buffer flush */
1055static void __iommu_flush_context(struct intel_iommu *iommu,
1056 u16 did, u16 source_id, u8 function_mask,
1057 u64 type)
1058{
1059 u64 val = 0;
1060 unsigned long flag;
1061
1062 switch (type) {
1063 case DMA_CCMD_GLOBAL_INVL:
1064 val = DMA_CCMD_GLOBAL_INVL;
1065 break;
1066 case DMA_CCMD_DOMAIN_INVL:
1067 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1068 break;
1069 case DMA_CCMD_DEVICE_INVL:
1070 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1071 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1072 break;
1073 default:
1074 pr_warn("%s: Unexpected context-cache invalidation type 0x%llx\n",
1075 iommu->name, type);
1076 return;
1077 }
1078 val |= DMA_CCMD_ICC;
1079
1080 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1081 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1082
1083 /* Make sure hardware complete it */
1084 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1085 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1086
1087 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1088}
1089
1090void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr,
1091 unsigned int size_order, u64 type)
1092{
1093 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1094 u64 val = 0, val_iva = 0;
1095 unsigned long flag;
1096
1097 switch (type) {
1098 case DMA_TLB_GLOBAL_FLUSH:
1099 /* global flush doesn't need set IVA_REG */
1100 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1101 break;
1102 case DMA_TLB_DSI_FLUSH:
1103 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1104 break;
1105 case DMA_TLB_PSI_FLUSH:
1106 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1107 /* IH bit is passed in as part of address */
1108 val_iva = size_order | addr;
1109 break;
1110 default:
1111 pr_warn("%s: Unexpected iotlb invalidation type 0x%llx\n",
1112 iommu->name, type);
1113 return;
1114 }
1115
1116 if (cap_write_drain(iommu->cap))
1117 val |= DMA_TLB_WRITE_DRAIN;
1118
1119 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1120 /* Note: Only uses first TLB reg currently */
1121 if (val_iva)
1122 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1123 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1124
1125 /* Make sure hardware complete it */
1126 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1127 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1128
1129 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1130
1131 /* check IOTLB invalidation granularity */
1132 if (DMA_TLB_IAIG(val) == 0)
1133 pr_err("Flush IOTLB failed\n");
1134 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1135 pr_debug("TLB flush request %Lx, actual %Lx\n",
1136 (unsigned long long)DMA_TLB_IIRG(type),
1137 (unsigned long long)DMA_TLB_IAIG(val));
1138}
1139
1140static struct device_domain_info *
1141domain_lookup_dev_info(struct dmar_domain *domain,
1142 struct intel_iommu *iommu, u8 bus, u8 devfn)
1143{
1144 struct device_domain_info *info;
1145 unsigned long flags;
1146
1147 spin_lock_irqsave(&domain->lock, flags);
1148 list_for_each_entry(info, &domain->devices, link) {
1149 if (info->iommu == iommu && info->bus == bus &&
1150 info->devfn == devfn) {
1151 spin_unlock_irqrestore(lock: &domain->lock, flags);
1152 return info;
1153 }
1154 }
1155 spin_unlock_irqrestore(lock: &domain->lock, flags);
1156
1157 return NULL;
1158}
1159
1160/*
1161 * The extra devTLB flush quirk impacts those QAT devices with PCI device
1162 * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1163 * check because it applies only to the built-in QAT devices and it doesn't
1164 * grant additional privileges.
1165 */
1166#define BUGGY_QAT_DEVID_MASK 0x4940
1167static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1168{
1169 if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1170 return false;
1171
1172 if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1173 return false;
1174
1175 return true;
1176}
1177
1178static void iommu_enable_pci_ats(struct device_domain_info *info)
1179{
1180 struct pci_dev *pdev;
1181
1182 if (!info->ats_supported)
1183 return;
1184
1185 pdev = to_pci_dev(info->dev);
1186 if (!pci_ats_page_aligned(dev: pdev))
1187 return;
1188
1189 if (!pci_enable_ats(dev: pdev, VTD_PAGE_SHIFT))
1190 info->ats_enabled = 1;
1191}
1192
1193static void iommu_disable_pci_ats(struct device_domain_info *info)
1194{
1195 if (!info->ats_enabled)
1196 return;
1197
1198 pci_disable_ats(to_pci_dev(info->dev));
1199 info->ats_enabled = 0;
1200}
1201
1202static void iommu_enable_pci_pri(struct device_domain_info *info)
1203{
1204 struct pci_dev *pdev;
1205
1206 if (!info->ats_enabled || !info->pri_supported)
1207 return;
1208
1209 pdev = to_pci_dev(info->dev);
1210 /* PASID is required in PRG Response Message. */
1211 if (info->pasid_enabled && !pci_prg_resp_pasid_required(pdev))
1212 return;
1213
1214 if (pci_reset_pri(pdev))
1215 return;
1216
1217 if (!pci_enable_pri(pdev, PRQ_DEPTH))
1218 info->pri_enabled = 1;
1219}
1220
1221static void iommu_disable_pci_pri(struct device_domain_info *info)
1222{
1223 if (!info->pri_enabled)
1224 return;
1225
1226 if (WARN_ON(info->iopf_refcount))
1227 iopf_queue_remove_device(queue: info->iommu->iopf_queue, dev: info->dev);
1228
1229 pci_disable_pri(to_pci_dev(info->dev));
1230 info->pri_enabled = 0;
1231}
1232
1233static void intel_flush_iotlb_all(struct iommu_domain *domain)
1234{
1235 cache_tag_flush_all(domain: to_dmar_domain(dom: domain));
1236}
1237
1238static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1239{
1240 u32 pmen;
1241 unsigned long flags;
1242
1243 if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1244 return;
1245
1246 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1247 pmen = readl(addr: iommu->reg + DMAR_PMEN_REG);
1248 pmen &= ~DMA_PMEN_EPM;
1249 writel(val: pmen, addr: iommu->reg + DMAR_PMEN_REG);
1250
1251 /* wait for the protected region status bit to clear */
1252 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1253 readl, !(pmen & DMA_PMEN_PRS), pmen);
1254
1255 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1256}
1257
1258static void iommu_enable_translation(struct intel_iommu *iommu)
1259{
1260 u32 sts;
1261 unsigned long flags;
1262
1263 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1264 iommu->gcmd |= DMA_GCMD_TE;
1265 writel(val: iommu->gcmd, addr: iommu->reg + DMAR_GCMD_REG);
1266
1267 /* Make sure hardware complete it */
1268 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1269 readl, (sts & DMA_GSTS_TES), sts);
1270
1271 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1272}
1273
1274static void iommu_disable_translation(struct intel_iommu *iommu)
1275{
1276 u32 sts;
1277 unsigned long flag;
1278
1279 if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1280 (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1281 return;
1282
1283 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1284 iommu->gcmd &= ~DMA_GCMD_TE;
1285 writel(val: iommu->gcmd, addr: iommu->reg + DMAR_GCMD_REG);
1286
1287 /* Make sure hardware complete it */
1288 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1289 readl, (!(sts & DMA_GSTS_TES)), sts);
1290
1291 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1292}
1293
1294static void disable_dmar_iommu(struct intel_iommu *iommu)
1295{
1296 /*
1297 * All iommu domains must have been detached from the devices,
1298 * hence there should be no domain IDs in use.
1299 */
1300 if (WARN_ON(!ida_is_empty(&iommu->domain_ida)))
1301 return;
1302
1303 if (iommu->gcmd & DMA_GCMD_TE)
1304 iommu_disable_translation(iommu);
1305}
1306
1307static void free_dmar_iommu(struct intel_iommu *iommu)
1308{
1309 if (iommu->copied_tables) {
1310 bitmap_free(bitmap: iommu->copied_tables);
1311 iommu->copied_tables = NULL;
1312 }
1313
1314 /* free context mapping */
1315 free_context_table(iommu);
1316
1317 if (ecap_prs(iommu->ecap))
1318 intel_iommu_finish_prq(iommu);
1319}
1320
1321/*
1322 * Check and return whether first level is used by default for
1323 * DMA translation.
1324 */
1325static bool first_level_by_default(struct intel_iommu *iommu)
1326{
1327 /* Only SL is available in legacy mode */
1328 if (!sm_supported(iommu))
1329 return false;
1330
1331 /* Only level (either FL or SL) is available, just use it */
1332 if (ecap_flts(iommu->ecap) ^ ecap_slts(iommu->ecap))
1333 return ecap_flts(iommu->ecap);
1334
1335 return true;
1336}
1337
1338int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1339{
1340 struct iommu_domain_info *info, *curr;
1341 int num, ret = -ENOSPC;
1342
1343 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1344 return 0;
1345
1346 info = kzalloc(sizeof(*info), GFP_KERNEL);
1347 if (!info)
1348 return -ENOMEM;
1349
1350 guard(mutex)(T: &iommu->did_lock);
1351 curr = xa_load(&domain->iommu_array, index: iommu->seq_id);
1352 if (curr) {
1353 curr->refcnt++;
1354 kfree(objp: info);
1355 return 0;
1356 }
1357
1358 num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID,
1359 cap_ndoms(iommu->cap) - 1, GFP_KERNEL);
1360 if (num < 0) {
1361 pr_err("%s: No free domain ids\n", iommu->name);
1362 goto err_unlock;
1363 }
1364
1365 info->refcnt = 1;
1366 info->did = num;
1367 info->iommu = iommu;
1368 curr = xa_cmpxchg(xa: &domain->iommu_array, index: iommu->seq_id,
1369 NULL, entry: info, GFP_KERNEL);
1370 if (curr) {
1371 ret = xa_err(entry: curr) ? : -EBUSY;
1372 goto err_clear;
1373 }
1374
1375 return 0;
1376
1377err_clear:
1378 ida_free(&iommu->domain_ida, id: info->did);
1379err_unlock:
1380 kfree(objp: info);
1381 return ret;
1382}
1383
1384void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu)
1385{
1386 struct iommu_domain_info *info;
1387
1388 if (domain->domain.type == IOMMU_DOMAIN_SVA)
1389 return;
1390
1391 guard(mutex)(T: &iommu->did_lock);
1392 info = xa_load(&domain->iommu_array, index: iommu->seq_id);
1393 if (--info->refcnt == 0) {
1394 ida_free(&iommu->domain_ida, id: info->did);
1395 xa_erase(&domain->iommu_array, index: iommu->seq_id);
1396 kfree(objp: info);
1397 }
1398}
1399
1400/*
1401 * For kdump cases, old valid entries may be cached due to the
1402 * in-flight DMA and copied pgtable, but there is no unmapping
1403 * behaviour for them, thus we need an explicit cache flush for
1404 * the newly-mapped device. For kdump, at this point, the device
1405 * is supposed to finish reset at its driver probe stage, so no
1406 * in-flight DMA will exist, and we don't need to worry anymore
1407 * hereafter.
1408 */
1409static void copied_context_tear_down(struct intel_iommu *iommu,
1410 struct context_entry *context,
1411 u8 bus, u8 devfn)
1412{
1413 u16 did_old;
1414
1415 if (!context_copied(iommu, bus, devfn))
1416 return;
1417
1418 assert_spin_locked(&iommu->lock);
1419
1420 did_old = context_domain_id(c: context);
1421 context_clear_entry(context);
1422
1423 if (did_old < cap_ndoms(iommu->cap)) {
1424 iommu->flush.flush_context(iommu, did_old,
1425 PCI_DEVID(bus, devfn),
1426 DMA_CCMD_MASK_NOBIT,
1427 DMA_CCMD_DEVICE_INVL);
1428 iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1429 DMA_TLB_DSI_FLUSH);
1430 }
1431
1432 clear_context_copied(iommu, bus, devfn);
1433}
1434
1435/*
1436 * It's a non-present to present mapping. If hardware doesn't cache
1437 * non-present entry we only need to flush the write-buffer. If the
1438 * _does_ cache non-present entries, then it does so in the special
1439 * domain #0, which we have to flush:
1440 */
1441static void context_present_cache_flush(struct intel_iommu *iommu, u16 did,
1442 u8 bus, u8 devfn)
1443{
1444 if (cap_caching_mode(iommu->cap)) {
1445 iommu->flush.flush_context(iommu, 0,
1446 PCI_DEVID(bus, devfn),
1447 DMA_CCMD_MASK_NOBIT,
1448 DMA_CCMD_DEVICE_INVL);
1449 iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
1450 } else {
1451 iommu_flush_write_buffer(iommu);
1452 }
1453}
1454
1455static int domain_context_mapping_one(struct dmar_domain *domain,
1456 struct intel_iommu *iommu,
1457 u8 bus, u8 devfn)
1458{
1459 struct device_domain_info *info =
1460 domain_lookup_dev_info(domain, iommu, bus, devfn);
1461 u16 did = domain_id_iommu(domain, iommu);
1462 int translation = CONTEXT_TT_MULTI_LEVEL;
1463 struct dma_pte *pgd = domain->pgd;
1464 struct context_entry *context;
1465 int ret;
1466
1467 if (WARN_ON(!intel_domain_is_ss_paging(domain)))
1468 return -EINVAL;
1469
1470 pr_debug("Set context mapping for %02x:%02x.%d\n",
1471 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1472
1473 spin_lock(lock: &iommu->lock);
1474 ret = -ENOMEM;
1475 context = iommu_context_addr(iommu, bus, devfn, alloc: 1);
1476 if (!context)
1477 goto out_unlock;
1478
1479 ret = 0;
1480 if (context_present(context) && !context_copied(iommu, bus, devfn))
1481 goto out_unlock;
1482
1483 copied_context_tear_down(iommu, context, bus, devfn);
1484 context_clear_entry(context);
1485 context_set_domain_id(context, value: did);
1486
1487 if (info && info->ats_supported)
1488 translation = CONTEXT_TT_DEV_IOTLB;
1489 else
1490 translation = CONTEXT_TT_MULTI_LEVEL;
1491
1492 context_set_address_root(context, virt_to_phys(address: pgd));
1493 context_set_address_width(context, value: domain->agaw);
1494 context_set_translation_type(context, value: translation);
1495 context_set_fault_enable(context);
1496 context_set_present(context);
1497 if (!ecap_coherent(iommu->ecap))
1498 clflush_cache_range(addr: context, size: sizeof(*context));
1499 context_present_cache_flush(iommu, did, bus, devfn);
1500 ret = 0;
1501
1502out_unlock:
1503 spin_unlock(lock: &iommu->lock);
1504
1505 return ret;
1506}
1507
1508static int domain_context_mapping_cb(struct pci_dev *pdev,
1509 u16 alias, void *opaque)
1510{
1511 struct device_domain_info *info = dev_iommu_priv_get(dev: &pdev->dev);
1512 struct intel_iommu *iommu = info->iommu;
1513 struct dmar_domain *domain = opaque;
1514
1515 return domain_context_mapping_one(domain, iommu,
1516 PCI_BUS_NUM(alias), devfn: alias & 0xff);
1517}
1518
1519static int
1520domain_context_mapping(struct dmar_domain *domain, struct device *dev)
1521{
1522 struct device_domain_info *info = dev_iommu_priv_get(dev);
1523 struct intel_iommu *iommu = info->iommu;
1524 u8 bus = info->bus, devfn = info->devfn;
1525 int ret;
1526
1527 if (!dev_is_pci(dev))
1528 return domain_context_mapping_one(domain, iommu, bus, devfn);
1529
1530 ret = pci_for_each_dma_alias(to_pci_dev(dev),
1531 fn: domain_context_mapping_cb, data: domain);
1532 if (ret)
1533 return ret;
1534
1535 iommu_enable_pci_ats(info);
1536
1537 return 0;
1538}
1539
1540/* Return largest possible superpage level for a given mapping */
1541static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
1542 unsigned long phy_pfn, unsigned long pages)
1543{
1544 int support, level = 1;
1545 unsigned long pfnmerge;
1546
1547 support = domain->iommu_superpage;
1548
1549 /* To use a large page, the virtual *and* physical addresses
1550 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1551 of them will mean we have to use smaller pages. So just
1552 merge them and check both at once. */
1553 pfnmerge = iov_pfn | phy_pfn;
1554
1555 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1556 pages >>= VTD_STRIDE_SHIFT;
1557 if (!pages)
1558 break;
1559 pfnmerge >>= VTD_STRIDE_SHIFT;
1560 level++;
1561 support--;
1562 }
1563 return level;
1564}
1565
1566/*
1567 * Ensure that old small page tables are removed to make room for superpage(s).
1568 * We're going to add new large pages, so make sure we don't remove their parent
1569 * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
1570 */
1571static void switch_to_super_page(struct dmar_domain *domain,
1572 unsigned long start_pfn,
1573 unsigned long end_pfn, int level)
1574{
1575 unsigned long lvl_pages = lvl_to_nr_pages(lvl: level);
1576 struct dma_pte *pte = NULL;
1577
1578 if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) ||
1579 !IS_ALIGNED(end_pfn + 1, lvl_pages)))
1580 return;
1581
1582 while (start_pfn <= end_pfn) {
1583 if (!pte)
1584 pte = pfn_to_dma_pte(domain, pfn: start_pfn, target_level: &level,
1585 GFP_ATOMIC);
1586
1587 if (dma_pte_present(pte)) {
1588 dma_pte_free_pagetable(domain, start_pfn,
1589 last_pfn: start_pfn + lvl_pages - 1,
1590 retain_level: level + 1);
1591
1592 cache_tag_flush_range(domain, start: start_pfn << VTD_PAGE_SHIFT,
1593 end: end_pfn << VTD_PAGE_SHIFT, ih: 0);
1594 }
1595
1596 pte++;
1597 start_pfn += lvl_pages;
1598 if (first_pte_in_page(pte))
1599 pte = NULL;
1600 }
1601}
1602
1603static int
1604__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1605 unsigned long phys_pfn, unsigned long nr_pages, int prot,
1606 gfp_t gfp)
1607{
1608 struct dma_pte *first_pte = NULL, *pte = NULL;
1609 unsigned int largepage_lvl = 0;
1610 unsigned long lvl_pages = 0;
1611 phys_addr_t pteval;
1612 u64 attr;
1613
1614 if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
1615 return -EINVAL;
1616
1617 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1618 return -EINVAL;
1619
1620 if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
1621 pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
1622 return -EINVAL;
1623 }
1624
1625 attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
1626 if (domain->use_first_level) {
1627 attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
1628 if (prot & DMA_PTE_WRITE)
1629 attr |= DMA_FL_PTE_DIRTY;
1630 }
1631
1632 domain->has_mappings = true;
1633
1634 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
1635
1636 while (nr_pages > 0) {
1637 uint64_t tmp;
1638
1639 if (!pte) {
1640 largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
1641 phy_pfn: phys_pfn, pages: nr_pages);
1642
1643 pte = pfn_to_dma_pte(domain, pfn: iov_pfn, target_level: &largepage_lvl,
1644 gfp);
1645 if (!pte)
1646 return -ENOMEM;
1647 first_pte = pte;
1648
1649 lvl_pages = lvl_to_nr_pages(lvl: largepage_lvl);
1650
1651 /* It is large page*/
1652 if (largepage_lvl > 1) {
1653 unsigned long end_pfn;
1654 unsigned long pages_to_remove;
1655
1656 pteval |= DMA_PTE_LARGE_PAGE;
1657 pages_to_remove = min_t(unsigned long,
1658 round_down(nr_pages, lvl_pages),
1659 nr_pte_to_next_page(pte) * lvl_pages);
1660 end_pfn = iov_pfn + pages_to_remove - 1;
1661 switch_to_super_page(domain, start_pfn: iov_pfn, end_pfn, level: largepage_lvl);
1662 } else {
1663 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1664 }
1665
1666 }
1667 /* We don't need lock here, nobody else
1668 * touches the iova range
1669 */
1670 tmp = 0ULL;
1671 if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
1672 static int dumps = 5;
1673 pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1674 iov_pfn, tmp, (unsigned long long)pteval);
1675 if (dumps) {
1676 dumps--;
1677 debug_dma_dump_mappings(NULL);
1678 }
1679 WARN_ON(1);
1680 }
1681
1682 nr_pages -= lvl_pages;
1683 iov_pfn += lvl_pages;
1684 phys_pfn += lvl_pages;
1685 pteval += lvl_pages * VTD_PAGE_SIZE;
1686
1687 /* If the next PTE would be the first in a new page, then we
1688 * need to flush the cache on the entries we've just written.
1689 * And then we'll need to recalculate 'pte', so clear it and
1690 * let it get set again in the if (!pte) block above.
1691 *
1692 * If we're done (!nr_pages) we need to flush the cache too.
1693 *
1694 * Also if we've been setting superpages, we may need to
1695 * recalculate 'pte' and switch back to smaller pages for the
1696 * end of the mapping, if the trailing size is not enough to
1697 * use another superpage (i.e. nr_pages < lvl_pages).
1698 */
1699 pte++;
1700 if (!nr_pages || first_pte_in_page(pte) ||
1701 (largepage_lvl > 1 && nr_pages < lvl_pages)) {
1702 domain_flush_cache(domain, addr: first_pte,
1703 size: (void *)pte - (void *)first_pte);
1704 pte = NULL;
1705 }
1706 }
1707
1708 return 0;
1709}
1710
1711static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
1712{
1713 struct intel_iommu *iommu = info->iommu;
1714 struct context_entry *context;
1715 u16 did;
1716
1717 spin_lock(lock: &iommu->lock);
1718 context = iommu_context_addr(iommu, bus, devfn, alloc: 0);
1719 if (!context) {
1720 spin_unlock(lock: &iommu->lock);
1721 return;
1722 }
1723
1724 did = context_domain_id(c: context);
1725 context_clear_entry(context);
1726 __iommu_flush_cache(iommu, addr: context, size: sizeof(*context));
1727 spin_unlock(lock: &iommu->lock);
1728 intel_context_flush_no_pasid(info, context, did);
1729}
1730
1731int __domain_setup_first_level(struct intel_iommu *iommu, struct device *dev,
1732 ioasid_t pasid, u16 did, phys_addr_t fsptptr,
1733 int flags, struct iommu_domain *old)
1734{
1735 if (!old)
1736 return intel_pasid_setup_first_level(iommu, dev, fsptptr, pasid,
1737 did, flags);
1738 return intel_pasid_replace_first_level(iommu, dev, fsptptr, pasid, did,
1739 old_did: iommu_domain_did(domain: old, iommu),
1740 flags);
1741}
1742
1743static int domain_setup_second_level(struct intel_iommu *iommu,
1744 struct dmar_domain *domain,
1745 struct device *dev, ioasid_t pasid,
1746 struct iommu_domain *old)
1747{
1748 if (!old)
1749 return intel_pasid_setup_second_level(iommu, domain,
1750 dev, pasid);
1751 return intel_pasid_replace_second_level(iommu, domain, dev,
1752 old_did: iommu_domain_did(domain: old, iommu),
1753 pasid);
1754}
1755
1756static int domain_setup_passthrough(struct intel_iommu *iommu,
1757 struct device *dev, ioasid_t pasid,
1758 struct iommu_domain *old)
1759{
1760 if (!old)
1761 return intel_pasid_setup_pass_through(iommu, dev, pasid);
1762 return intel_pasid_replace_pass_through(iommu, dev,
1763 old_did: iommu_domain_did(domain: old, iommu),
1764 pasid);
1765}
1766
1767static int domain_setup_first_level(struct intel_iommu *iommu,
1768 struct dmar_domain *domain,
1769 struct device *dev,
1770 u32 pasid, struct iommu_domain *old)
1771{
1772 struct dma_pte *pgd = domain->pgd;
1773 int level, flags = 0;
1774
1775 level = agaw_to_level(agaw: domain->agaw);
1776 if (level != 4 && level != 5)
1777 return -EINVAL;
1778
1779 if (level == 5)
1780 flags |= PASID_FLAG_FL5LP;
1781
1782 if (domain->force_snooping)
1783 flags |= PASID_FLAG_PAGE_SNOOP;
1784
1785 return __domain_setup_first_level(iommu, dev, pasid,
1786 did: domain_id_iommu(domain, iommu),
1787 __pa(pgd), flags, old);
1788}
1789
1790static int dmar_domain_attach_device(struct dmar_domain *domain,
1791 struct device *dev)
1792{
1793 struct device_domain_info *info = dev_iommu_priv_get(dev);
1794 struct intel_iommu *iommu = info->iommu;
1795 unsigned long flags;
1796 int ret;
1797
1798 ret = domain_attach_iommu(domain, iommu);
1799 if (ret)
1800 return ret;
1801
1802 info->domain = domain;
1803 info->domain_attached = true;
1804 spin_lock_irqsave(&domain->lock, flags);
1805 list_add(new: &info->link, head: &domain->devices);
1806 spin_unlock_irqrestore(lock: &domain->lock, flags);
1807
1808 if (dev_is_real_dma_subdevice(dev))
1809 return 0;
1810
1811 if (!sm_supported(iommu))
1812 ret = domain_context_mapping(domain, dev);
1813 else if (intel_domain_is_fs_paging(domain))
1814 ret = domain_setup_first_level(iommu, domain, dev,
1815 IOMMU_NO_PASID, NULL);
1816 else if (intel_domain_is_ss_paging(domain))
1817 ret = domain_setup_second_level(iommu, domain, dev,
1818 IOMMU_NO_PASID, NULL);
1819 else if (WARN_ON(true))
1820 ret = -EINVAL;
1821
1822 if (ret)
1823 goto out_block_translation;
1824
1825 ret = cache_tag_assign_domain(domain, dev, IOMMU_NO_PASID);
1826 if (ret)
1827 goto out_block_translation;
1828
1829 return 0;
1830
1831out_block_translation:
1832 device_block_translation(dev);
1833 return ret;
1834}
1835
1836/**
1837 * device_rmrr_is_relaxable - Test whether the RMRR of this device
1838 * is relaxable (ie. is allowed to be not enforced under some conditions)
1839 * @dev: device handle
1840 *
1841 * We assume that PCI USB devices with RMRRs have them largely
1842 * for historical reasons and that the RMRR space is not actively used post
1843 * boot. This exclusion may change if vendors begin to abuse it.
1844 *
1845 * The same exception is made for graphics devices, with the requirement that
1846 * any use of the RMRR regions will be torn down before assigning the device
1847 * to a guest.
1848 *
1849 * Return: true if the RMRR is relaxable, false otherwise
1850 */
1851static bool device_rmrr_is_relaxable(struct device *dev)
1852{
1853 struct pci_dev *pdev;
1854
1855 if (!dev_is_pci(dev))
1856 return false;
1857
1858 pdev = to_pci_dev(dev);
1859 if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
1860 return true;
1861 else
1862 return false;
1863}
1864
1865static int device_def_domain_type(struct device *dev)
1866{
1867 struct device_domain_info *info = dev_iommu_priv_get(dev);
1868 struct intel_iommu *iommu = info->iommu;
1869
1870 /*
1871 * Hardware does not support the passthrough translation mode.
1872 * Always use a dynamaic mapping domain.
1873 */
1874 if (!ecap_pass_through(iommu->ecap))
1875 return IOMMU_DOMAIN_DMA;
1876
1877 if (dev_is_pci(dev)) {
1878 struct pci_dev *pdev = to_pci_dev(dev);
1879
1880 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
1881 return IOMMU_DOMAIN_IDENTITY;
1882 }
1883
1884 return 0;
1885}
1886
1887static void intel_iommu_init_qi(struct intel_iommu *iommu)
1888{
1889 /*
1890 * Start from the sane iommu hardware state.
1891 * If the queued invalidation is already initialized by us
1892 * (for example, while enabling interrupt-remapping) then
1893 * we got the things already rolling from a sane state.
1894 */
1895 if (!iommu->qi) {
1896 /*
1897 * Clear any previous faults.
1898 */
1899 dmar_fault(irq: -1, dev_id: iommu);
1900 /*
1901 * Disable queued invalidation if supported and already enabled
1902 * before OS handover.
1903 */
1904 dmar_disable_qi(iommu);
1905 }
1906
1907 if (dmar_enable_qi(iommu)) {
1908 /*
1909 * Queued Invalidate not enabled, use Register Based Invalidate
1910 */
1911 iommu->flush.flush_context = __iommu_flush_context;
1912 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1913 pr_info("%s: Using Register based invalidation\n",
1914 iommu->name);
1915 } else {
1916 iommu->flush.flush_context = qi_flush_context;
1917 iommu->flush.flush_iotlb = qi_flush_iotlb;
1918 pr_info("%s: Using Queued invalidation\n", iommu->name);
1919 }
1920}
1921
1922static int copy_context_table(struct intel_iommu *iommu,
1923 struct root_entry *old_re,
1924 struct context_entry **tbl,
1925 int bus, bool ext)
1926{
1927 int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
1928 struct context_entry *new_ce = NULL, ce;
1929 struct context_entry *old_ce = NULL;
1930 struct root_entry re;
1931 phys_addr_t old_ce_phys;
1932
1933 tbl_idx = ext ? bus * 2 : bus;
1934 memcpy(to: &re, from: old_re, len: sizeof(re));
1935
1936 for (devfn = 0; devfn < 256; devfn++) {
1937 /* First calculate the correct index */
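		/*
		 * In extended (scalable-mode) root tables each bus has a lower
		 * and an upper context table and every devfn occupies two
		 * context_entry slots. For example, devfn 0x85 ends up in the
		 * upper table (stored at tbl[bus * 2 + 1]) at
		 * idx = (0x85 * 2) % 256 = 0x0a.
		 */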
1938 idx = (ext ? devfn * 2 : devfn) % 256;
1939
1940 if (idx == 0) {
1941 /* First save what we may have and clean up */
1942 if (new_ce) {
1943 tbl[tbl_idx] = new_ce;
1944 __iommu_flush_cache(iommu, addr: new_ce,
1945 VTD_PAGE_SIZE);
1946 pos = 1;
1947 }
1948
1949 if (old_ce)
1950 memunmap(addr: old_ce);
1951
1952 ret = 0;
1953 if (devfn < 0x80)
1954 old_ce_phys = root_entry_lctp(re: &re);
1955 else
1956 old_ce_phys = root_entry_uctp(re: &re);
1957
1958 if (!old_ce_phys) {
1959 if (ext && devfn == 0) {
1960 /* No LCTP, try UCTP */
1961 devfn = 0x7f;
1962 continue;
1963 } else {
1964 goto out;
1965 }
1966 }
1967
1968 ret = -ENOMEM;
1969 old_ce = memremap(offset: old_ce_phys, PAGE_SIZE,
1970 flags: MEMREMAP_WB);
1971 if (!old_ce)
1972 goto out;
1973
1974 new_ce = iommu_alloc_pages_node_sz(nid: iommu->node,
1975 GFP_KERNEL, SZ_4K);
1976 if (!new_ce)
1977 goto out_unmap;
1978
1979 ret = 0;
1980 }
1981
1982 /* Now copy the context entry */
1983 memcpy(to: &ce, from: old_ce + idx, len: sizeof(ce));
1984
1985 if (!context_present(context: &ce))
1986 continue;
1987
1988 did = context_domain_id(c: &ce);
1989 if (did >= 0 && did < cap_ndoms(iommu->cap))
1990 ida_alloc_range(&iommu->domain_ida, min: did, max: did, GFP_KERNEL);
1991
1992 set_context_copied(iommu, bus, devfn);
1993 new_ce[idx] = ce;
1994 }
1995
1996 tbl[tbl_idx + pos] = new_ce;
1997
1998 __iommu_flush_cache(iommu, addr: new_ce, VTD_PAGE_SIZE);
1999
2000out_unmap:
2001 memunmap(addr: old_ce);
2002
2003out:
2004 return ret;
2005}
2006
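/*
 * Copy the root and context tables left behind by the previous kernel so
 * that DMA it set up does not immediately fault while this (kdump) kernel
 * takes over. The old tables are only read through memremap(); fresh copies
 * are allocated here and installed into the root table under iommu->lock.
 */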
2007static int copy_translation_tables(struct intel_iommu *iommu)
2008{
2009 struct context_entry **ctxt_tbls;
2010 struct root_entry *old_rt;
2011 phys_addr_t old_rt_phys;
2012 int ctxt_table_entries;
2013 u64 rtaddr_reg;
2014 int bus, ret;
2015 bool new_ext, ext;
2016
2017 rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2018 ext = !!(rtaddr_reg & DMA_RTADDR_SMT);
2019 new_ext = !!sm_supported(iommu);
2020
2021 /*
2022 * The RTT bit can only be changed when translation is disabled,
2023 * but disabling translation means to open a window for data
2024 * corruption. So bail out and don't copy anything if we would
2025 * have to change the bit.
2026 */
2027 if (new_ext != ext)
2028 return -EINVAL;
2029
2030 iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2031 if (!iommu->copied_tables)
2032 return -ENOMEM;
2033
2034 old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2035 if (!old_rt_phys)
2036 return -EINVAL;
2037
2038 old_rt = memremap(offset: old_rt_phys, PAGE_SIZE, flags: MEMREMAP_WB);
2039 if (!old_rt)
2040 return -ENOMEM;
2041
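	/*
	 * In extended mode each bus has two context tables (lower and upper
	 * half of the devfn space), so reserve 512 pointer slots instead of 256.
	 */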
2042 /* This is too big for the stack - allocate it from slab */
2043 ctxt_table_entries = ext ? 512 : 256;
2044 ret = -ENOMEM;
2045 ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2046 if (!ctxt_tbls)
2047 goto out_unmap;
2048
2049 for (bus = 0; bus < 256; bus++) {
2050 ret = copy_context_table(iommu, old_re: &old_rt[bus],
2051 tbl: ctxt_tbls, bus, ext);
2052 if (ret) {
2053 pr_err("%s: Failed to copy context table for bus %d\n",
2054 iommu->name, bus);
2055 continue;
2056 }
2057 }
2058
2059 spin_lock(lock: &iommu->lock);
2060
2061 /* Context tables are copied, now write them to the root_entry table */
2062 for (bus = 0; bus < 256; bus++) {
2063 int idx = ext ? bus * 2 : bus;
2064 u64 val;
2065
2066 if (ctxt_tbls[idx]) {
2067 val = virt_to_phys(address: ctxt_tbls[idx]) | 1;
2068 iommu->root_entry[bus].lo = val;
2069 }
2070
2071 if (!ext || !ctxt_tbls[idx + 1])
2072 continue;
2073
2074 val = virt_to_phys(address: ctxt_tbls[idx + 1]) | 1;
2075 iommu->root_entry[bus].hi = val;
2076 }
2077
2078 spin_unlock(lock: &iommu->lock);
2079
2080 kfree(objp: ctxt_tbls);
2081
2082 __iommu_flush_cache(iommu, addr: iommu->root_entry, PAGE_SIZE);
2083
2084 ret = 0;
2085
2086out_unmap:
2087 memunmap(addr: old_rt);
2088
2089 return ret;
2090}
2091
2092static int __init init_dmars(void)
2093{
2094 struct dmar_drhd_unit *drhd;
2095 struct intel_iommu *iommu;
2096 int ret;
2097
2098 for_each_iommu(iommu, drhd) {
2099 if (drhd->ignored) {
2100 iommu_disable_translation(iommu);
2101 continue;
2102 }
2103
2104 /*
2105		 * Find the smallest maximum PASID size among all IOMMUs in the
2106		 * system; the system-wide PASID table must be no bigger than the
2107		 * smallest size any IOMMU supports.
2108 */
2109 if (pasid_supported(iommu)) {
2110 u32 temp = 2 << ecap_pss(iommu->ecap);
2111
2112 intel_pasid_max_id = min_t(u32, temp,
2113 intel_pasid_max_id);
2114 }
2115
2116 intel_iommu_init_qi(iommu);
2117 init_translation_status(iommu);
2118
2119 if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2120 iommu_disable_translation(iommu);
2121 clear_translation_pre_enabled(iommu);
2122 pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2123 iommu->name);
2124 }
2125
2126 /*
2127 * TBD:
2128 * we could share the same root & context tables
2129		 * among all IOMMUs; something to revisit later.
2130 */
2131 ret = iommu_alloc_root_entry(iommu);
2132 if (ret)
2133 goto free_iommu;
2134
2135 if (translation_pre_enabled(iommu)) {
2136 pr_info("Translation already enabled - trying to copy translation structures\n");
2137
2138 ret = copy_translation_tables(iommu);
2139 if (ret) {
2140 /*
2141 * We found the IOMMU with translation
2142 * enabled - but failed to copy over the
2143 * old root-entry table. Try to proceed
2144 * by disabling translation now and
2145 * allocating a clean root-entry table.
2146 * This might cause DMAR faults, but
2147 * probably the dump will still succeed.
2148 */
2149 pr_err("Failed to copy translation tables from previous kernel for %s\n",
2150 iommu->name);
2151 iommu_disable_translation(iommu);
2152 clear_translation_pre_enabled(iommu);
2153 } else {
2154 pr_info("Copied translation tables from previous kernel for %s\n",
2155 iommu->name);
2156 }
2157 }
2158
2159 intel_svm_check(iommu);
2160 }
2161
2162 /*
2163 * Now that qi is enabled on all iommus, set the root entry and flush
2164 * caches. This is required on some Intel X58 chipsets, otherwise the
2165 * flush_context function will loop forever and the boot hangs.
2166 */
2167 for_each_active_iommu(iommu, drhd) {
2168 iommu_flush_write_buffer(iommu);
2169 iommu_set_root_entry(iommu);
2170 }
2171
2172 check_tylersburg_isoch();
2173
2174 /*
2175 * for each drhd
2176 * enable fault log
2177 * global invalidate context cache
2178 * global invalidate iotlb
2179 * enable translation
2180 */
2181 for_each_iommu(iommu, drhd) {
2182 if (drhd->ignored) {
2183 /*
2184 * we always have to disable PMRs or DMA may fail on
2185 * this device
2186 */
2187 if (force_on)
2188 iommu_disable_protect_mem_regions(iommu);
2189 continue;
2190 }
2191
2192 iommu_flush_write_buffer(iommu);
2193
2194 if (ecap_prs(iommu->ecap)) {
2195 /*
2196			 * intel_iommu_enable_prq() ends up calling dmar_alloc_hwirq();
2197			 * doing so with dmar_global_lock held risks a lock-ordering
			 * problem, so drop the lock around the call.
2198 */
2199 up_write(sem: &dmar_global_lock);
2200 ret = intel_iommu_enable_prq(iommu);
2201 down_write(sem: &dmar_global_lock);
2202 if (ret)
2203 goto free_iommu;
2204 }
2205
2206 ret = dmar_set_interrupt(iommu);
2207 if (ret)
2208 goto free_iommu;
2209 }
2210
2211 return 0;
2212
2213free_iommu:
2214 for_each_active_iommu(iommu, drhd) {
2215 disable_dmar_iommu(iommu);
2216 free_dmar_iommu(iommu);
2217 }
2218
2219 return ret;
2220}
2221
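/*
 * Two passes over the DRHD units: first ignore any unit whose device scope
 * turned out to be empty, then mark units that cover only graphics devices
 * as gfx-dedicated (and ignore them entirely if the graphics IOMMU is
 * disabled).
 */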
2222static void __init init_no_remapping_devices(void)
2223{
2224 struct dmar_drhd_unit *drhd;
2225 struct device *dev;
2226 int i;
2227
2228 for_each_drhd_unit(drhd) {
2229 if (!drhd->include_all) {
2230 for_each_active_dev_scope(drhd->devices,
2231 drhd->devices_cnt, i, dev)
2232 break;
2233 /* ignore DMAR unit if no devices exist */
2234 if (i == drhd->devices_cnt)
2235 drhd->ignored = 1;
2236 }
2237 }
2238
2239 for_each_active_drhd_unit(drhd) {
2240 if (drhd->include_all)
2241 continue;
2242
2243 for_each_active_dev_scope(drhd->devices,
2244 drhd->devices_cnt, i, dev)
2245 if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
2246 break;
2247 if (i < drhd->devices_cnt)
2248 continue;
2249
2250		/* This DMAR unit covers *only* gfx devices. Mark it as dedicated
2251		   to graphics and ignore it entirely if the gfx IOMMU is disabled. */
2252 drhd->gfx_dedicated = 1;
2253 if (disable_igfx_iommu)
2254 drhd->ignored = 1;
2255 }
2256}
2257
2258#ifdef CONFIG_SUSPEND
2259static int init_iommu_hw(void)
2260{
2261 struct dmar_drhd_unit *drhd;
2262 struct intel_iommu *iommu = NULL;
2263 int ret;
2264
2265 for_each_active_iommu(iommu, drhd) {
2266 if (iommu->qi) {
2267 ret = dmar_reenable_qi(iommu);
2268 if (ret)
2269 return ret;
2270 }
2271 }
2272
2273 for_each_iommu(iommu, drhd) {
2274 if (drhd->ignored) {
2275 /*
2276 * we always have to disable PMRs or DMA may fail on
2277 * this device
2278 */
2279 if (force_on)
2280 iommu_disable_protect_mem_regions(iommu);
2281 continue;
2282 }
2283
2284 iommu_flush_write_buffer(iommu);
2285 iommu_set_root_entry(iommu);
2286 iommu_enable_translation(iommu);
2287 iommu_disable_protect_mem_regions(iommu);
2288 }
2289
2290 return 0;
2291}
2292
2293static void iommu_flush_all(void)
2294{
2295 struct dmar_drhd_unit *drhd;
2296 struct intel_iommu *iommu;
2297
2298 for_each_active_iommu(iommu, drhd) {
2299 iommu->flush.flush_context(iommu, 0, 0, 0,
2300 DMA_CCMD_GLOBAL_INVL);
2301 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2302 DMA_TLB_GLOBAL_FLUSH);
2303 }
2304}
2305
2306static int iommu_suspend(void)
2307{
2308 struct dmar_drhd_unit *drhd;
2309 struct intel_iommu *iommu = NULL;
2310 unsigned long flag;
2311
2312 iommu_flush_all();
2313
2314 for_each_active_iommu(iommu, drhd) {
2315 iommu_disable_translation(iommu);
2316
2317 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2318
2319 iommu->iommu_state[SR_DMAR_FECTL_REG] =
2320 readl(addr: iommu->reg + DMAR_FECTL_REG);
2321 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
2322 readl(addr: iommu->reg + DMAR_FEDATA_REG);
2323 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
2324 readl(addr: iommu->reg + DMAR_FEADDR_REG);
2325 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
2326 readl(addr: iommu->reg + DMAR_FEUADDR_REG);
2327
2328 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2329 }
2330 return 0;
2331}
2332
2333static void iommu_resume(void)
2334{
2335 struct dmar_drhd_unit *drhd;
2336 struct intel_iommu *iommu = NULL;
2337 unsigned long flag;
2338
2339 if (init_iommu_hw()) {
2340 if (force_on)
2341 panic(fmt: "tboot: IOMMU setup failed, DMAR can not resume!\n");
2342 else
2343 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
2344 return;
2345 }
2346
2347 for_each_active_iommu(iommu, drhd) {
2348
2349 raw_spin_lock_irqsave(&iommu->register_lock, flag);
2350
2351 writel(val: iommu->iommu_state[SR_DMAR_FECTL_REG],
2352 addr: iommu->reg + DMAR_FECTL_REG);
2353 writel(val: iommu->iommu_state[SR_DMAR_FEDATA_REG],
2354 addr: iommu->reg + DMAR_FEDATA_REG);
2355 writel(val: iommu->iommu_state[SR_DMAR_FEADDR_REG],
2356 addr: iommu->reg + DMAR_FEADDR_REG);
2357 writel(val: iommu->iommu_state[SR_DMAR_FEUADDR_REG],
2358 addr: iommu->reg + DMAR_FEUADDR_REG);
2359
2360 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
2361 }
2362}
2363
2364static struct syscore_ops iommu_syscore_ops = {
2365 .resume = iommu_resume,
2366 .suspend = iommu_suspend,
2367};
2368
2369static void __init init_iommu_pm_ops(void)
2370{
2371 register_syscore_ops(ops: &iommu_syscore_ops);
2372}
2373
2374#else
2375static inline void init_iommu_pm_ops(void) {}
2376#endif /* CONFIG_SUSPEND */
2377
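/*
 * A well-formed RMRR describes a non-empty, page-aligned, inclusive range,
 * e.g. base 0x000e0000 with end 0x000fffff (end + 1 == 0x00100000 is page
 * aligned). Anything else is firmware breakage and is rejected here.
 */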
2378static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
2379{
2380 if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
2381 !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
2382 rmrr->end_address <= rmrr->base_address ||
2383 arch_rmrr_sanity_check(rmrr))
2384 return -EINVAL;
2385
2386 return 0;
2387}
2388
2389int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
2390{
2391 struct acpi_dmar_reserved_memory *rmrr;
2392 struct dmar_rmrr_unit *rmrru;
2393
2394 rmrr = (struct acpi_dmar_reserved_memory *)header;
2395 if (rmrr_sanity_check(rmrr)) {
2396 pr_warn(FW_BUG
2397 "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
2398 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2399 rmrr->base_address, rmrr->end_address,
2400 dmi_get_system_info(DMI_BIOS_VENDOR),
2401 dmi_get_system_info(DMI_BIOS_VERSION),
2402 dmi_get_system_info(DMI_PRODUCT_VERSION));
2403 add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
2404 }
2405
2406 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
2407 if (!rmrru)
2408 goto out;
2409
2410 rmrru->hdr = header;
2411
2412 rmrru->base_address = rmrr->base_address;
2413 rmrru->end_address = rmrr->end_address;
2414
2415 rmrru->devices = dmar_alloc_dev_scope(start: (void *)(rmrr + 1),
2416 end: ((void *)rmrr) + rmrr->header.length,
2417 cnt: &rmrru->devices_cnt);
2418 if (rmrru->devices_cnt && rmrru->devices == NULL)
2419 goto free_rmrru;
2420
2421 list_add(new: &rmrru->list, head: &dmar_rmrr_units);
2422
2423 return 0;
2424free_rmrru:
2425 kfree(objp: rmrru);
2426out:
2427 return -ENOMEM;
2428}
2429
2430static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
2431{
2432 struct dmar_atsr_unit *atsru;
2433 struct acpi_dmar_atsr *tmp;
2434
2435 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
2436 dmar_rcu_check()) {
2437 tmp = (struct acpi_dmar_atsr *)atsru->hdr;
2438 if (atsr->segment != tmp->segment)
2439 continue;
2440 if (atsr->header.length != tmp->header.length)
2441 continue;
2442 if (memcmp(atsr, tmp, atsr->header.length) == 0)
2443 return atsru;
2444 }
2445
2446 return NULL;
2447}
2448
2449int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2450{
2451 struct acpi_dmar_atsr *atsr;
2452 struct dmar_atsr_unit *atsru;
2453
2454 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2455 return 0;
2456
2457 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2458 atsru = dmar_find_atsr(atsr);
2459 if (atsru)
2460 return 0;
2461
2462 atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
2463 if (!atsru)
2464 return -ENOMEM;
2465
2466 /*
2467	 * The header may live in a buffer allocated from slab by an ACPI _DSM
2468	 * method and freed on return, so copy its content into our own
2469	 * allocation.
2470 */
2471 atsru->hdr = (void *)(atsru + 1);
2472 memcpy(to: atsru->hdr, from: hdr, len: hdr->length);
2473 atsru->include_all = atsr->flags & 0x1;
2474 if (!atsru->include_all) {
2475 atsru->devices = dmar_alloc_dev_scope(start: (void *)(atsr + 1),
2476 end: (void *)atsr + atsr->header.length,
2477 cnt: &atsru->devices_cnt);
2478 if (atsru->devices_cnt && atsru->devices == NULL) {
2479 kfree(objp: atsru);
2480 return -ENOMEM;
2481 }
2482 }
2483
2484 list_add_rcu(new: &atsru->list, head: &dmar_atsr_units);
2485
2486 return 0;
2487}
2488
2489static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
2490{
2491 dmar_free_dev_scope(devices: &atsru->devices, cnt: &atsru->devices_cnt);
2492 kfree(objp: atsru);
2493}
2494
2495int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2496{
2497 struct acpi_dmar_atsr *atsr;
2498 struct dmar_atsr_unit *atsru;
2499
2500 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2501 atsru = dmar_find_atsr(atsr);
2502 if (atsru) {
2503 list_del_rcu(entry: &atsru->list);
2504 synchronize_rcu();
2505 intel_iommu_free_atsr(atsru);
2506 }
2507
2508 return 0;
2509}
2510
2511int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
2512{
2513 int i;
2514 struct device *dev;
2515 struct acpi_dmar_atsr *atsr;
2516 struct dmar_atsr_unit *atsru;
2517
2518 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
2519 atsru = dmar_find_atsr(atsr);
2520 if (!atsru)
2521 return 0;
2522
2523 if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
2524 for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
2525 i, dev)
2526 return -EBUSY;
2527 }
2528
2529 return 0;
2530}
2531
2532static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
2533{
2534 struct dmar_satc_unit *satcu;
2535 struct acpi_dmar_satc *tmp;
2536
2537 list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
2538 dmar_rcu_check()) {
2539 tmp = (struct acpi_dmar_satc *)satcu->hdr;
2540 if (satc->segment != tmp->segment)
2541 continue;
2542 if (satc->header.length != tmp->header.length)
2543 continue;
2544 if (memcmp(satc, tmp, satc->header.length) == 0)
2545 return satcu;
2546 }
2547
2548 return NULL;
2549}
2550
2551int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
2552{
2553 struct acpi_dmar_satc *satc;
2554 struct dmar_satc_unit *satcu;
2555
2556 if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
2557 return 0;
2558
2559 satc = container_of(hdr, struct acpi_dmar_satc, header);
2560 satcu = dmar_find_satc(satc);
2561 if (satcu)
2562 return 0;
2563
2564 satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
2565 if (!satcu)
2566 return -ENOMEM;
2567
2568 satcu->hdr = (void *)(satcu + 1);
2569 memcpy(to: satcu->hdr, from: hdr, len: hdr->length);
2570 satcu->atc_required = satc->flags & 0x1;
2571 satcu->devices = dmar_alloc_dev_scope(start: (void *)(satc + 1),
2572 end: (void *)satc + satc->header.length,
2573 cnt: &satcu->devices_cnt);
2574 if (satcu->devices_cnt && !satcu->devices) {
2575 kfree(objp: satcu);
2576 return -ENOMEM;
2577 }
2578 list_add_rcu(new: &satcu->list, head: &dmar_satc_units);
2579
2580 return 0;
2581}
2582
2583static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
2584{
2585 struct intel_iommu *iommu = dmaru->iommu;
2586 int ret;
2587
2588 /*
2589 * Disable translation if already enabled prior to OS handover.
2590 */
2591 if (iommu->gcmd & DMA_GCMD_TE)
2592 iommu_disable_translation(iommu);
2593
2594 ret = iommu_alloc_root_entry(iommu);
2595 if (ret)
2596 goto out;
2597
2598 intel_svm_check(iommu);
2599
2600 if (dmaru->ignored) {
2601 /*
2602 * we always have to disable PMRs or DMA may fail on this device
2603 */
2604 if (force_on)
2605 iommu_disable_protect_mem_regions(iommu);
2606 return 0;
2607 }
2608
2609 intel_iommu_init_qi(iommu);
2610 iommu_flush_write_buffer(iommu);
2611
2612 if (ecap_prs(iommu->ecap)) {
2613 ret = intel_iommu_enable_prq(iommu);
2614 if (ret)
2615 goto disable_iommu;
2616 }
2617
2618 ret = dmar_set_interrupt(iommu);
2619 if (ret)
2620 goto disable_iommu;
2621
2622 iommu_set_root_entry(iommu);
2623 iommu_enable_translation(iommu);
2624
2625 iommu_disable_protect_mem_regions(iommu);
2626 return 0;
2627
2628disable_iommu:
2629 disable_dmar_iommu(iommu);
2630out:
2631 free_dmar_iommu(iommu);
2632 return ret;
2633}
2634
2635int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
2636{
2637 int ret = 0;
2638 struct intel_iommu *iommu = dmaru->iommu;
2639
2640 if (!intel_iommu_enabled)
2641 return 0;
2642	if (!iommu)
2643 return -EINVAL;
2644
2645 if (insert) {
2646 ret = intel_iommu_add(dmaru);
2647 } else {
2648 disable_dmar_iommu(iommu);
2649 free_dmar_iommu(iommu);
2650 }
2651
2652 return ret;
2653}
2654
2655static void intel_iommu_free_dmars(void)
2656{
2657 struct dmar_rmrr_unit *rmrru, *rmrr_n;
2658 struct dmar_atsr_unit *atsru, *atsr_n;
2659 struct dmar_satc_unit *satcu, *satc_n;
2660
2661 list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
2662 list_del(entry: &rmrru->list);
2663 dmar_free_dev_scope(devices: &rmrru->devices, cnt: &rmrru->devices_cnt);
2664 kfree(objp: rmrru);
2665 }
2666
2667 list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
2668 list_del(entry: &atsru->list);
2669 intel_iommu_free_atsr(atsru);
2670 }
2671 list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
2672 list_del(entry: &satcu->list);
2673 dmar_free_dev_scope(devices: &satcu->devices, cnt: &satcu->devices_cnt);
2674 kfree(objp: satcu);
2675 }
2676}
2677
2678static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
2679{
2680 struct dmar_satc_unit *satcu;
2681 struct acpi_dmar_satc *satc;
2682 struct device *tmp;
2683 int i;
2684
2685 rcu_read_lock();
2686
2687 list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
2688 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2689 if (satc->segment != pci_domain_nr(bus: dev->bus))
2690 continue;
2691 for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
2692 if (to_pci_dev(tmp) == dev)
2693 goto out;
2694 }
2695 satcu = NULL;
2696out:
2697 rcu_read_unlock();
2698 return satcu;
2699}
2700
2701static bool dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
2702{
2703 struct pci_dev *bridge = NULL;
2704 struct dmar_atsr_unit *atsru;
2705 struct dmar_satc_unit *satcu;
2706 struct acpi_dmar_atsr *atsr;
2707 bool supported = true;
2708 struct pci_bus *bus;
2709 struct device *tmp;
2710 int i;
2711
2712 dev = pci_physfn(dev);
2713 satcu = dmar_find_matched_satc_unit(dev);
2714 if (satcu)
2715 /*
2716		 * This device supports ATS because it is listed in the SATC
2717		 * table. When the IOMMU is in legacy mode, hardware enables
2718		 * ATS automatically for devices that require it, so the OS
2719		 * must not enable ATS on such a device; doing so would lead
2720		 * to duplicated TLB invalidations.
2721 */
2722 return !(satcu->atc_required && !sm_supported(iommu));
2723
2724 for (bus = dev->bus; bus; bus = bus->parent) {
2725 bridge = bus->self;
2726 /* If it's an integrated device, allow ATS */
2727 if (!bridge)
2728 return true;
2729 /* Connected via non-PCIe: no ATS */
2730 if (!pci_is_pcie(dev: bridge) ||
2731 pci_pcie_type(dev: bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
2732 return false;
2733 /* If we found the root port, look it up in the ATSR */
2734 if (pci_pcie_type(dev: bridge) == PCI_EXP_TYPE_ROOT_PORT)
2735 break;
2736 }
2737
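	/*
	 * We stopped at a Root Port without hitting an integrated device or a
	 * non-PCIe link; whether ATS may be enabled now depends on the ATSR
	 * entries for this segment, checked below.
	 */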
2738 rcu_read_lock();
2739 list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
2740 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2741 if (atsr->segment != pci_domain_nr(bus: dev->bus))
2742 continue;
2743
2744 for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
2745 if (tmp == &bridge->dev)
2746 goto out;
2747
2748 if (atsru->include_all)
2749 goto out;
2750 }
2751 supported = false;
2752out:
2753 rcu_read_unlock();
2754
2755 return supported;
2756}
2757
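/*
 * Keep the RMRR/ATSR/SATC device-scope lists in sync with PCI hotplug:
 * newly added devices are matched against each table's scope, removed
 * devices are dropped from it.
 */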
2758int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
2759{
2760 int ret;
2761 struct dmar_rmrr_unit *rmrru;
2762 struct dmar_atsr_unit *atsru;
2763 struct dmar_satc_unit *satcu;
2764 struct acpi_dmar_atsr *atsr;
2765 struct acpi_dmar_reserved_memory *rmrr;
2766 struct acpi_dmar_satc *satc;
2767
2768 if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
2769 return 0;
2770
2771 list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
2772 rmrr = container_of(rmrru->hdr,
2773 struct acpi_dmar_reserved_memory, header);
2774 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2775 ret = dmar_insert_dev_scope(info, start: (void *)(rmrr + 1),
2776 end: ((void *)rmrr) + rmrr->header.length,
2777 segment: rmrr->segment, devices: rmrru->devices,
2778 devices_cnt: rmrru->devices_cnt);
2779 if (ret < 0)
2780 return ret;
2781 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2782 dmar_remove_dev_scope(info, segment: rmrr->segment,
2783 devices: rmrru->devices, count: rmrru->devices_cnt);
2784 }
2785 }
2786
2787 list_for_each_entry(atsru, &dmar_atsr_units, list) {
2788 if (atsru->include_all)
2789 continue;
2790
2791 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
2792 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2793 ret = dmar_insert_dev_scope(info, start: (void *)(atsr + 1),
2794 end: (void *)atsr + atsr->header.length,
2795 segment: atsr->segment, devices: atsru->devices,
2796 devices_cnt: atsru->devices_cnt);
2797 if (ret > 0)
2798 break;
2799 else if (ret < 0)
2800 return ret;
2801 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2802 if (dmar_remove_dev_scope(info, segment: atsr->segment,
2803 devices: atsru->devices, count: atsru->devices_cnt))
2804 break;
2805 }
2806 }
2807 list_for_each_entry(satcu, &dmar_satc_units, list) {
2808 satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
2809 if (info->event == BUS_NOTIFY_ADD_DEVICE) {
2810 ret = dmar_insert_dev_scope(info, start: (void *)(satc + 1),
2811 end: (void *)satc + satc->header.length,
2812 segment: satc->segment, devices: satcu->devices,
2813 devices_cnt: satcu->devices_cnt);
2814 if (ret > 0)
2815 break;
2816 else if (ret < 0)
2817 return ret;
2818 } else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
2819 if (dmar_remove_dev_scope(info, segment: satc->segment,
2820 devices: satcu->devices, count: satcu->devices_cnt))
2821 break;
2822 }
2823 }
2824
2825 return 0;
2826}
2827
2828static void intel_disable_iommus(void)
2829{
2830 struct intel_iommu *iommu = NULL;
2831 struct dmar_drhd_unit *drhd;
2832
2833 for_each_iommu(iommu, drhd)
2834 iommu_disable_translation(iommu);
2835}
2836
2837void intel_iommu_shutdown(void)
2838{
2839 struct dmar_drhd_unit *drhd;
2840 struct intel_iommu *iommu = NULL;
2841
2842 if (no_iommu || dmar_disabled)
2843 return;
2844
2845 /*
2846	 * All other CPUs were brought down and hotplug interrupts were
2847	 * disabled, so no locking or RCU protection is needed anymore.
2848 */
2849 list_for_each_entry(drhd, &dmar_drhd_units, list) {
2850 iommu = drhd->iommu;
2851
2852 /* Disable PMRs explicitly here. */
2853 iommu_disable_protect_mem_regions(iommu);
2854
2855 /* Make sure the IOMMUs are switched off */
2856 iommu_disable_translation(iommu);
2857 }
2858}
2859
2860static struct intel_iommu *dev_to_intel_iommu(struct device *dev)
2861{
2862 struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
2863
2864 return container_of(iommu_dev, struct intel_iommu, iommu);
2865}
2866
2867static ssize_t version_show(struct device *dev,
2868 struct device_attribute *attr, char *buf)
2869{
2870 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2871 u32 ver = readl(addr: iommu->reg + DMAR_VER_REG);
2872 return sysfs_emit(buf, fmt: "%d:%d\n",
2873 DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
2874}
2875static DEVICE_ATTR_RO(version);
2876
2877static ssize_t address_show(struct device *dev,
2878 struct device_attribute *attr, char *buf)
2879{
2880 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2881 return sysfs_emit(buf, fmt: "%llx\n", iommu->reg_phys);
2882}
2883static DEVICE_ATTR_RO(address);
2884
2885static ssize_t cap_show(struct device *dev,
2886 struct device_attribute *attr, char *buf)
2887{
2888 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2889 return sysfs_emit(buf, fmt: "%llx\n", iommu->cap);
2890}
2891static DEVICE_ATTR_RO(cap);
2892
2893static ssize_t ecap_show(struct device *dev,
2894 struct device_attribute *attr, char *buf)
2895{
2896 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2897 return sysfs_emit(buf, fmt: "%llx\n", iommu->ecap);
2898}
2899static DEVICE_ATTR_RO(ecap);
2900
2901static ssize_t domains_supported_show(struct device *dev,
2902 struct device_attribute *attr, char *buf)
2903{
2904 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2905 return sysfs_emit(buf, fmt: "%ld\n", cap_ndoms(iommu->cap));
2906}
2907static DEVICE_ATTR_RO(domains_supported);
2908
2909static ssize_t domains_used_show(struct device *dev,
2910 struct device_attribute *attr, char *buf)
2911{
2912 struct intel_iommu *iommu = dev_to_intel_iommu(dev);
2913 unsigned int count = 0;
2914 int id;
2915
2916 for (id = 0; id < cap_ndoms(iommu->cap); id++)
2917 if (ida_exists(ida: &iommu->domain_ida, id))
2918 count++;
2919
2920 return sysfs_emit(buf, fmt: "%d\n", count);
2921}
2922static DEVICE_ATTR_RO(domains_used);
2923
2924static struct attribute *intel_iommu_attrs[] = {
2925 &dev_attr_version.attr,
2926 &dev_attr_address.attr,
2927 &dev_attr_cap.attr,
2928 &dev_attr_ecap.attr,
2929 &dev_attr_domains_supported.attr,
2930 &dev_attr_domains_used.attr,
2931 NULL,
2932};
2933
2934static struct attribute_group intel_iommu_group = {
2935 .name = "intel-iommu",
2936 .attrs = intel_iommu_attrs,
2937};
2938
2939const struct attribute_group *intel_iommu_groups[] = {
2940 &intel_iommu_group,
2941 NULL,
2942};
2943
2944static bool has_external_pci(void)
2945{
2946 struct pci_dev *pdev = NULL;
2947
2948 for_each_pci_dev(pdev)
2949 if (pdev->external_facing) {
2950 pci_dev_put(dev: pdev);
2951 return true;
2952 }
2953
2954 return false;
2955}
2956
2957static int __init platform_optin_force_iommu(void)
2958{
2959 if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
2960 return 0;
2961
2962 if (no_iommu || dmar_disabled)
2963 pr_info("Intel-IOMMU force enabled due to platform opt in\n");
2964
2965 /*
2966 * If Intel-IOMMU is disabled by default, we will apply identity
2967 * map for all devices except those marked as being untrusted.
2968 */
2969 if (dmar_disabled)
2970 iommu_set_default_passthrough(cmd_line: false);
2971
2972 dmar_disabled = 0;
2973 no_iommu = 0;
2974
2975 return 1;
2976}
2977
2978static int __init probe_acpi_namespace_devices(void)
2979{
2980 struct dmar_drhd_unit *drhd;
2981 /* To avoid a -Wunused-but-set-variable warning. */
2982 struct intel_iommu *iommu __maybe_unused;
2983 struct device *dev;
2984 int i, ret = 0;
2985
2986 for_each_active_iommu(iommu, drhd) {
2987 for_each_active_dev_scope(drhd->devices,
2988 drhd->devices_cnt, i, dev) {
2989 struct acpi_device_physical_node *pn;
2990 struct acpi_device *adev;
2991
2992 if (dev->bus != &acpi_bus_type)
2993 continue;
2994
2995 up_read(sem: &dmar_global_lock);
2996 adev = to_acpi_device(dev);
2997 mutex_lock(lock: &adev->physical_node_lock);
2998 list_for_each_entry(pn,
2999 &adev->physical_node_list, node) {
3000 ret = iommu_probe_device(dev: pn->dev);
3001 if (ret)
3002 break;
3003 }
3004 mutex_unlock(lock: &adev->physical_node_lock);
3005 down_read(sem: &dmar_global_lock);
3006
3007 if (ret)
3008 return ret;
3009 }
3010 }
3011
3012 return 0;
3013}
3014
3015static __init int tboot_force_iommu(void)
3016{
3017 if (!tboot_enabled())
3018 return 0;
3019
3020 if (no_iommu || dmar_disabled)
3021 pr_warn("Forcing Intel-IOMMU to enabled\n");
3022
3023 dmar_disabled = 0;
3024 no_iommu = 0;
3025
3026 return 1;
3027}
3028
3029int __init intel_iommu_init(void)
3030{
3031 int ret = -ENODEV;
3032 struct dmar_drhd_unit *drhd;
3033 struct intel_iommu *iommu;
3034
3035 /*
3036 * Intel IOMMU is required for a TXT/tboot launch or platform
3037 * opt in, so enforce that.
3038 */
3039 force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3040 platform_optin_force_iommu();
3041
3042 down_write(sem: &dmar_global_lock);
3043 if (dmar_table_init()) {
3044 if (force_on)
3045 panic(fmt: "tboot: Failed to initialize DMAR table\n");
3046 goto out_free_dmar;
3047 }
3048
3049 if (dmar_dev_scope_init() < 0) {
3050 if (force_on)
3051 panic(fmt: "tboot: Failed to initialize DMAR device scope\n");
3052 goto out_free_dmar;
3053 }
3054
3055 up_write(sem: &dmar_global_lock);
3056
3057 /*
3058 * The bus notifier takes the dmar_global_lock, so lockdep will
3059 * complain later when we register it under the lock.
3060 */
3061 dmar_register_bus_notifier();
3062
3063 down_write(sem: &dmar_global_lock);
3064
3065 if (!no_iommu)
3066 intel_iommu_debugfs_init();
3067
3068 if (no_iommu || dmar_disabled) {
3069 /*
3070		 * We exit the function here to ensure the IOMMU's remapping and
3071		 * mempool aren't set up, which means the IOMMU's PMRs won't be
3072		 * disabled via the call to init_dmars(). So disable them
3073		 * explicitly here. The PMRs were set up by tboot prior to
3074 * calling SENTER, but the kernel is expected to reset/tear
3075 * down the PMRs.
3076 */
3077 if (intel_iommu_tboot_noforce) {
3078 for_each_iommu(iommu, drhd)
3079 iommu_disable_protect_mem_regions(iommu);
3080 }
3081
3082 /*
3083 * Make sure the IOMMUs are switched off, even when we
3084 * boot into a kexec kernel and the previous kernel left
3085 * them enabled
3086 */
3087 intel_disable_iommus();
3088 goto out_free_dmar;
3089 }
3090
3091 if (list_empty(head: &dmar_rmrr_units))
3092 pr_info("No RMRR found\n");
3093
3094 if (list_empty(head: &dmar_atsr_units))
3095 pr_info("No ATSR found\n");
3096
3097 if (list_empty(head: &dmar_satc_units))
3098 pr_info("No SATC found\n");
3099
3100 init_no_remapping_devices();
3101
3102 ret = init_dmars();
3103 if (ret) {
3104 if (force_on)
3105 panic(fmt: "tboot: Failed to initialize DMARs\n");
3106 pr_err("Initialization failed\n");
3107 goto out_free_dmar;
3108 }
3109 up_write(sem: &dmar_global_lock);
3110
3111 init_iommu_pm_ops();
3112
3113 down_read(sem: &dmar_global_lock);
3114 for_each_active_iommu(iommu, drhd) {
3115 /*
3116 * The flush queue implementation does not perform
3117 * page-selective invalidations that are required for efficient
3118 * TLB flushes in virtual environments. The benefit of batching
3119 * is likely to be much lower than the overhead of synchronizing
3120 * the virtual and physical IOMMU page-tables.
3121 */
3122 if (cap_caching_mode(iommu->cap) &&
3123 !first_level_by_default(iommu)) {
3124 pr_info_once("IOMMU batching disallowed due to virtualization\n");
3125 iommu_set_dma_strict();
3126 }
3127 iommu_device_sysfs_add(iommu: &iommu->iommu, NULL,
3128 groups: intel_iommu_groups,
3129 fmt: "%s", iommu->name);
3130 /*
3131 * The iommu device probe is protected by the iommu_probe_device_lock.
3132 * Release the dmar_global_lock before entering the device probe path
3133 * to avoid unnecessary lock order splat.
3134 */
3135 up_read(sem: &dmar_global_lock);
3136 iommu_device_register(iommu: &iommu->iommu, ops: &intel_iommu_ops, NULL);
3137 down_read(sem: &dmar_global_lock);
3138
3139 iommu_pmu_register(iommu);
3140 }
3141
3142 if (probe_acpi_namespace_devices())
3143 pr_warn("ACPI name space devices didn't probe correctly\n");
3144
3145 /* Finally, we enable the DMA remapping hardware. */
3146 for_each_iommu(iommu, drhd) {
3147 if (!drhd->ignored && !translation_pre_enabled(iommu))
3148 iommu_enable_translation(iommu);
3149
3150 iommu_disable_protect_mem_regions(iommu);
3151 }
3152 up_read(sem: &dmar_global_lock);
3153
3154 pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
3155
3156 intel_iommu_enabled = 1;
3157
3158 return 0;
3159
3160out_free_dmar:
3161 intel_iommu_free_dmars();
3162 up_write(sem: &dmar_global_lock);
3163 return ret;
3164}
3165
3166static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
3167{
3168 struct device_domain_info *info = opaque;
3169
3170 domain_context_clear_one(info, PCI_BUS_NUM(alias), devfn: alias & 0xff);
3171 return 0;
3172}
3173
3174/*
3175 * NB - intel-iommu lacks any sort of reference counting for the users of
3176 * dependent devices. If multiple endpoints have intersecting dependent
3177 * devices, unbinding the driver from any one of them will possibly leave
3178 * the others unable to operate.
3179 */
3180static void domain_context_clear(struct device_domain_info *info)
3181{
3182 if (!dev_is_pci(info->dev)) {
3183 domain_context_clear_one(info, bus: info->bus, devfn: info->devfn);
3184 return;
3185 }
3186
3187 pci_for_each_dma_alias(to_pci_dev(info->dev),
3188 fn: &domain_context_clear_one_cb, data: info);
3189 iommu_disable_pci_ats(info);
3190}
3191
3192/*
3193 * Clear the page table pointer in context or pasid table entries so that
3194 * all DMA requests without PASID from the device are blocked. If the page
3195 * table has been set, clean up the data structures.
3196 */
3197void device_block_translation(struct device *dev)
3198{
3199 struct device_domain_info *info = dev_iommu_priv_get(dev);
3200 struct intel_iommu *iommu = info->iommu;
3201 unsigned long flags;
3202
3203	/* Device already in DMA blocking state. Nothing to do. */
3204 if (!info->domain_attached)
3205 return;
3206
3207 if (info->domain)
3208 cache_tag_unassign_domain(domain: info->domain, dev, IOMMU_NO_PASID);
3209
3210 if (!dev_is_real_dma_subdevice(dev)) {
3211 if (sm_supported(iommu))
3212 intel_pasid_tear_down_entry(iommu, dev,
3213 IOMMU_NO_PASID, fault_ignore: false);
3214 else
3215 domain_context_clear(info);
3216 }
3217
3218 /* Device now in DMA blocking state. */
3219 info->domain_attached = false;
3220
3221 if (!info->domain)
3222 return;
3223
3224 spin_lock_irqsave(&info->domain->lock, flags);
3225 list_del(entry: &info->link);
3226 spin_unlock_irqrestore(lock: &info->domain->lock, flags);
3227
3228 domain_detach_iommu(domain: info->domain, iommu);
3229 info->domain = NULL;
3230}
3231
3232static int blocking_domain_attach_dev(struct iommu_domain *domain,
3233 struct device *dev)
3234{
3235 struct device_domain_info *info = dev_iommu_priv_get(dev);
3236
3237 iopf_for_domain_remove(domain: info->domain ? &info->domain->domain : NULL, dev);
3238 device_block_translation(dev);
3239 return 0;
3240}
3241
3242static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
3243 struct device *dev, ioasid_t pasid,
3244 struct iommu_domain *old);
3245
3246static struct iommu_domain blocking_domain = {
3247 .type = IOMMU_DOMAIN_BLOCKED,
3248 .ops = &(const struct iommu_domain_ops) {
3249 .attach_dev = blocking_domain_attach_dev,
3250 .set_dev_pasid = blocking_domain_set_dev_pasid,
3251 }
3252};
3253
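/*
 * Returns the number of superpage levels to allow for this page-table type:
 * 0 disables superpages, 1 permits 2MiB mappings, 2 additionally permits
 * 1GiB. First-stage tables always support 2MiB; 1GiB needs FL1GP.
 */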
3254static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
3255{
3256 if (!intel_iommu_superpage)
3257 return 0;
3258
3259 if (first_stage)
3260 return cap_fl1gp_support(iommu->cap) ? 2 : 1;
3261
3262 return fls(cap_super_page_val(iommu->cap));
3263}
3264
3265static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
3266{
3267 struct device_domain_info *info = dev_iommu_priv_get(dev);
3268 struct intel_iommu *iommu = info->iommu;
3269 struct dmar_domain *domain;
3270 int addr_width;
3271
3272 domain = kzalloc(sizeof(*domain), GFP_KERNEL);
3273 if (!domain)
3274 return ERR_PTR(error: -ENOMEM);
3275
3276 INIT_LIST_HEAD(list: &domain->devices);
3277 INIT_LIST_HEAD(list: &domain->dev_pasids);
3278 INIT_LIST_HEAD(list: &domain->cache_tags);
3279 spin_lock_init(&domain->lock);
3280 spin_lock_init(&domain->cache_lock);
3281 xa_init(xa: &domain->iommu_array);
3282 INIT_LIST_HEAD(list: &domain->s1_domains);
3283 spin_lock_init(&domain->s1_lock);
3284
3285 domain->nid = dev_to_node(dev);
3286 domain->use_first_level = first_stage;
3287
3288 domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
3289
3290 /* calculate the address width */
3291 addr_width = agaw_to_width(agaw: iommu->agaw);
3292 if (addr_width > cap_mgaw(iommu->cap))
3293 addr_width = cap_mgaw(iommu->cap);
3294 domain->gaw = addr_width;
3295 domain->agaw = iommu->agaw;
3296 domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
3297
3298 /* iommu memory access coherency */
3299 domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
3300
3301 /* pagesize bitmap */
3302 domain->domain.pgsize_bitmap = SZ_4K;
3303 domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
3304 domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
3305
3306 /*
3307 * IOVA aperture: First-level translation restricts the input-address
3308 * to a canonical address (i.e., address bits 63:N have the same value
3309 * as address bit [N-1], where N is 48-bits with 4-level paging and
3310 * 57-bits with 5-level paging). Hence, skip bit [N-1].
3311 */
3312 domain->domain.geometry.force_aperture = true;
3313 domain->domain.geometry.aperture_start = 0;
3314 if (first_stage)
3315 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
3316 else
3317 domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
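	/*
	 * For example, with gaw == 48 a second-stage domain gets an aperture
	 * of [0, 2^48 - 1], while a first-stage domain is limited to
	 * [0, 2^47 - 1] so that every valid IOVA stays canonical.
	 */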
3318
3319 /* always allocate the top pgd */
3320 domain->pgd = iommu_alloc_pages_node_sz(nid: domain->nid, GFP_KERNEL, SZ_4K);
3321 if (!domain->pgd) {
3322 kfree(objp: domain);
3323 return ERR_PTR(error: -ENOMEM);
3324 }
3325 domain_flush_cache(domain, addr: domain->pgd, PAGE_SIZE);
3326
3327 return domain;
3328}
3329
3330static struct iommu_domain *
3331intel_iommu_domain_alloc_first_stage(struct device *dev,
3332 struct intel_iommu *iommu, u32 flags)
3333{
3334 struct dmar_domain *dmar_domain;
3335
3336 if (flags & ~IOMMU_HWPT_ALLOC_PASID)
3337 return ERR_PTR(error: -EOPNOTSUPP);
3338
3339 /* Only SL is available in legacy mode */
3340 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
3341 return ERR_PTR(error: -EOPNOTSUPP);
3342
3343 dmar_domain = paging_domain_alloc(dev, first_stage: true);
3344 if (IS_ERR(ptr: dmar_domain))
3345 return ERR_CAST(ptr: dmar_domain);
3346
3347 dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
3348 /*
3349 * iotlb sync for map is only needed for legacy implementations that
3350 * explicitly require flushing internal write buffers to ensure memory
3351 * coherence.
3352 */
3353 if (rwbf_required(iommu))
3354 dmar_domain->iotlb_sync_map = true;
3355
3356 return &dmar_domain->domain;
3357}
3358
3359static struct iommu_domain *
3360intel_iommu_domain_alloc_second_stage(struct device *dev,
3361 struct intel_iommu *iommu, u32 flags)
3362{
3363 struct dmar_domain *dmar_domain;
3364
3365 if (flags &
3366 (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
3367 IOMMU_HWPT_ALLOC_PASID)))
3368 return ERR_PTR(error: -EOPNOTSUPP);
3369
3370 if (((flags & IOMMU_HWPT_ALLOC_NEST_PARENT) &&
3371 !nested_supported(iommu)) ||
3372 ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) &&
3373 !ssads_supported(iommu)))
3374 return ERR_PTR(error: -EOPNOTSUPP);
3375
3376 /* Legacy mode always supports second stage */
3377 if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
3378 return ERR_PTR(error: -EOPNOTSUPP);
3379
3380 dmar_domain = paging_domain_alloc(dev, first_stage: false);
3381 if (IS_ERR(ptr: dmar_domain))
3382 return ERR_CAST(ptr: dmar_domain);
3383
3384 dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
3385 dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
3386
3387 if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
3388 dmar_domain->domain.dirty_ops = &intel_dirty_ops;
3389
3390 /*
3391 * Besides the internal write buffer flush, the caching mode used for
3392 * legacy nested translation (which utilizes shadowing page tables)
3393 * also requires iotlb sync on map.
3394 */
3395 if (rwbf_required(iommu) || cap_caching_mode(iommu->cap))
3396 dmar_domain->iotlb_sync_map = true;
3397
3398 return &dmar_domain->domain;
3399}
3400
3401static struct iommu_domain *
3402intel_iommu_domain_alloc_paging_flags(struct device *dev, u32 flags,
3403 const struct iommu_user_data *user_data)
3404{
3405 struct device_domain_info *info = dev_iommu_priv_get(dev);
3406 struct intel_iommu *iommu = info->iommu;
3407 struct iommu_domain *domain;
3408
3409 if (user_data)
3410 return ERR_PTR(error: -EOPNOTSUPP);
3411
3412 /* Prefer first stage if possible by default. */
3413 domain = intel_iommu_domain_alloc_first_stage(dev, iommu, flags);
3414 if (domain != ERR_PTR(error: -EOPNOTSUPP))
3415 return domain;
3416 return intel_iommu_domain_alloc_second_stage(dev, iommu, flags);
3417}
3418
3419static void intel_iommu_domain_free(struct iommu_domain *domain)
3420{
3421 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3422
3423 if (WARN_ON(dmar_domain->nested_parent &&
3424 !list_empty(&dmar_domain->s1_domains)))
3425 return;
3426
3427 if (WARN_ON(!list_empty(&dmar_domain->devices)))
3428 return;
3429
3430 if (dmar_domain->pgd) {
3431 struct iommu_pages_list freelist =
3432 IOMMU_PAGES_LIST_INIT(freelist);
3433
3434 domain_unmap(domain: dmar_domain, start_pfn: 0, DOMAIN_MAX_PFN(dmar_domain->gaw),
3435 freelist: &freelist);
3436 iommu_put_pages_list(list: &freelist);
3437 }
3438
3439 kfree(objp: dmar_domain->qi_batch);
3440 kfree(objp: dmar_domain);
3441}
3442
3443static int paging_domain_compatible_first_stage(struct dmar_domain *dmar_domain,
3444 struct intel_iommu *iommu)
3445{
3446 if (WARN_ON(dmar_domain->domain.dirty_ops ||
3447 dmar_domain->nested_parent))
3448 return -EINVAL;
3449
3450 /* Only SL is available in legacy mode */
3451 if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
3452 return -EINVAL;
3453
3454 /* Same page size support */
3455 if (!cap_fl1gp_support(iommu->cap) &&
3456 (dmar_domain->domain.pgsize_bitmap & SZ_1G))
3457 return -EINVAL;
3458
3459 /* iotlb sync on map requirement */
3460 if ((rwbf_required(iommu)) && !dmar_domain->iotlb_sync_map)
3461 return -EINVAL;
3462
3463 return 0;
3464}
3465
3466static int
3467paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
3468 struct intel_iommu *iommu)
3469{
3470 unsigned int sslps = cap_super_page_val(iommu->cap);
3471
3472 if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
3473 return -EINVAL;
3474 if (dmar_domain->nested_parent && !nested_supported(iommu))
3475 return -EINVAL;
3476
3477 /* Legacy mode always supports second stage */
3478 if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
3479 return -EINVAL;
3480
3481 /* Same page size support */
3482 if (!(sslps & BIT(0)) && (dmar_domain->domain.pgsize_bitmap & SZ_2M))
3483 return -EINVAL;
3484 if (!(sslps & BIT(1)) && (dmar_domain->domain.pgsize_bitmap & SZ_1G))
3485 return -EINVAL;
3486
3487 /* iotlb sync on map requirement */
3488 if ((rwbf_required(iommu) || cap_caching_mode(iommu->cap)) &&
3489 !dmar_domain->iotlb_sync_map)
3490 return -EINVAL;
3491
3492 return 0;
3493}
3494
3495int paging_domain_compatible(struct iommu_domain *domain, struct device *dev)
3496{
3497 struct device_domain_info *info = dev_iommu_priv_get(dev);
3498 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3499 struct intel_iommu *iommu = info->iommu;
3500 int ret = -EINVAL;
3501 int addr_width;
3502
3503 if (intel_domain_is_fs_paging(domain: dmar_domain))
3504 ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
3505 else if (intel_domain_is_ss_paging(domain: dmar_domain))
3506 ret = paging_domain_compatible_second_stage(dmar_domain, iommu);
3507 else if (WARN_ON(true))
3508 ret = -EINVAL;
3509 if (ret)
3510 return ret;
3511
3512 /*
3513 * FIXME this is locked wrong, it needs to be under the
3514 * dmar_domain->lock
3515 */
3516 if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
3517 return -EINVAL;
3518
3519 if (dmar_domain->iommu_coherency !=
3520 iommu_paging_structure_coherency(iommu))
3521 return -EINVAL;
3522
3523
3524 /* check if this iommu agaw is sufficient for max mapped address */
3525 addr_width = agaw_to_width(agaw: iommu->agaw);
3526 if (addr_width > cap_mgaw(iommu->cap))
3527 addr_width = cap_mgaw(iommu->cap);
3528
3529 if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
3530 return -EINVAL;
3531
3532 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3533 context_copied(iommu, bus: info->bus, devfn: info->devfn))
3534 return intel_pasid_setup_sm_context(dev);
3535
3536 return 0;
3537}
3538
3539static int intel_iommu_attach_device(struct iommu_domain *domain,
3540 struct device *dev)
3541{
3542 int ret;
3543
3544 device_block_translation(dev);
3545
3546 ret = paging_domain_compatible(domain, dev);
3547 if (ret)
3548 return ret;
3549
3550 ret = iopf_for_domain_set(domain, dev);
3551 if (ret)
3552 return ret;
3553
3554 ret = dmar_domain_attach_device(domain: to_dmar_domain(dom: domain), dev);
3555 if (ret)
3556 iopf_for_domain_remove(domain, dev);
3557
3558 return ret;
3559}
3560
3561static int intel_iommu_map(struct iommu_domain *domain,
3562 unsigned long iova, phys_addr_t hpa,
3563 size_t size, int iommu_prot, gfp_t gfp)
3564{
3565 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3566 u64 max_addr;
3567 int prot = 0;
3568
3569 if (iommu_prot & IOMMU_READ)
3570 prot |= DMA_PTE_READ;
3571 if (iommu_prot & IOMMU_WRITE)
3572 prot |= DMA_PTE_WRITE;
3573 if (dmar_domain->set_pte_snp)
3574 prot |= DMA_PTE_SNP;
3575
3576 max_addr = iova + size;
3577 if (dmar_domain->max_addr < max_addr) {
3578 u64 end;
3579
3580 /* check if minimum agaw is sufficient for mapped address */
3581 end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
3582 if (end < max_addr) {
3583 pr_err("%s: iommu width (%d) is not "
3584 "sufficient for the mapped address (%llx)\n",
3585 __func__, dmar_domain->gaw, max_addr);
3586 return -EFAULT;
3587 }
3588 dmar_domain->max_addr = max_addr;
3589 }
3590	/* Convert size to a page count, rounding up if it together with
3591	   the low bits of hpa spills onto an extra page */
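	/*
	 * For example, hpa 0x1234 with size 0x2000 covers three 4K pages once
	 * the offset within the first page is taken into account.
	 */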
3592 size = aligned_nrpages(host_addr: hpa, size);
3593 return __domain_mapping(domain: dmar_domain, iov_pfn: iova >> VTD_PAGE_SHIFT,
3594 phys_pfn: hpa >> VTD_PAGE_SHIFT, nr_pages: size, prot, gfp);
3595}
3596
3597static int intel_iommu_map_pages(struct iommu_domain *domain,
3598 unsigned long iova, phys_addr_t paddr,
3599 size_t pgsize, size_t pgcount,
3600 int prot, gfp_t gfp, size_t *mapped)
3601{
3602 unsigned long pgshift = __ffs(pgsize);
3603 size_t size = pgcount << pgshift;
3604 int ret;
3605
3606 if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
3607 return -EINVAL;
3608
3609 if (!IS_ALIGNED(iova | paddr, pgsize))
3610 return -EINVAL;
3611
3612 ret = intel_iommu_map(domain, iova, hpa: paddr, size, iommu_prot: prot, gfp);
3613 if (!ret && mapped)
3614 *mapped = size;
3615
3616 return ret;
3617}
3618
3619static size_t intel_iommu_unmap(struct iommu_domain *domain,
3620 unsigned long iova, size_t size,
3621 struct iommu_iotlb_gather *gather)
3622{
3623 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3624 unsigned long start_pfn, last_pfn;
3625 int level = 0;
3626
3627 /* Cope with horrid API which requires us to unmap more than the
3628 size argument if it happens to be a large-page mapping. */
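	/*
	 * For example, a 4K unmap request that lands inside a 2MiB superpage
	 * is widened below to cover the whole 2MiB region.
	 */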
3629 if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
3630 &level, GFP_ATOMIC)))
3631 return 0;
3632
3633 if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
3634 size = VTD_PAGE_SIZE << level_to_offset_bits(level);
3635
3636 start_pfn = iova >> VTD_PAGE_SHIFT;
3637 last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
3638
3639 domain_unmap(domain: dmar_domain, start_pfn, last_pfn, freelist: &gather->freelist);
3640
3641 if (dmar_domain->max_addr == iova + size)
3642 dmar_domain->max_addr = iova;
3643
3644 /*
3645	 * We do not use page-selective IOTLB invalidation in the flush-queue
3646	 * path, so there is no need to track pages for a later iotlb sync there.
3647 */
3648 if (!iommu_iotlb_gather_queued(gather))
3649 iommu_iotlb_gather_add_page(domain, gather, iova, size);
3650
3651 return size;
3652}
3653
3654static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
3655 unsigned long iova,
3656 size_t pgsize, size_t pgcount,
3657 struct iommu_iotlb_gather *gather)
3658{
3659 unsigned long pgshift = __ffs(pgsize);
3660 size_t size = pgcount << pgshift;
3661
3662 return intel_iommu_unmap(domain, iova, size, gather);
3663}
3664
3665static void intel_iommu_tlb_sync(struct iommu_domain *domain,
3666 struct iommu_iotlb_gather *gather)
3667{
3668 cache_tag_flush_range(domain: to_dmar_domain(dom: domain), start: gather->start,
3669 end: gather->end,
3670 ih: iommu_pages_list_empty(list: &gather->freelist));
3671 iommu_put_pages_list(list: &gather->freelist);
3672}
3673
3674static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3675 dma_addr_t iova)
3676{
3677 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3678 struct dma_pte *pte;
3679 int level = 0;
3680 u64 phys = 0;
3681
3682 pte = pfn_to_dma_pte(domain: dmar_domain, pfn: iova >> VTD_PAGE_SHIFT, target_level: &level,
3683 GFP_ATOMIC);
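	/*
	 * For example, a 2MiB mapping (level 2) keeps the low 21 bits of the
	 * IOVA as the offset: phys = pte base + (iova & 0x1fffff).
	 */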
3684 if (pte && dma_pte_present(pte))
3685 phys = dma_pte_addr(pte) +
3686 (iova & (BIT_MASK(level_to_offset_bits(level) +
3687 VTD_PAGE_SHIFT) - 1));
3688
3689 return phys;
3690}
3691
3692static bool domain_support_force_snooping(struct dmar_domain *domain)
3693{
3694 struct device_domain_info *info;
3695 bool support = true;
3696
3697 assert_spin_locked(&domain->lock);
3698 list_for_each_entry(info, &domain->devices, link) {
3699 if (!ecap_sc_support(info->iommu->ecap)) {
3700 support = false;
3701 break;
3702 }
3703 }
3704
3705 return support;
3706}
3707
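/*
 * First-stage page tables have no per-PTE snoop-control bit, so snooping is
 * enforced via the page-snoop control in each device's PASID-table entry
 * rather than in the PTEs themselves.
 */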
3708static bool intel_iommu_enforce_cache_coherency_fs(struct iommu_domain *domain)
3709{
3710 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3711 struct device_domain_info *info;
3712
3713 guard(spinlock_irqsave)(l: &dmar_domain->lock);
3714
3715 if (dmar_domain->force_snooping)
3716 return true;
3717
3718 if (!domain_support_force_snooping(domain: dmar_domain))
3719 return false;
3720
3721 dmar_domain->force_snooping = true;
3722 list_for_each_entry(info, &dmar_domain->devices, link)
3723 intel_pasid_setup_page_snoop_control(iommu: info->iommu, dev: info->dev,
3724 IOMMU_NO_PASID);
3725 return true;
3726}
3727
3728static bool intel_iommu_enforce_cache_coherency_ss(struct iommu_domain *domain)
3729{
3730 struct dmar_domain *dmar_domain = to_dmar_domain(dom: domain);
3731
3732 guard(spinlock_irqsave)(l: &dmar_domain->lock);
3733 if (!domain_support_force_snooping(domain: dmar_domain) ||
3734 dmar_domain->has_mappings)
3735 return false;
3736
3737 /*
3738 * Second level page table supports per-PTE snoop control. The
3739 * iommu_map() interface will handle this by setting SNP bit.
3740 */
3741 dmar_domain->set_pte_snp = true;
3742 dmar_domain->force_snooping = true;
3743 return true;
3744}
3745
3746static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
3747{
3748 struct device_domain_info *info = dev_iommu_priv_get(dev);
3749
3750 switch (cap) {
3751 case IOMMU_CAP_CACHE_COHERENCY:
3752 case IOMMU_CAP_DEFERRED_FLUSH:
3753 return true;
3754 case IOMMU_CAP_PRE_BOOT_PROTECTION:
3755 return dmar_platform_optin();
3756 case IOMMU_CAP_ENFORCE_CACHE_COHERENCY:
3757 return ecap_sc_support(info->iommu->ecap);
3758 case IOMMU_CAP_DIRTY_TRACKING:
3759 return ssads_supported(info->iommu);
3760 default:
3761 return false;
3762 }
3763}
3764
3765static struct iommu_device *intel_iommu_probe_device(struct device *dev)
3766{
3767 struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
3768 struct device_domain_info *info;
3769 struct intel_iommu *iommu;
3770 u8 bus, devfn;
3771 int ret;
3772
3773 iommu = device_lookup_iommu(dev, bus: &bus, devfn: &devfn);
3774 if (!iommu || !iommu->iommu.ops)
3775 return ERR_PTR(error: -ENODEV);
3776
3777 info = kzalloc(sizeof(*info), GFP_KERNEL);
3778 if (!info)
3779 return ERR_PTR(error: -ENOMEM);
3780
3781 if (dev_is_real_dma_subdevice(dev)) {
3782 info->bus = pdev->bus->number;
3783 info->devfn = pdev->devfn;
3784 info->segment = pci_domain_nr(bus: pdev->bus);
3785 } else {
3786 info->bus = bus;
3787 info->devfn = devfn;
3788 info->segment = iommu->segment;
3789 }
3790
3791 info->dev = dev;
3792 info->iommu = iommu;
3793 if (dev_is_pci(dev)) {
3794 if (ecap_dev_iotlb_support(iommu->ecap) &&
3795 pci_ats_supported(dev: pdev) &&
3796 dmar_ats_supported(dev: pdev, iommu)) {
3797 info->ats_supported = 1;
3798 info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
3799
3800 /*
3801 * For IOMMU that supports device IOTLB throttling
3802 * (DIT), we assign PFSID to the invalidation desc
3803 * of a VF such that IOMMU HW can gauge queue depth
3804 * at PF level. If DIT is not set, PFSID will be
3805 * treated as reserved, which should be set to 0.
3806 */
3807 if (ecap_dit(iommu->ecap))
3808 info->pfsid = pci_dev_id(dev: pci_physfn(dev: pdev));
3809 info->ats_qdep = pci_ats_queue_depth(dev: pdev);
3810 }
3811 if (sm_supported(iommu)) {
3812 if (pasid_supported(iommu)) {
3813 int features = pci_pasid_features(pdev);
3814
3815 if (features >= 0)
3816 info->pasid_supported = features | 1;
3817 }
3818
3819 if (info->ats_supported && ecap_prs(iommu->ecap) &&
3820 ecap_pds(iommu->ecap) && pci_pri_supported(pdev))
3821 info->pri_supported = 1;
3822 }
3823 }
3824
3825 dev_iommu_priv_set(dev, priv: info);
3826 if (pdev && pci_ats_supported(dev: pdev)) {
3827 pci_prepare_ats(dev: pdev, VTD_PAGE_SHIFT);
3828 ret = device_rbtree_insert(iommu, info);
3829 if (ret)
3830 goto free;
3831 }
3832
3833 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3834 ret = intel_pasid_alloc_table(dev);
3835 if (ret) {
3836 dev_err(dev, "PASID table allocation failed\n");
3837 goto clear_rbtree;
3838 }
3839
3840 if (!context_copied(iommu, bus: info->bus, devfn: info->devfn)) {
3841 ret = intel_pasid_setup_sm_context(dev);
3842 if (ret)
3843 goto free_table;
3844 }
3845 }
3846
3847 intel_iommu_debugfs_create_dev(info);
3848
3849 return &iommu->iommu;
3850free_table:
3851 intel_pasid_free_table(dev);
3852clear_rbtree:
3853 device_rbtree_remove(info);
3854free:
3855 kfree(info);
3856
3857 return ERR_PTR(ret);
3858}
3859
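/*
 * Final probe step, called once the device is attached to its default
 * domain: enable PASID and ATS where supported, give the default domain a
 * DEVTLB cache tag so device-TLB invalidations reach the device, and
 * finally enable PRI.
 */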
3860static void intel_iommu_probe_finalize(struct device *dev)
3861{
3862 struct device_domain_info *info = dev_iommu_priv_get(dev);
3863 struct intel_iommu *iommu = info->iommu;
3864
3865 /*
3866 * The PCIe spec, in its wisdom, declares that the behaviour of the
3867 * device is undefined if you enable PASID support after ATS support.
3868 * So always enable PASID support on devices which have it, even if
3869 * we can't yet know if we're ever going to use it.
3870 */
3871 if (info->pasid_supported &&
3872 !pci_enable_pasid(to_pci_dev(dev), info->pasid_supported & ~1))
3873 info->pasid_enabled = 1;
3874
3875 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
3876 iommu_enable_pci_ats(info);
3877 /* Assign a DEVTLB cache tag to the default domain. */
3878 if (info->ats_enabled && info->domain) {
3879 u16 did = domain_id_iommu(info->domain, iommu);
3880
3881 if (cache_tag_assign(info->domain, did, dev,
3882 IOMMU_NO_PASID, CACHE_TAG_DEVTLB))
3883 iommu_disable_pci_ats(info);
3884 }
3885 }
3886 iommu_enable_pci_pri(info);
3887}
3888
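/*
 * Undo intel_iommu_probe_device()/probe_finalize(): disable PRI, ATS and
 * PASID, drop the device from the RID lookup tree, tear down the scalable
 * mode context and PASID table, and free the per-device info.
 */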
3889static void intel_iommu_release_device(struct device *dev)
3890{
3891 struct device_domain_info *info = dev_iommu_priv_get(dev);
3892 struct intel_iommu *iommu = info->iommu;
3893
3894 iommu_disable_pci_pri(info);
3895 iommu_disable_pci_ats(info);
3896
3897 if (info->pasid_enabled) {
3898 pci_disable_pasid(to_pci_dev(dev));
3899 info->pasid_enabled = 0;
3900 }
3901
3902 mutex_lock(&iommu->iopf_lock);
3903 if (dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)))
3904 device_rbtree_remove(info);
3905 mutex_unlock(&iommu->iopf_lock);
3906
3907 if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
3908 !context_copied(iommu, info->bus, info->devfn))
3909 intel_pasid_teardown_sm_context(dev);
3910
3911 intel_pasid_free_table(dev);
3912 intel_iommu_debugfs_remove_dev(info);
3913 kfree(info);
3914}
3915
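/*
 * Report reserved regions for a device: RMRR ranges are exposed as direct
 * (or relaxable direct) mappings, ISA bridges optionally get the low 16MB
 * identity-mapped for the floppy workaround, and the IOAPIC MMIO window is
 * reported as an MSI region.
 */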
3916static void intel_iommu_get_resv_regions(struct device *device,
3917 struct list_head *head)
3918{
3919 int prot = DMA_PTE_READ | DMA_PTE_WRITE;
3920 struct iommu_resv_region *reg;
3921 struct dmar_rmrr_unit *rmrr;
3922 struct device *i_dev;
3923 int i;
3924
3925 rcu_read_lock();
3926 for_each_rmrr_units(rmrr) {
3927 for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
3928 i, i_dev) {
3929 struct iommu_resv_region *resv;
3930 enum iommu_resv_type type;
3931 size_t length;
3932
3933 if (i_dev != device &&
3934 !is_downstream_to_pci_bridge(device, i_dev))
3935 continue;
3936
3937 length = rmrr->end_address - rmrr->base_address + 1;
3938
3939 type = device_rmrr_is_relaxable(device) ?
3940 IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
3941
3942 resv = iommu_alloc_resv_region(rmrr->base_address,
3943 length, prot, type,
3944 GFP_ATOMIC);
3945 if (!resv)
3946 break;
3947
3948 list_add_tail(&resv->list, head);
3949 }
3950 }
3951 rcu_read_unlock();
3952
3953#ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
3954 if (dev_is_pci(device)) {
3955 struct pci_dev *pdev = to_pci_dev(device);
3956
3957 if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
3958 reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
3959 IOMMU_RESV_DIRECT_RELAXABLE,
3960 GFP_KERNEL);
3961 if (reg)
3962 list_add_tail(&reg->list, head);
3963 }
3964 }
3965#endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
3966
3967 reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
3968 IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
3969 0, IOMMU_RESV_MSI, GFP_KERNEL);
3970 if (!reg)
3971 return;
3972 list_add_tail(&reg->list, head);
3973}
3974
3975static struct iommu_group *intel_iommu_device_group(struct device *dev)
3976{
3977 if (dev_is_pci(dev))
3978 return pci_device_group(dev);
3979 return generic_device_group(dev);
3980}
3981
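/*
 * Reference-counted enabling of I/O page faults for a device. PRI must
 * already be enabled; the first caller adds the device to the IOMMU's IOPF
 * queue, later callers only bump the refcount.
 */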
3982int intel_iommu_enable_iopf(struct device *dev)
3983{
3984 struct device_domain_info *info = dev_iommu_priv_get(dev);
3985 struct intel_iommu *iommu = info->iommu;
3986 int ret;
3987
3988 if (!info->pri_enabled)
3989 return -ENODEV;
3990
3991 /* pri_enabled is protected by the group mutex. */
3992 iommu_group_mutex_assert(dev);
3993 if (info->iopf_refcount) {
3994 info->iopf_refcount++;
3995 return 0;
3996 }
3997
3998 ret = iopf_queue_add_device(iommu->iopf_queue, dev);
3999 if (ret)
4000 return ret;
4001
4002 info->iopf_refcount = 1;
4003
4004 return 0;
4005}
4006
4007void intel_iommu_disable_iopf(struct device *dev)
4008{
4009 struct device_domain_info *info = dev_iommu_priv_get(dev);
4010 struct intel_iommu *iommu = info->iommu;
4011
4012 if (WARN_ON(!info->pri_enabled || !info->iopf_refcount))
4013 return;
4014
4015 iommu_group_mutex_assert(dev);
4016 if (--info->iopf_refcount)
4017 return;
4018
4019 iopf_queue_remove_device(iommu->iopf_queue, dev);
4020}
4021
4022static bool intel_iommu_is_attach_deferred(struct device *dev)
4023{
4024 struct device_domain_info *info = dev_iommu_priv_get(dev);
4025
4026 return translation_pre_enabled(info->iommu) && !info->domain;
4027}
4028
4029/*
4030 * Check that the device does not live on an external-facing PCI port that is
4031 * marked as untrusted. Such devices should not be able to apply quirks and
4032 * thus bypass the IOMMU restrictions.
4033 */
4034static bool risky_device(struct pci_dev *pdev)
4035{
4036 if (pdev->untrusted) {
4037 pci_info(pdev,
4038 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4039 pdev->vendor, pdev->device);
4040 pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4041 return true;
4042 }
4043 return false;
4044}
4045
4046static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4047 unsigned long iova, size_t size)
4048{
4049 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4050
4051 if (dmar_domain->iotlb_sync_map)
4052 cache_tag_flush_range_np(dmar_domain, iova, iova + size - 1);
4053
4054 return 0;
4055}
4056
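/*
 * Detach the (domain, pasid) binding from @dev: remove the dev_pasid_info
 * from the domain's list, drop the cache tag and the domain's reference on
 * the IOMMU. The identity domain keeps no such state, so it is skipped.
 */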
4057void domain_remove_dev_pasid(struct iommu_domain *domain,
4058 struct device *dev, ioasid_t pasid)
4059{
4060 struct device_domain_info *info = dev_iommu_priv_get(dev);
4061 struct dev_pasid_info *curr, *dev_pasid = NULL;
4062 struct intel_iommu *iommu = info->iommu;
4063 struct dmar_domain *dmar_domain;
4064 unsigned long flags;
4065
4066 if (!domain)
4067 return;
4068
4069 /* The identity domain has no per-PASID metadata. */
4070 if (domain->type == IOMMU_DOMAIN_IDENTITY)
4071 return;
4072
4073 dmar_domain = to_dmar_domain(domain);
4074 spin_lock_irqsave(&dmar_domain->lock, flags);
4075 list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) {
4076 if (curr->dev == dev && curr->pasid == pasid) {
4077 list_del(&curr->link_domain);
4078 dev_pasid = curr;
4079 break;
4080 }
4081 }
4082 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4083
4084 cache_tag_unassign_domain(dmar_domain, dev, pasid);
4085 domain_detach_iommu(dmar_domain, iommu);
4086 if (!WARN_ON_ONCE(!dev_pasid)) {
4087 intel_iommu_debugfs_remove_dev_pasid(dev_pasid);
4088 kfree(dev_pasid);
4089 }
4090}
4091
4092static int blocking_domain_set_dev_pasid(struct iommu_domain *domain,
4093 struct device *dev, ioasid_t pasid,
4094 struct iommu_domain *old)
4095{
4096 struct device_domain_info *info = dev_iommu_priv_get(dev);
4097
4098 intel_pasid_tear_down_entry(info->iommu, dev, pasid, false);
4099 iopf_for_domain_remove(old, dev);
4100 domain_remove_dev_pasid(old, dev, pasid);
4101
4102 return 0;
4103}
4104
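/*
 * Track a new (device, pasid) attachment on a paging domain: take a
 * reference on the IOMMU, assign a cache tag for the pasid and link the
 * new dev_pasid_info into the domain's list.
 */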
4105struct dev_pasid_info *
4106domain_add_dev_pasid(struct iommu_domain *domain,
4107 struct device *dev, ioasid_t pasid)
4108{
4109 struct device_domain_info *info = dev_iommu_priv_get(dev);
4110 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4111 struct intel_iommu *iommu = info->iommu;
4112 struct dev_pasid_info *dev_pasid;
4113 unsigned long flags;
4114 int ret;
4115
4116 dev_pasid = kzalloc(sizeof(*dev_pasid), GFP_KERNEL);
4117 if (!dev_pasid)
4118 return ERR_PTR(-ENOMEM);
4119
4120 ret = domain_attach_iommu(dmar_domain, iommu);
4121 if (ret)
4122 goto out_free;
4123
4124 ret = cache_tag_assign_domain(dmar_domain, dev, pasid);
4125 if (ret)
4126 goto out_detach_iommu;
4127
4128 dev_pasid->dev = dev;
4129 dev_pasid->pasid = pasid;
4130 spin_lock_irqsave(&dmar_domain->lock, flags);
4131 list_add(&dev_pasid->link_domain, &dmar_domain->dev_pasids);
4132 spin_unlock_irqrestore(&dmar_domain->lock, flags);
4133
4134 return dev_pasid;
4135out_detach_iommu:
4136 domain_detach_iommu(dmar_domain, iommu);
4137out_free:
4138 kfree(dev_pasid);
4139 return ERR_PTR(ret);
4140}
4141
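/*
 * Attach a paging domain to a specific PASID of @dev. After validating the
 * request, the pasid is tracked on the domain, IOPF state is switched over
 * from the old domain, and the PASID table entry is programmed in first- or
 * second-level mode depending on the domain type. The old attachment is
 * only torn down once the new one is in place.
 */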
4142static int intel_iommu_set_dev_pasid(struct iommu_domain *domain,
4143 struct device *dev, ioasid_t pasid,
4144 struct iommu_domain *old)
4145{
4146 struct device_domain_info *info = dev_iommu_priv_get(dev);
4147 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4148 struct intel_iommu *iommu = info->iommu;
4149 struct dev_pasid_info *dev_pasid;
4150 int ret;
4151
4152 if (WARN_ON_ONCE(!(domain->type & __IOMMU_DOMAIN_PAGING)))
4153 return -EINVAL;
4154
4155 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4156 return -EOPNOTSUPP;
4157
4158 if (domain->dirty_ops)
4159 return -EINVAL;
4160
4161 if (context_copied(iommu, info->bus, info->devfn))
4162 return -EBUSY;
4163
4164 ret = paging_domain_compatible(domain, dev);
4165 if (ret)
4166 return ret;
4167
4168 dev_pasid = domain_add_dev_pasid(domain, dev, pasid);
4169 if (IS_ERR(dev_pasid))
4170 return PTR_ERR(dev_pasid);
4171
4172 ret = iopf_for_domain_replace(domain, old, dev);
4173 if (ret)
4174 goto out_remove_dev_pasid;
4175
4176 if (intel_domain_is_fs_paging(dmar_domain))
4177 ret = domain_setup_first_level(iommu, dmar_domain,
4178 dev, pasid, old);
4179 else if (intel_domain_is_ss_paging(dmar_domain))
4180 ret = domain_setup_second_level(iommu, dmar_domain,
4181 dev, pasid, old);
4182 else if (WARN_ON(true))
4183 ret = -EINVAL;
4184
4185 if (ret)
4186 goto out_unwind_iopf;
4187
4188 domain_remove_dev_pasid(old, dev, pasid);
4189
4190 intel_iommu_debugfs_create_dev_pasid(dev_pasid);
4191
4192 return 0;
4193
4194out_unwind_iopf:
4195 iopf_for_domain_replace(old, domain, dev);
4196out_remove_dev_pasid:
4197 domain_remove_dev_pasid(domain, dev, pasid);
4198 return ret;
4199}
4200
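/*
 * IOMMUFD hardware info reporting: return a struct iommu_hw_info_vtd
 * carrying the raw capability and extended capability registers of the
 * IOMMU serving @dev, plus any advertised errata flags.
 */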
4201static void *intel_iommu_hw_info(struct device *dev, u32 *length,
4202 enum iommu_hw_info_type *type)
4203{
4204 struct device_domain_info *info = dev_iommu_priv_get(dev);
4205 struct intel_iommu *iommu = info->iommu;
4206 struct iommu_hw_info_vtd *vtd;
4207
4208 if (*type != IOMMU_HW_INFO_TYPE_DEFAULT &&
4209 *type != IOMMU_HW_INFO_TYPE_INTEL_VTD)
4210 return ERR_PTR(-EOPNOTSUPP);
4211
4212 vtd = kzalloc(sizeof(*vtd), GFP_KERNEL);
4213 if (!vtd)
4214 return ERR_PTR(-ENOMEM);
4215
4216 vtd->flags = IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17;
4217 vtd->cap_reg = iommu->cap;
4218 vtd->ecap_reg = iommu->ecap;
4219 *length = sizeof(*vtd);
4220 *type = IOMMU_HW_INFO_TYPE_INTEL_VTD;
4221 return vtd;
4222}
4223
4224/*
4225 * Set dirty tracking for the device list of a domain. The caller must
4226 * hold the domain->lock when calling it.
4227 */
4228static int device_set_dirty_tracking(struct list_head *devices, bool enable)
4229{
4230 struct device_domain_info *info;
4231 int ret = 0;
4232
4233 list_for_each_entry(info, devices, link) {
4234 ret = intel_pasid_setup_dirty_tracking(info->iommu, info->dev,
4235 IOMMU_NO_PASID, enable);
4236 if (ret)
4237 break;
4238 }
4239
4240 return ret;
4241}
4242
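/*
 * Propagate a dirty-tracking change on a nested parent domain to all
 * first-stage domains nested on top of it, unwinding on failure.
 */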
4243static int parent_domain_set_dirty_tracking(struct dmar_domain *domain,
4244 bool enable)
4245{
4246 struct dmar_domain *s1_domain;
4247 unsigned long flags;
4248 int ret;
4249
4250 spin_lock(&domain->s1_lock);
4251 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4252 spin_lock_irqsave(&s1_domain->lock, flags);
4253 ret = device_set_dirty_tracking(&s1_domain->devices, enable);
4254 spin_unlock_irqrestore(&s1_domain->lock, flags);
4255 if (ret)
4256 goto err_unwind;
4257 }
4258 spin_unlock(&domain->s1_lock);
4259 return 0;
4260
4261err_unwind:
4262 list_for_each_entry(s1_domain, &domain->s1_domains, s2_link) {
4263 spin_lock_irqsave(&s1_domain->lock, flags);
4264 device_set_dirty_tracking(&s1_domain->devices,
4265 domain->dirty_tracking);
4266 spin_unlock_irqrestore(&s1_domain->lock, flags);
4267 }
4268 spin_unlock(&domain->s1_lock);
4269 return ret;
4270}
4271
4272static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain,
4273 bool enable)
4274{
4275 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4276 int ret;
4277
4278 spin_lock(&dmar_domain->lock);
4279 if (dmar_domain->dirty_tracking == enable)
4280 goto out_unlock;
4281
4282 ret = device_set_dirty_tracking(&dmar_domain->devices, enable);
4283 if (ret)
4284 goto err_unwind;
4285
4286 if (dmar_domain->nested_parent) {
4287 ret = parent_domain_set_dirty_tracking(dmar_domain, enable);
4288 if (ret)
4289 goto err_unwind;
4290 }
4291
4292 dmar_domain->dirty_tracking = enable;
4293out_unlock:
4294 spin_unlock(&dmar_domain->lock);
4295
4296 return 0;
4297
4298err_unwind:
4299 device_set_dirty_tracking(&dmar_domain->devices,
4300 dmar_domain->dirty_tracking);
4301 spin_unlock(&dmar_domain->lock);
4302 return ret;
4303}
4304
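/*
 * Walk the second-level page table over [iova, iova + size) and, for each
 * present PTE whose dirty bit is set (clearing it subject to @flags),
 * record the covered range in the IOMMUFD dirty bitmap.
 */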
4305static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
4306 unsigned long iova, size_t size,
4307 unsigned long flags,
4308 struct iommu_dirty_bitmap *dirty)
4309{
4310 struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4311 unsigned long end = iova + size - 1;
4312 unsigned long pgsize;
4313
4314 /*
4315 * The IOMMUFD core calls into a dirty-tracking-disabled domain without
4316 * an IOVA bitmap set in order to clean dirty bits in all PTEs that
4317 * might have been set when we stopped dirty tracking. This ensures
4318 * that we never inherit dirtied bits from a previous cycle.
4319 */
4320 if (!dmar_domain->dirty_tracking && dirty->bitmap)
4321 return -EINVAL;
4322
4323 do {
4324 struct dma_pte *pte;
4325 int lvl = 0;
4326
4327 pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
4328 GFP_ATOMIC);
4329 pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
4330 if (!pte || !dma_pte_present(pte)) {
4331 iova += pgsize;
4332 continue;
4333 }
4334
4335 if (dma_sl_pte_test_and_clear_dirty(pte, flags))
4336 iommu_dirty_bitmap_record(dirty, iova, pgsize);
4337 iova += pgsize;
4338 } while (iova < end);
4339
4340 return 0;
4341}
4342
4343static const struct iommu_dirty_ops intel_dirty_ops = {
4344 .set_dirty_tracking = intel_iommu_set_dirty_tracking,
4345 .read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
4346};
4347
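/*
 * Install a legacy-mode context entry that puts (bus, devfn) into
 * pass-through translation under FLPT_DEFAULT_DID, tearing down any entry
 * copied from a previous kernel first.
 */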
4348static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
4349{
4350 struct device_domain_info *info = dev_iommu_priv_get(dev);
4351 struct intel_iommu *iommu = info->iommu;
4352 struct context_entry *context;
4353
4354 spin_lock(&iommu->lock);
4355 context = iommu_context_addr(iommu, bus, devfn, 1);
4356 if (!context) {
4357 spin_unlock(&iommu->lock);
4358 return -ENOMEM;
4359 }
4360
4361 if (context_present(context) && !context_copied(iommu, bus, devfn)) {
4362 spin_unlock(&iommu->lock);
4363 return 0;
4364 }
4365
4366 copied_context_tear_down(iommu, context, bus, devfn);
4367 context_clear_entry(context);
4368 context_set_domain_id(context, FLPT_DEFAULT_DID);
4369
4370 /*
4371 * In pass-through mode, AW must be programmed to indicate the largest
4372 * AGAW value supported by hardware, and ASR is ignored by hardware.
4373 */
4374 context_set_address_width(context, iommu->msagaw);
4375 context_set_translation_type(context, CONTEXT_TT_PASS_THROUGH);
4376 context_set_fault_enable(context);
4377 context_set_present(context);
4378 if (!ecap_coherent(iommu->ecap))
4379 clflush_cache_range(context, sizeof(*context));
4380 context_present_cache_flush(iommu, FLPT_DEFAULT_DID, bus, devfn);
4381 spin_unlock(&iommu->lock);
4382
4383 return 0;
4384}
4385
4386static int context_setup_pass_through_cb(struct pci_dev *pdev, u16 alias, void *data)
4387{
4388 struct device *dev = data;
4389
4390 return context_setup_pass_through(dev, PCI_BUS_NUM(alias), alias & 0xff);
4391}
4392
4393static int device_setup_pass_through(struct device *dev)
4394{
4395 struct device_domain_info *info = dev_iommu_priv_get(dev);
4396
4397 if (!dev_is_pci(dev))
4398 return context_setup_pass_through(dev, info->bus, info->devfn);
4399
4400 return pci_for_each_dma_alias(to_pci_dev(dev),
4401 context_setup_pass_through_cb, dev);
4402}
4403
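/*
 * Attach the global identity (pass-through) domain: translation is blocked
 * first, then the device is put into pass-through either via its PASID
 * table entry (scalable mode) or via its context entries (legacy mode).
 */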
4404static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
4405{
4406 struct device_domain_info *info = dev_iommu_priv_get(dev);
4407 struct intel_iommu *iommu = info->iommu;
4408 int ret;
4409
4410 device_block_translation(dev);
4411
4412 if (dev_is_real_dma_subdevice(dev))
4413 return 0;
4414
4415 /*
4416 * No PRI support with the global identity domain. No need to enable or
4417 * disable PRI in this path as the iommu has been put in the blocking
4418 * state.
4419 */
4420 if (sm_supported(iommu))
4421 ret = intel_pasid_setup_pass_through(iommu, dev, IOMMU_NO_PASID);
4422 else
4423 ret = device_setup_pass_through(dev);
4424
4425 if (!ret)
4426 info->domain_attached = true;
4427
4428 return ret;
4429}
4430
4431static int identity_domain_set_dev_pasid(struct iommu_domain *domain,
4432 struct device *dev, ioasid_t pasid,
4433 struct iommu_domain *old)
4434{
4435 struct device_domain_info *info = dev_iommu_priv_get(dev);
4436 struct intel_iommu *iommu = info->iommu;
4437 int ret;
4438
4439 if (!pasid_supported(iommu) || dev_is_real_dma_subdevice(dev))
4440 return -EOPNOTSUPP;
4441
4442 ret = iopf_for_domain_replace(domain, old, dev);
4443 if (ret)
4444 return ret;
4445
4446 ret = domain_setup_passthrough(iommu, dev, pasid, old);
4447 if (ret) {
4448 iopf_for_domain_replace(old, domain, dev);
4449 return ret;
4450 }
4451
4452 domain_remove_dev_pasid(old, dev, pasid);
4453 return 0;
4454}
4455
4456static struct iommu_domain identity_domain = {
4457 .type = IOMMU_DOMAIN_IDENTITY,
4458 .ops = &(const struct iommu_domain_ops) {
4459 .attach_dev = identity_domain_attach_dev,
4460 .set_dev_pasid = identity_domain_set_dev_pasid,
4461 },
4462};
4463
4464const struct iommu_domain_ops intel_fs_paging_domain_ops = {
4465 .attach_dev = intel_iommu_attach_device,
4466 .set_dev_pasid = intel_iommu_set_dev_pasid,
4467 .map_pages = intel_iommu_map_pages,
4468 .unmap_pages = intel_iommu_unmap_pages,
4469 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4470 .flush_iotlb_all = intel_flush_iotlb_all,
4471 .iotlb_sync = intel_iommu_tlb_sync,
4472 .iova_to_phys = intel_iommu_iova_to_phys,
4473 .free = intel_iommu_domain_free,
4474 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
4475};
4476
4477const struct iommu_domain_ops intel_ss_paging_domain_ops = {
4478 .attach_dev = intel_iommu_attach_device,
4479 .set_dev_pasid = intel_iommu_set_dev_pasid,
4480 .map_pages = intel_iommu_map_pages,
4481 .unmap_pages = intel_iommu_unmap_pages,
4482 .iotlb_sync_map = intel_iommu_iotlb_sync_map,
4483 .flush_iotlb_all = intel_flush_iotlb_all,
4484 .iotlb_sync = intel_iommu_tlb_sync,
4485 .iova_to_phys = intel_iommu_iova_to_phys,
4486 .free = intel_iommu_domain_free,
4487 .enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
4488};
4489
4490const struct iommu_ops intel_iommu_ops = {
4491 .blocked_domain = &blocking_domain,
4492 .release_domain = &blocking_domain,
4493 .identity_domain = &identity_domain,
4494 .capable = intel_iommu_capable,
4495 .hw_info = intel_iommu_hw_info,
4496 .domain_alloc_paging_flags = intel_iommu_domain_alloc_paging_flags,
4497 .domain_alloc_sva = intel_svm_domain_alloc,
4498 .domain_alloc_nested = intel_iommu_domain_alloc_nested,
4499 .probe_device = intel_iommu_probe_device,
4500 .probe_finalize = intel_iommu_probe_finalize,
4501 .release_device = intel_iommu_release_device,
4502 .get_resv_regions = intel_iommu_get_resv_regions,
4503 .device_group = intel_iommu_device_group,
4504 .is_attach_deferred = intel_iommu_is_attach_deferred,
4505 .def_domain_type = device_def_domain_type,
4506 .page_response = intel_iommu_page_response,
4507};
4508
4509static void quirk_iommu_igfx(struct pci_dev *dev)
4510{
4511 if (risky_device(dev))
4512 return;
4513
4514 pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4515 disable_igfx_iommu = 1;
4516}
4517
4518/* G4x/GM45 integrated gfx dmar support is totally busted. */
4519DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4520DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4521DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4522DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4523DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4524DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4525DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4526
4527/* QM57/QS57 integrated gfx malfunctions with dmar */
4528DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_iommu_igfx);
4529
4530/* Broadwell igfx malfunctions with dmar */
4531DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4532DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4533DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4534DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4535DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4536DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4537DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4538DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4539DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4540DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4541DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4542DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4543DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4544DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4545DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4546DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4547DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4548DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4549DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4550DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4551DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4552DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4553DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4554DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4555
4556static void quirk_iommu_rwbf(struct pci_dev *dev)
4557{
4558 if (risky_device(dev))
4559 return;
4560
4561 /*
4562 * Mobile 4 Series Chipset neglects to set RWBF capability,
4563 * but needs it. Same seems to hold for the desktop versions.
4564 */
4565 pci_info(dev, "Forcing write-buffer flush capability\n");
4566 rwbf_quirk = 1;
4567}
4568
4569DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4570DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4571DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4572DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4573DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4574DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4575DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4576
4577#define GGC 0x52
4578#define GGC_MEMORY_SIZE_MASK (0xf << 8)
4579#define GGC_MEMORY_SIZE_NONE (0x0 << 8)
4580#define GGC_MEMORY_SIZE_1M (0x1 << 8)
4581#define GGC_MEMORY_SIZE_2M (0x3 << 8)
4582#define GGC_MEMORY_VT_ENABLED (0x8 << 8)
4583#define GGC_MEMORY_SIZE_2M_VT (0x9 << 8)
4584#define GGC_MEMORY_SIZE_3M_VT (0xa << 8)
4585#define GGC_MEMORY_SIZE_4M_VT (0xb << 8)
4586
4587static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4588{
4589 unsigned short ggc;
4590
4591 if (risky_device(dev))
4592 return;
4593
4594 if (pci_read_config_word(dev, GGC, &ggc))
4595 return;
4596
4597 if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4598 pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4599 disable_igfx_iommu = 1;
4600 } else if (!disable_igfx_iommu) {
4601 /* we have to ensure the gfx device is idle before we flush */
4602 pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4603 iommu_set_dma_strict();
4604 }
4605}
4606DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4607DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4608DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4609
4610static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4611{
4612 unsigned short ver;
4613
4614 if (!IS_GFX_DEVICE(dev))
4615 return;
4616
4617 ver = (dev->device >> 8) & 0xff;
4618 if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4619 ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4620 ver != 0x9a && ver != 0xa7 && ver != 0x7d)
4621 return;
4622
4623 if (risky_device(dev))
4624 return;
4625
4626 pci_info(dev, "Skip IOMMU disabling for graphics\n");
4627 iommu_skip_te_disable = 1;
4628}
4629DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4630
4631/* On Tylersburg chipsets, some BIOSes have been known to enable the
4632 ISOCH DMAR unit for the Azalia sound device, but not give it any
4633 TLB entries, which causes it to deadlock. Check for that. We do
4634 this in a function called from init_dmars(), instead of in a PCI
4635 quirk, because we don't want to print the obnoxious "BIOS broken"
4636 message if VT-d is actually disabled.
4637*/
4638static void __init check_tylersburg_isoch(void)
4639{
4640 struct pci_dev *pdev;
4641 uint32_t vtisochctrl;
4642
4643 /* If there's no Azalia in the system anyway, forget it. */
4644 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4645 if (!pdev)
4646 return;
4647
4648 if (risky_device(pdev)) {
4649 pci_dev_put(pdev);
4650 return;
4651 }
4652
4653 pci_dev_put(pdev);
4654
4655 /* System Management Registers. Might be hidden, in which case
4656 we can't do the sanity check. But that's OK, because the
4657 known-broken BIOSes _don't_ actually hide it, so far. */
4658 pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4659 if (!pdev)
4660 return;
4661
4662 if (risky_device(pdev)) {
4663 pci_dev_put(pdev);
4664 return;
4665 }
4666
4667 if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4668 pci_dev_put(pdev);
4669 return;
4670 }
4671
4672 pci_dev_put(pdev);
4673
4674 /* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4675 if (vtisochctrl & 1)
4676 return;
4677
4678 /* Drop all bits other than the number of TLB entries */
4679 vtisochctrl &= 0x1c;
4680
4681 /* If we have the recommended number of TLB entries (16), fine. */
4682 if (vtisochctrl == 0x10)
4683 return;
4684
4685 /* Zero TLB entries? You get to ride the short bus to school. */
4686 if (!vtisochctrl) {
4687 WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4688 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4689 dmi_get_system_info(DMI_BIOS_VENDOR),
4690 dmi_get_system_info(DMI_BIOS_VERSION),
4691 dmi_get_system_info(DMI_PRODUCT_VERSION));
4692 iommu_identity_mapping |= IDENTMAP_AZALIA;
4693 return;
4694 }
4695
4696 pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4697 vtisochctrl);
4698}
4699
4700/*
4701 * Here we deal with a device TLB defect where a device may inadvertently issue an
4702 * ATS invalidation completion before posted writes initiated with a translated
4703 * address that utilized translations matching the invalidation address range, violating
4704 * the invalidation completion ordering.
4705 * Therefore, any use cases that cannot guarantee DMA is stopped before unmap is
4706 * vulnerable to this defect. In other words, any dTLB invalidation initiated not
4707 * under the control of the trusted/privileged host device driver must use this
4708 * quirk.
4709 * Device TLBs are invalidated under the following six conditions:
4710 * 1. Device driver does DMA API unmap IOVA
4711 * 2. Device driver unbinds a PASID from a process, sva_unbind_device()
4712 * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4713 * exit_mmap() due to crash
4714 * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4715 * VM has to free pages that were unmapped
4716 * 5. Userspace driver unmaps a DMA buffer
4717 * 6. Cache invalidation in vSVA usage (upcoming)
4718 *
4719 * For #1 and #2, device drivers are responsible for stopping DMA traffic
4720 * before unmap/unbind. For #3, the iommu driver gets an mmu_notifier to
4721 * invalidate the TLB the same way as a normal user unmap, which will use this
4722 * quirk. The dTLB invalidation after the PASID cache flush does not need this quirk.
4723 *
4724 * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4725 */
4726void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4727 unsigned long address, unsigned long mask,
4728 u32 pasid, u16 qdep)
4729{
4730 u16 sid;
4731
4732 if (likely(!info->dtlb_extra_inval))
4733 return;
4734
4735 sid = PCI_DEVID(info->bus, info->devfn);
4736 if (pasid == IOMMU_NO_PASID) {
4737 qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4738 qdep, address, mask);
4739 } else {
4740 qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4741 pasid, qdep, address, mask);
4742 }
4743}
4744
4745#define ecmd_get_status_code(res) (((res) & 0xff) >> 1)
4746
4747/*
4748 * Function to submit a command to the enhanced command interface. The
4749 * valid enhanced command descriptions are defined in Table 47 of the
4750 * VT-d spec. The VT-d hardware implementation may support some but not
4751 * all commands, which can be determined by checking the Enhanced
4752 * Command Capability Register.
4753 *
4754 * Return values:
4755 * - 0: Command successful without any error;
4756 * - Negative: software error value;
4757 * - Nonzero positive: failure status code defined in Table 48.
4758 */
4759int ecmd_submit_sync(struct intel_iommu *iommu, u8 ecmd, u64 oa, u64 ob)
4760{
4761 unsigned long flags;
4762 u64 res;
4763 int ret;
4764
4765 if (!cap_ecmds(iommu->cap))
4766 return -ENODEV;
4767
4768 raw_spin_lock_irqsave(&iommu->register_lock, flags);
4769
4770 res = dmar_readq(iommu->reg + DMAR_ECRSP_REG);
4771 if (res & DMA_ECMD_ECRSP_IP) {
4772 ret = -EBUSY;
4773 goto err;
4774 }
4775
4776 /*
4777 * Unconditionally write operand B, because:
4778 * - There is no side effect if an ecmd doesn't require an
4779 * operand B, yet we still set the register to some value.
4780 * - This is not invoked in any critical path, so the extra MMIO
4781 * write doesn't raise any performance concerns.
4782 */
4783 dmar_writeq(iommu->reg + DMAR_ECEO_REG, ob);
4784 dmar_writeq(iommu->reg + DMAR_ECMD_REG, ecmd | (oa << DMA_ECMD_OA_SHIFT));
4785
4786 IOMMU_WAIT_OP(iommu, DMAR_ECRSP_REG, dmar_readq,
4787 !(res & DMA_ECMD_ECRSP_IP), res);
4788
4789 if (res & DMA_ECMD_ECRSP_IP) {
4790 ret = -ETIMEDOUT;
4791 goto err;
4792 }
4793
4794 ret = ecmd_get_status_code(res);
4795err:
4796 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
4797
4798 return ret;
4799}
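/*
 * Illustrative only (not a caller in this file): users of ecmd_submit_sync()
 * are expected to distinguish the three classes of return value documented
 * above, e.g.:
 *
 *	ret = ecmd_submit_sync(iommu, ecmd, oa, 0);
 *	if (ret < 0)
 *		; // software error such as -ENODEV, -EBUSY or -ETIMEDOUT
 *	else if (ret > 0)
 *		; // failure status code reported by hardware (Table 48)
 *	else
 *		; // command completed successfully
 */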
4800