📄 intel-iommu.c
/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
 * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 */

#include <linux/init.h>
#include <linux/bitmap.h>
#include <linux/slab.h>
#include <linux/irq.h>
#include <linux/interrupt.h>
#include <linux/sysdev.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/dmar.h>
#include <linux/dma-mapping.h>
#include <linux/mempool.h>
#include "iova.h"
#include "intel-iommu.h"
#include <asm/proto.h> /* force_iommu in this header in x86-64 */
#include <asm/cacheflush.h>
#include <asm/gart.h>
#include "pci.h"

#define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
#define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */

#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)

static void domain_remove_dev_info(struct dmar_domain *domain);

static int dmar_disabled;
static int __initdata dmar_map_gfx = 1;
static int dmar_forcedac;

#define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
static DEFINE_SPINLOCK(device_domain_lock);
static LIST_HEAD(device_domain_list);

static int __init intel_iommu_setup(char *str)
{
	if (!str)
		return -EINVAL;
	while (*str) {
		if (!strncmp(str, "off", 3)) {
			dmar_disabled = 1;
			printk(KERN_INFO"Intel-IOMMU: disabled\n");
		} else if (!strncmp(str, "igfx_off", 8)) {
			dmar_map_gfx = 0;
			printk(KERN_INFO
				"Intel-IOMMU: disable GFX device mapping\n");
		} else if (!strncmp(str, "forcedac", 8)) {
			printk(KERN_INFO
				"Intel-IOMMU: Forcing DAC for PCI devices\n");
			dmar_forcedac = 1;
		}

		str += strcspn(str, ",");
		while (*str == ',')
			str++;
	}
	return 0;
}
__setup("intel_iommu=", intel_iommu_setup);

static struct kmem_cache *iommu_domain_cache;
static struct kmem_cache *iommu_devinfo_cache;
static struct kmem_cache *iommu_iova_cache;

static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}

static inline void *alloc_pgtable_page(void)
{
	unsigned int flags;
	void *vaddr;

	/* trying to avoid low memory issues */
	flags = current->flags & PF_MEMALLOC;
	current->flags |= PF_MEMALLOC;
	vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
	current->flags &= (~PF_MEMALLOC | flags);
	return vaddr;
}

static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}

static inline void *alloc_domain_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_domain_cache);
}

static inline void free_domain_mem(void *vaddr)
{
	kmem_cache_free(iommu_domain_cache, vaddr);
}
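/*
 * Note added for readability (not in the original source): the helpers above
 * and below all follow one pattern -- allocate from a dedicated kmem_cache
 * (or a zeroed page) with GFP_ATOMIC while temporarily raising PF_MEMALLOC
 * on the current task so the allocation may dip into emergency reserves,
 * then restore the caller's original PF_MEMALLOC bit. Each object type
 * (domain, device_domain_info, iova) gets its own alloc/free pair, e.g.
 * alloc_domain_mem()/free_domain_mem() above.
 */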
static inline void *alloc_devinfo_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_devinfo_cache);
}

static inline void free_devinfo_mem(void *vaddr)
{
	kmem_cache_free(iommu_devinfo_cache, vaddr);
}

struct iova *alloc_iova_mem(void)
{
	return iommu_kmem_cache_alloc(iommu_iova_cache);
}

void free_iova_mem(struct iova *iova)
{
	kmem_cache_free(iommu_iova_cache, iova);
}

static inline void __iommu_flush_cache(
	struct intel_iommu *iommu, void *addr, int size)
{
	if (!ecap_coherent(iommu->ecap))
		clflush_cache_range(addr, size);
}

/* Gets context entry for a given bus and devfn */
static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
		u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long phy_addr;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		context = (struct context_entry *)alloc_pgtable_page();
		if (!context) {
			spin_unlock_irqrestore(&iommu->lock, flags);
			return NULL;
		}
		__iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
		phy_addr = virt_to_phys((void *)context);
		set_root_value(root, phy_addr);
		set_root_present(root);
		__iommu_flush_cache(iommu, root, sizeof(*root));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
	return &context[devfn];
}

static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	int ret;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (!context) {
		ret = 0;
		goto out;
	}
	ret = context_present(context[devfn]);
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
	return ret;
}

static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	struct root_entry *root;
	struct context_entry *context;
	unsigned long flags;

	spin_lock_irqsave(&iommu->lock, flags);
	root = &iommu->root_entry[bus];
	context = get_context_addr_from_root(root);
	if (context) {
		context_clear_entry(context[devfn]);
		__iommu_flush_cache(iommu, &context[devfn],
			sizeof(*context));
	}
	spin_unlock_irqrestore(&iommu->lock, flags);
}

static void free_context_table(struct intel_iommu *iommu)
{
	struct root_entry *root;
	int i;
	unsigned long flags;
	struct context_entry *context;

	spin_lock_irqsave(&iommu->lock, flags);
	if (!iommu->root_entry)
		goto out;
	for (i = 0; i < ROOT_ENTRY_NR; i++) {
		root = &iommu->root_entry[i];
		context = get_context_addr_from_root(root);
		if (context)
			free_pgtable_page(context);
	}
	free_pgtable_page(iommu->root_entry);
	iommu->root_entry = NULL;
out:
	spin_unlock_irqrestore(&iommu->lock, flags);
}

/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

static inline int agaw_to_width(int agaw)
{
	return 30 + agaw * LEVEL_STRIDE;
}

static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}

static inline unsigned int level_to_offset_bits(int level)
{
	return (12 + (level - 1) * LEVEL_STRIDE);
}

static inline int address_level_offset(u64 addr, int level)
{
	return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
}

static inline u64 level_mask(int level)
{
	return ((u64)-1 << level_to_offset_bits(level));
}

static inline u64 level_size(int level)
{
	return ((u64)1 << level_to_offset_bits(level));
}

static inline u64 align_to_level(u64 addr, int level)
{
	return ((addr + level_size(level) - 1) & level_mask(level));
}
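/*
 * Worked example (added note, not in the original source): with the default
 * DEFAULT_DOMAIN_ADDRESS_WIDTH of 48, width_to_agaw(48) = (48 - 30) / 9 = 2
 * and agaw_to_level(2) = 4, i.e. a 48-bit domain walks a 4-level table.
 * level_to_offset_bits(4) = 12 + 3 * 9 = 39, so the level-4 index is
 * (addr >> 39) & 0x1ff, while the level-1 index is (addr >> 12) & 0x1ff.
 * level_size(2) = 1 << 21 (2MB), and align_to_level(addr, 2) rounds addr up
 * to the next 2MB boundary.
 */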
static struct dma_pte *addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *parent, *pte = NULL;
	int level = agaw_to_level(domain->agaw);
	int offset;
	unsigned long flags;

	BUG_ON(!domain->pgd);

	addr &= (((u64)1) << addr_width) - 1;
	parent = domain->pgd;

	spin_lock_irqsave(&domain->mapping_lock, flags);
	while (level > 0) {
		void *tmp_page;

		offset = address_level_offset(addr, level);
		pte = &parent[offset];
		if (level == 1)
			break;

		if (!dma_pte_present(*pte)) {
			tmp_page = alloc_pgtable_page();

			if (!tmp_page) {
				spin_unlock_irqrestore(&domain->mapping_lock,
					flags);
				return NULL;
			}
			__iommu_flush_cache(domain->iommu, tmp_page,
					PAGE_SIZE_4K);
			dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
			/*
			 * high level table always sets r/w, last level page
			 * table control read/write
			 */
			dma_set_pte_readable(*pte);
			dma_set_pte_writable(*pte);
			__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
		}
		parent = phys_to_virt(dma_pte_addr(*pte));
		level--;
	}

	spin_unlock_irqrestore(&domain->mapping_lock, flags);
	return pte;
}

/* return address's pte at specific level */
static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
		int level)
{
	struct dma_pte *parent, *pte = NULL;
	int total = agaw_to_level(domain->agaw);
	int offset;

	parent = domain->pgd;
	while (level <= total) {
		offset = address_level_offset(addr, total);
		pte = &parent[offset];
		if (level == total)
			return pte;

		if (!dma_pte_present(*pte))
			break;
		parent = phys_to_virt(dma_pte_addr(*pte));
		total--;
	}
	return NULL;
}

/* clear one page's page table */
static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
{
	struct dma_pte *pte = NULL;

	/* get last level pte */
	pte = dma_addr_level_pte(domain, addr, 1);

	if (pte) {
		dma_clear_pte(*pte);
		__iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
	}
}

/* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;
	/* in case it's partial page */
	start = PAGE_ALIGN_4K(start);
	end &= PAGE_MASK_4K;

	/* we don't need lock here, nobody else touches the iova range */
	while (start < end) {
		dma_pte_clear_one(domain, start);
		start += PAGE_SIZE_4K;
	}
}
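/*
 * Added note (not in the original source): dma_pte_clear_range() only touches
 * pages that are fully covered by [start, end). For example, start = 0x1234
 * and end = 0x5678 become start = PAGE_ALIGN_4K(0x1234) = 0x2000 and
 * end = 0x5678 & PAGE_MASK_4K = 0x5000, so last-level PTEs are cleared for
 * 0x2000, 0x3000 and 0x4000 only; the caller is expected to follow up with
 * an IOTLB flush.
 */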
/* free page table pages. last level pte should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
	u64 start, u64 end)
{
	int addr_width = agaw_to_width(domain->agaw);
	struct dma_pte *pte;
	int total = agaw_to_level(domain->agaw);
	int level;
	u64 tmp;

	start &= (((u64)1) << addr_width) - 1;
	end &= (((u64)1) << addr_width) - 1;

	/* we don't need lock here, nobody else touches the iova range */
	level = 2;
	while (level <= total) {
		tmp = align_to_level(start, level);
		if (tmp >= end || (tmp + level_size(level) > end))
			return;

		while (tmp < end) {
			pte = dma_addr_level_pte(domain, tmp, level);
			if (pte) {
				free_pgtable_page(
					phys_to_virt(dma_pte_addr(*pte)));
				dma_clear_pte(*pte);
				__iommu_flush_cache(domain->iommu,
						pte, sizeof(*pte));
			}
			tmp += level_size(level);
		}
		level++;
	}
	/* free pgd */
	if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
		free_pgtable_page(domain->pgd);
		domain->pgd = NULL;
	}
}

/* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
{
	struct root_entry *root;
	unsigned long flags;

	root = (struct root_entry *)alloc_pgtable_page();
	if (!root)
		return -ENOMEM;

	__iommu_flush_cache(iommu, root, PAGE_SIZE_4K);

	spin_lock_irqsave(&iommu->lock, flags);
	iommu->root_entry = root;
	spin_unlock_irqrestore(&iommu->lock, flags);

	return 0;
}

#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
{\
	unsigned long start_time = jiffies;\
	while (1) {\
		sts = op(iommu->reg + offset);\
		if (cond)\
			break;\
		if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
			panic("DMAR hardware is malfunctioning\n");\
		cpu_relax();\
	}\
}

static void iommu_set_root_entry(struct intel_iommu *iommu)
{
	void *addr;
	u32 cmd, sts;
	unsigned long flag;

	addr = iommu->root_entry;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));

	cmd = iommu->gcmd | DMA_GCMD_SRTP;
	writel(cmd, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
		readl, (sts & DMA_GSTS_RTPS), sts);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}

static void iommu_flush_write_buffer(struct intel_iommu *iommu)
{
	u32 val;
	unsigned long flag;

	if (!cap_rwbf(iommu->cap))
		return;
	val = iommu->gcmd | DMA_GCMD_WBF;

	spin_lock_irqsave(&iommu->register_lock, flag);
	writel(val, iommu->reg + DMAR_GCMD_REG);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
			readl, (!(val & DMA_GSTS_WBFS)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);
}
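/*
 * Added note (not in the original source): IOMMU_WAIT_OP() busy-polls a DMAR
 * register with the supplied accessor until the condition holds, and panics
 * if the hardware has not responded within DMAR_OPERATION_TIMEOUT (one
 * minute). For instance, iommu_set_root_entry() above uses
 *
 *	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, readl, (sts & DMA_GSTS_RTPS), sts);
 *
 * to wait for the Root Table Pointer Status bit after issuing DMA_GCMD_SRTP.
 */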
/* return value determines if we need a write buffer flush */
static int __iommu_flush_context(struct intel_iommu *iommu,
	u16 did, u16 source_id, u8 function_mask, u64 type,
	int non_present_entry_flush)
{
	u64 val = 0;
	unsigned long flag;

	/*
	 * In the non-present entry flush case, if hardware doesn't cache
	 * non-present entry we do nothing and if hardware cache non-present
	 * entry, we flush entries of domain 0 (the domain id is used to cache
	 * any non-present entries)
	 */
	if (non_present_entry_flush) {
		if (!cap_caching_mode(iommu->cap))
			return 1;
		else
			did = 0;
	}

	switch (type) {
	case DMA_CCMD_GLOBAL_INVL:
		val = DMA_CCMD_GLOBAL_INVL;
		break;
	case DMA_CCMD_DOMAIN_INVL:
		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
		break;
	case DMA_CCMD_DEVICE_INVL:
		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
		break;
	default:
		BUG();
	}
	val |= DMA_CCMD_ICC;

	spin_lock_irqsave(&iommu->register_lock, flag);
	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);

	/* Make sure hardware complete it */
	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
		dmar_readq, (!(val & DMA_CCMD_ICC)), val);

	spin_unlock_irqrestore(&iommu->register_lock, flag);

	/* flush context entry will implicitly flush write buffer */
	return 0;
}

static int inline iommu_flush_context_global(struct intel_iommu *iommu,
	int non_present_entry_flush)
{
	return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
		non_present_entry_flush);
}

static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
	int non_present_entry_flush)
{
	return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
		non_present_entry_flush);
}
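/*
 * Added usage note (not in the original source): callers pick the wrapper
 * matching the invalidation scope, e.g. iommu_flush_context_global(iommu, 0)
 * for a full context-cache flush or iommu_flush_context_domain(iommu, did, 0)
 * for a single domain. When non_present_entry_flush is set and the hardware
 * does not cache non-present entries (caching mode clear),
 * __iommu_flush_context() returns 1 to tell the caller that a write-buffer
 * flush is still required; a return of 0 means the context flush already
 * covered it.
 */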