/* sba_iommu.c */
/*
** IA64 System Bus Adapter (SBA) I/O MMU manager
**
** (c) Copyright 2002-2005 Alex Williamson
** (c) Copyright 2002-2003 Grant Grundler
** (c) Copyright 2002-2005 Hewlett-Packard Company
**
** Portions (c) 2000 Grant Grundler (from parisc I/O MMU code)
** Portions (c) 1999 Dave S. Miller (from sparc64 I/O MMU code)
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
**
** This module initializes the IOC (I/O Controller) found on HP
** McKinley machines and their successors.
**
*/

#include <linux/config.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/pci.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/acpi.h>
#include <linux/efi.h>
#include <linux/nodemask.h>
#include <linux/bitops.h>		/* hweight64() */

#include <asm/delay.h>		/* ia64_get_itc() */
#include <asm/io.h>
#include <asm/page.h>		/* PAGE_OFFSET */
#include <asm/dma.h>
#include <asm/system.h>		/* wmb() */

#include <asm/acpi-ext.h>

#define PFX "IOC: "

/*
** Enabling timing search of the pdir resource map.  Output in /proc.
** Disabled by default to optimize performance.
*/
#undef PDIR_SEARCH_TIMING

/*
** This option allows cards capable of 64bit DMA to bypass the IOMMU.  If
** not defined, all DMA will be 32bit and go through the TLB.
** There's potentially a conflict in the bio merge code with us
** advertising an iommu, but then bypassing it.  Since I/O MMU bypassing
** appears to give more performance than bio-level virtual merging, we'll
** do the former for now.  NOTE: BYPASS_SG also needs to be undef'd to
** completely restrict DMA to the IOMMU.
*/
#define ALLOW_IOV_BYPASS

/*
** This option specifically allows/disallows bypassing scatterlists with
** multiple entries.  Coalescing these entries can allow better DMA streaming
** and in some cases shows better performance than entirely bypassing the
** IOMMU.  Performance increase on the order of 1-2% sequential output/input
** using bonnie++ on a RAID0 MD device (sym2 & mpt).
*/
#undef ALLOW_IOV_BYPASS_SG
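/*
** Illustrative sketch (added for clarity; the guard macro and function
** name are assumptions, not part of the original driver).  For a single
** mapping, the bypass decision reduces to: does the device's DMA mask
** cover every bit of the buffer's physical address?
*/
#ifdef SBA_EXAMPLE_CODE	/* hypothetical guard, never defined */
static int example_can_bypass(u64 phys_addr, u64 dma_mask)
{
	/* true iff no physical address bit falls outside the mask */
	return (phys_addr & ~dma_mask) == 0;
}
#endif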
/*
** If a device prefetches beyond the end of a valid pdir entry, it will cause
** a hard failure, ie. MCA.  Version 3.0 and later of the zx1 LBA should
** disconnect on 4k boundaries and prevent such issues.  If the device is
** particularly aggressive, this option will keep the entire pdir valid such
** that prefetching will hit a valid address.  This could severely impact
** error containment, and is therefore off by default.  The page that is
** used for spill-over is poisoned, so that should help debugging somewhat.
*/
#undef FULL_VALID_PDIR

#define ENABLE_MARK_CLEAN

/*
** The number of debug flags is a clue - this code is fragile.  NOTE: since
** tightening the use of res_lock the resource bitmap and actual pdir are no
** longer guaranteed to stay in sync.  The sanity checking code isn't going to
** like that.
*/
#undef DEBUG_SBA_INIT
#undef DEBUG_SBA_RUN
#undef DEBUG_SBA_RUN_SG
#undef DEBUG_SBA_RESOURCE
#undef ASSERT_PDIR_SANITY
#undef DEBUG_LARGE_SG_ENTRIES
#undef DEBUG_BYPASS

#if defined(FULL_VALID_PDIR) && defined(ASSERT_PDIR_SANITY)
#error FULL_VALID_PDIR and ASSERT_PDIR_SANITY are mutually exclusive
#endif

#define SBA_INLINE	__inline__
/* #define SBA_INLINE */

#ifdef DEBUG_SBA_INIT
#define DBG_INIT(x...)	printk(x)
#else
#define DBG_INIT(x...)
#endif

#ifdef DEBUG_SBA_RUN
#define DBG_RUN(x...)	printk(x)
#else
#define DBG_RUN(x...)
#endif

#ifdef DEBUG_SBA_RUN_SG
#define DBG_RUN_SG(x...)	printk(x)
#else
#define DBG_RUN_SG(x...)
#endif

#ifdef DEBUG_SBA_RESOURCE
#define DBG_RES(x...)	printk(x)
#else
#define DBG_RES(x...)
#endif

#ifdef DEBUG_BYPASS
#define DBG_BYPASS(x...)	printk(x)
#else
#define DBG_BYPASS(x...)
#endif

#ifdef ASSERT_PDIR_SANITY
#define ASSERT(expr) \
	if(!(expr)) { \
		printk( "\n" __FILE__ ":%d: Assertion " #expr " failed!\n",__LINE__); \
		panic(#expr); \
	}
#else
#define ASSERT(expr)
#endif

/*
** The number of pdir entries to "free" before issuing
** a read to PCOM register to flush out PCOM writes.
** Interacts with allocation granularity (ie 4 or 8 entries
** allocated and free'd/purged at a time might make this
** less interesting).
*/
#define DELAYED_RESOURCE_CNT	64

#define PCI_DEVICE_ID_HP_SX2000_IOC	0x12ec

#define ZX1_IOC_ID	((PCI_DEVICE_ID_HP_ZX1_IOC << 16) | PCI_VENDOR_ID_HP)
#define ZX2_IOC_ID	((PCI_DEVICE_ID_HP_ZX2_IOC << 16) | PCI_VENDOR_ID_HP)
#define REO_IOC_ID	((PCI_DEVICE_ID_HP_REO_IOC << 16) | PCI_VENDOR_ID_HP)
#define SX1000_IOC_ID	((PCI_DEVICE_ID_HP_SX1000_IOC << 16) | PCI_VENDOR_ID_HP)
#define SX2000_IOC_ID	((PCI_DEVICE_ID_HP_SX2000_IOC << 16) | PCI_VENDOR_ID_HP)

#define ZX1_IOC_OFFSET	0x1000	/* ACPI reports SBA, we want IOC */

#define IOC_FUNC_ID	0x000
#define IOC_FCLASS	0x008	/* function class, bist, header, rev... */
#define IOC_IBASE	0x300	/* IO TLB */
#define IOC_IMASK	0x308
#define IOC_PCOM	0x310
#define IOC_TCNFG	0x318
#define IOC_PDIR_BASE	0x320

#define IOC_ROPE0_CFG	0x500
#define IOC_ROPE_AO	0x10	/* Allow "Relaxed Ordering" */

/* AGP GART driver looks for this */
#define ZX1_SBA_IOMMU_COOKIE	0x0000badbadc0ffeeUL

/*
** The zx1 IOC supports 4/8/16/64KB page sizes (see TCNFG register)
**
** Some IOCs (sx1000) can run at the above page sizes, but are
** really only supported using the IOC at a 4k page size.
**
** iovp_size could only be greater than PAGE_SIZE if we are
** confident the drivers really only touch the next physical
** page iff that driver instance owns it.
*/
static unsigned long iovp_size;
static unsigned long iovp_shift;
static unsigned long iovp_mask;

struct ioc {
	void __iomem	*ioc_hpa;	/* I/O MMU base address */
	char		*res_map;	/* resource map, bit == pdir entry */
	u64		*pdir_base;	/* physical base address */
	unsigned long	ibase;		/* pdir IOV Space base */
	unsigned long	imask;		/* pdir IOV Space mask */

	unsigned long	*res_hint;	/* next avail IOVP - circular search */
	unsigned long	dma_mask;
	spinlock_t	res_lock;	/* protects the resource bitmap, but must be held when */
					/* clearing pdir to prevent races with allocations. */
	unsigned int	res_bitshift;	/* from the RIGHT! */
	unsigned int	res_size;	/* size of resource map in bytes */
#ifdef CONFIG_NUMA
	unsigned int	node;		/* node where this IOC lives */
#endif
#if DELAYED_RESOURCE_CNT > 0
	spinlock_t	saved_lock;	/* may want to try to get this on a separate cacheline */
					/* than res_lock for bigger systems. */
	int		saved_cnt;
	struct sba_dma_pair {
		dma_addr_t	iova;
		size_t		size;
	} saved[DELAYED_RESOURCE_CNT];
#endif

#ifdef PDIR_SEARCH_TIMING
#define SBA_SEARCH_SAMPLE	0x100
	unsigned long avg_search[SBA_SEARCH_SAMPLE];
	unsigned long avg_idx;	/* current index into avg_search */
#endif

	/* Stuff we don't need in performance path */
	struct ioc	*next;		/* list of IOC's in system */
	acpi_handle	handle;		/* for multiple IOC's */
	const char	*name;
	unsigned int	func_id;
	unsigned int	rev;		/* HW revision of chip */
	u32		iov_size;
	unsigned int	pdir_size;	/* in bytes, determined by IOV Space size */
	struct pci_dev	*sac_only_dev;
};
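/*
** Illustrative sketch (added for clarity; the function name, body and
** locking below are assumptions, not the original unmap path).  With
** DELAYED_RESOURCE_CNT > 0, frees are queued in ioc->saved[] and the
** IO TLB purge is only flushed to hardware once per full batch, so the
** costly read-back of IOC_PCOM is amortized over many unmaps.
*/
#ifdef SBA_EXAMPLE_CODE	/* hypothetical guard, never defined */
static void example_delayed_free(struct ioc *ioc, dma_addr_t iova, size_t size)
{
	struct sba_dma_pair *d;

	spin_lock(&ioc->saved_lock);
	d = &ioc->saved[ioc->saved_cnt];
	d->iova = iova;
	d->size = size;
	if (++ioc->saved_cnt >= DELAYED_RESOURCE_CNT) {
		int cnt = ioc->saved_cnt;
		while (cnt--) {
			/* mark pdir entries invalid + release the bitmap
			** range for each queued (iova, size) pair here */
			d--;
		}
		ioc->saved_cnt = 0;
		__raw_readq(ioc->ioc_hpa + IOC_PCOM);	/* flush posted purges */
	}
	spin_unlock(&ioc->saved_lock);
}
#endif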
static struct ioc *ioc_list;
static int reserve_sba_gart = 1;

static SBA_INLINE void sba_mark_invalid(struct ioc *, dma_addr_t, size_t);
static SBA_INLINE void sba_free_range(struct ioc *, dma_addr_t, size_t);

#define sba_sg_address(sg)	(page_address((sg)->page) + (sg)->offset)

#ifdef FULL_VALID_PDIR
static u64 prefetch_spill_page;
#endif

#ifdef CONFIG_PCI
# define GET_IOC(dev)	(((dev)->bus == &pci_bus_type)						\
			 ? ((struct ioc *) PCI_CONTROLLER(to_pci_dev(dev))->iommu) : NULL)
#else
# define GET_IOC(dev)	NULL
#endif

/*
** DMA_CHUNK_SIZE is used by the SCSI mid-layer to break up
** (or rather not merge) DMAs into manageable chunks.
** On parisc, this is more of the software/tuning constraint
** rather than the HW. I/O MMU allocation algorithms can be
** faster with smaller sizes (to some degree).
*/
#define DMA_CHUNK_SIZE	(BITS_PER_LONG*iovp_size)

#define ROUNDUP(x,y)	((x + ((y)-1)) & ~((y)-1))

/************************************
** SBA register read and write support
**
** BE WARNED: register writes are posted.
**  (ie follow writes which must reach HW with a read)
**
*/
#define READ_REG(addr)		__raw_readq(addr)
#define WRITE_REG(val, addr)	__raw_writeq(val, addr)
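/*
** Illustrative sketch of the warning above (added for clarity; the
** function name is an assumption).  A register write that must reach
** the IOC before the CPU continues is followed by a read from the same
** device to force the posted write out:
*/
#ifdef SBA_EXAMPLE_CODE	/* hypothetical guard, never defined */
static void example_flushed_write(struct ioc *ioc, u64 val)
{
	WRITE_REG(val, ioc->ioc_hpa + IOC_PCOM);	/* posted, may linger */
	READ_REG(ioc->ioc_hpa + IOC_PCOM);		/* read forces it to HW */
}
#endif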
#ifdef DEBUG_SBA_INIT

/**
 * sba_dump_tlb - debugging only - print IOMMU operating parameters
 * @hpa: base address of the IOMMU
 *
 * Print the size/location of the IO MMU PDIR.
 */
static void
sba_dump_tlb(char *hpa)
{
	DBG_INIT("IO TLB at 0x%p\n", (void *)hpa);
	DBG_INIT("IOC_IBASE    : %016lx\n", READ_REG(hpa+IOC_IBASE));
	DBG_INIT("IOC_IMASK    : %016lx\n", READ_REG(hpa+IOC_IMASK));
	DBG_INIT("IOC_TCNFG    : %016lx\n", READ_REG(hpa+IOC_TCNFG));
	DBG_INIT("IOC_PDIR_BASE: %016lx\n", READ_REG(hpa+IOC_PDIR_BASE));
	DBG_INIT("\n");
}
#endif


#ifdef ASSERT_PDIR_SANITY

/**
 * sba_dump_pdir_entry - debugging only - print one IOMMU PDIR entry
 * @ioc: IO MMU structure which owns the pdir we are interested in.
 * @msg: text to print on the output line.
 * @pide: pdir index.
 *
 * Print one entry of the IO MMU PDIR in human readable form.
 */
static void
sba_dump_pdir_entry(struct ioc *ioc, char *msg, uint pide)
{
	/* start printing from lowest pde in rval */
	u64 *ptr = &ioc->pdir_base[pide & ~(BITS_PER_LONG - 1)];
	unsigned long *rptr = (unsigned long *) &ioc->res_map[(pide >> 3) & -sizeof(unsigned long)];
	uint rcnt;

	printk(KERN_DEBUG "SBA: %s rp %p bit %d rval 0x%lx\n",
		 msg, rptr, pide & (BITS_PER_LONG - 1), *rptr);

	rcnt = 0;
	while (rcnt < BITS_PER_LONG) {
		printk(KERN_DEBUG "%s %2d %p %016Lx\n",
		       (rcnt == (pide & (BITS_PER_LONG - 1)))
				? "    -->" : "       ",
		       rcnt, ptr, (unsigned long long) *ptr );
		rcnt++;
		ptr++;
	}
	printk(KERN_DEBUG "%s", msg);
}


/**
 * sba_check_pdir - debugging only - consistency checker
 * @ioc: IO MMU structure which owns the pdir we are interested in.
 * @msg: text to print on the output line.
 *
 * Verify the resource map and pdir state is consistent
 */
static int
sba_check_pdir(struct ioc *ioc, char *msg)
{
	u64 *rptr_end = (u64 *) &(ioc->res_map[ioc->res_size]);
	u64 *rptr = (u64 *) ioc->res_map;	/* resource map ptr */
	u64 *pptr = ioc->pdir_base;	/* pdir ptr */
	uint pide = 0;

	while (rptr < rptr_end) {
		u64 rval;
		int rcnt; /* number of bits we might check */

		rval = *rptr;
		rcnt = 64;
		while (rcnt) {
			/* Get last byte and highest bit from that */
			u32 pde = ((u32)((*pptr >> (63)) & 0x1));
			if ((rval & 0x1) ^ pde)
			{
				/*
				** BUMMER!  -- res_map != pdir --
				** Dump rval and matching pdir entries
				*/
				sba_dump_pdir_entry(ioc, msg, pide);
				return(1);
			}
			rcnt--;
			rval >>= 1;	/* try the next bit */
			pptr++;
			pide++;
		}
		rptr++;	/* look at next word of res_map */
	}
	/* It'd be nice if we always got here :^) */
	return 0;
}


/**
 * sba_dump_sg - debugging only - print Scatter-Gather list
 * @ioc: IO MMU structure which owns the pdir we are interested in.
 * @startsg: head of the SG list
 * @nents: number of entries in SG list
 *
 * print the SG list so we can verify it's correct by hand.
 */
static void
sba_dump_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
{
	while (nents-- > 0) {
		printk(KERN_DEBUG " %d : DMA %08lx/%05x CPU %p\n", nents,
		       startsg->dma_address, startsg->dma_length,
		       sba_sg_address(startsg));
		startsg++;
	}
}

static void
sba_check_sg( struct ioc *ioc, struct scatterlist *startsg, int nents)
{
	struct scatterlist *the_sg = startsg;
	int the_nents = nents;

	while (the_nents-- > 0) {
		if (sba_sg_address(the_sg) == 0x0UL)
			sba_dump_sg(NULL, startsg, nents);
		the_sg++;
	}
}

#endif /* ASSERT_PDIR_SANITY */


/**************************************************************
*
*   I/O Pdir Resource Management
*
*   Bits set in the resource map are in use.
*   Each bit can represent a number of pages.
*   LSbs represent lower addresses (IOVA's).
*
***************************************************************/
#define PAGES_PER_RANGE 1	/* could increase this to 4 or 8 if needed */

/* Convert from IOVP to IOVA and vice versa. */
#define SBA_IOVA(ioc,iovp,offset)	((ioc->ibase) | (iovp) | (offset))
#define SBA_IOVP(ioc,iova)		((iova) & ~(ioc->ibase))

#define PDIR_ENTRY_SIZE	sizeof(u64)

#define PDIR_INDEX(iovp)	((iovp)>>iovp_shift)

#define RESMAP_MASK(n)		~(~0UL << (n))
#define RESMAP_IDX_MASK		(sizeof(unsigned long) - 1)


/**
 * For most cases the normal get_order is sufficient, however it limits us
 * to PAGE_SIZE being the minimum mapping alignment and TC flush granularity.
 * It only incurs about 1 clock cycle to use this one with the static variable
 * and makes the code more intuitive.
 */
static SBA_INLINE int
get_iovp_order (unsigned long size)
{
	long double d = size - 1;
	long order;

	order = ia64_getf_exp(d);
	order = order - iovp_shift - 0xffff + 1;
	if (order < 0)
		order = 0;
	return order;
}
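/*
** Worked example (added for clarity, not in the original source): with
** iovp_shift == 12 (4KB IO pages) and size == 16384, d == 16383.0 and
** ia64_getf_exp() returns the biased exponent 13 + 0xffff, so
** order = (13 + 0xffff) - 12 - 0xffff + 1 = 2, i.e. 1 << 2 == 4 IO pages.
** A portable equivalent of the floating-point trick might look like:
*/
#ifdef SBA_EXAMPLE_CODE	/* hypothetical guard, never defined */
static int example_get_iovp_order(unsigned long size)
{
	unsigned long pages = (size + iovp_size - 1) >> iovp_shift;

	/* ceil(log2(pages)): fls() gives the highest set bit, 1-based */
	return pages > 1 ? fls(pages - 1) : 0;
}
#endif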
/**
 * sba_search_bitmap - find free space in IO PDIR resource bitmap
 * @ioc: IO MMU structure which owns the pdir we are interested in.
 * @bits_wanted: number of entries we need.
 * @use_hint: use res_hint to indicate where to start looking
 *
 * Find consecutive free bits in resource bitmap.
 * Each bit represents one entry in the IO Pdir.
 * Cool perf optimization: search for log2(size) bits at a time.
 */
static SBA_INLINE unsigned long
sba_search_bitmap(struct ioc *ioc, unsigned long bits_wanted, int use_hint)
{
	unsigned long *res_ptr;
	unsigned long *res_end = (unsigned long *) &(ioc->res_map[ioc->res_size]);
	unsigned long flags, pide = ~0UL;

	ASSERT(((unsigned long) ioc->res_hint & (sizeof(unsigned long) - 1UL)) == 0);
	ASSERT(res_ptr < res_end);

	spin_lock_irqsave(&ioc->res_lock, flags);

	/* Allow caller to force a search through the entire resource space */
	if (likely(use_hint)) {
		res_ptr = ioc->res_hint;
	} else {
		res_ptr = (ulong *)ioc->res_map;
		ioc->res_bitshift = 0;
	}

	/*
	 * N.B.  REO/Grande defect AR2305 can cause TLB fetch timeouts
	 * if a TLB entry is purged while in use.  sba_mark_invalid()
	 * purges IOTLB entries in power-of-two sizes, so we also
	 * allocate IOVA space in power-of-two sizes.
	 */
	bits_wanted = 1UL << get_iovp_order(bits_wanted << iovp_shift);

	if (likely(bits_wanted == 1)) {
		unsigned int bitshiftcnt;
		for(; res_ptr < res_end ; res_ptr++) {
			if (likely(*res_ptr != ~0UL)) {
				bitshiftcnt = ffz(*res_ptr);
				*res_ptr |= (1UL << bitshiftcnt);
				pide = ((unsigned long)res_ptr - (unsigned long)ioc->res_map);
				pide <<= 3;	/* convert to bit address */
				pide += bitshiftcnt;
				ioc->res_bitshift = bitshiftcnt + bits_wanted;
				goto found_it;
			}
		}
		goto not_found;
	}

	if (likely(bits_wanted <= BITS_PER_LONG/2)) {
		/*
		** Search the resource bit map on well-aligned values.
		** "o" is the alignment.
		** We need the alignment to invalidate I/O TLB using
		** SBA HW features in the unmap path.
		*/
		unsigned long o = 1 << get_iovp_order(bits_wanted << iovp_shift);
		uint bitshiftcnt = ROUNDUP(ioc->res_bitshift, o);
		unsigned long mask, base_mask;

		base_mask = RESMAP_MASK(bits_wanted);
		mask = base_mask << bitshiftcnt;

		DBG_RES("%s() o %ld %p", __FUNCTION__, o, res_ptr);
		for(; res_ptr < res_end ; res_ptr++)
		{
			DBG_RES("    %p %lx %lx\n", res_ptr, mask, *res_ptr);
			ASSERT(0 != mask);
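			/*
			** Worked example (added for clarity, not in the
			** original source): for bits_wanted == 4,
			** RESMAP_MASK(4) == 0xf and o == 4, so the 4-bit
			** mask is only tried at bit offsets 0, 4, 8, ...
			** Any free run found is therefore naturally aligned
			** on a power-of-two boundary, which is what the
			** unmap path needs to purge the IO TLB with a
			** single power-of-two-sized PCOM write.
			*/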