vmalloc.c
/*
 * linux/mm/vmalloc.c
 *
 * Copyright (C) 1993 Linus Torvalds
 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 * Numa awareness, Christoph Lameter, SGI, June 2005
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/bootmem.h>

#include <asm/atomic.h>
#include <asm/uaccess.h>
#include <asm/tlbflush.h>


/*** Page table manipulation functions ***/

static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next);
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_page_range(unsigned long addr, unsigned long end)
{
	pgd_t *pgd;
	unsigned long next;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_pud_range(pgd, addr, next);
	} while (pgd++, addr = next, addr != end);
}

static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel(pmd, addr);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	return 0;
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc(&init_mm, pud, addr);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc(&init_mm, pgd, addr);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

/*
 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
 * will have pfns corresponding to the "pages" array.
 *
 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
 */
static int vmap_page_range(unsigned long start, unsigned long end,
				pgprot_t prot, struct page **pages)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long addr = start;
	int err = 0;
	int nr = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
		if (err)
			break;
	} while (pgd++, addr = next, addr != end);
	flush_cache_vmap(start, end);

	if (unlikely(err))
		return err;
	return nr;
}

static inline int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

/*
 * Walk a vmap address to the struct page it maps.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (!pgd_none(*pgd)) {
		pud_t *pud = pud_offset(pgd, addr);
		if (!pud_none(*pud)) {
			pmd_t *pmd = pmd_offset(pud, addr);
			if (!pmd_none(*pmd)) {
				pte_t *ptep, pte;

				ptep = pte_offset_map(pmd, addr);
				pte = *ptep;
				if (pte_present(pte))
					page = pte_page(pte);
				pte_unmap(ptep);
			}
		}
	}
	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

#define VM_LAZY_FREE	0x01
#define VM_LAZY_FREEING	0x02
#define VM_VM_AREA	0x04

struct vmap_area {
	unsigned long va_start;
	unsigned long va_end;
	unsigned long flags;
	struct rb_node rb_node;		/* address sorted rbtree */
	struct list_head list;		/* address sorted list */
	struct list_head purge_list;	/* "lazy purge" list */
	void *private;
	struct rcu_head rcu_head;
};

static DEFINE_SPINLOCK(vmap_area_lock);
static struct rb_root vmap_area_root = RB_ROOT;
static LIST_HEAD(vmap_area_list);

static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr > va->va_start)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

static void __insert_vmap_area(struct vmap_area *va)
{
	struct rb_node **p = &vmap_area_root.rb_node;
	struct rb_node *parent = NULL;
	struct rb_node *tmp;

	while (*p) {
		struct vmap_area *tmp;

		parent = *p;
		tmp = rb_entry(parent, struct vmap_area, rb_node);
		if (va->va_start < tmp->va_end)
			p = &(*p)->rb_left;
		else if (va->va_end > tmp->va_start)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&va->rb_node, parent, p);
	rb_insert_color(&va->rb_node, &vmap_area_root);

	/* address-sort this list so it is usable like the vmlist */
	tmp = rb_prev(&va->rb_node);
	if (tmp) {
		struct vmap_area *prev;
		prev = rb_entry(tmp, struct vmap_area, rb_node);
		list_add_rcu(&va->list, &prev->list);
	} else
		list_add_rcu(&va->list, &vmap_area_list);
}

static void purge_vmap_area_lazy(void);

/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	struct rb_node *n;
	unsigned long addr;
	int purged = 0;

	BUG_ON(!size);
	BUG_ON(size & ~PAGE_MASK);

	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

retry:
	addr = ALIGN(vstart, align);

	spin_lock(&vmap_area_lock);
	if (addr + size - 1 < addr)
		goto overflow;

	/* XXX: could have a last_hole cache */
	n = vmap_area_root.rb_node;
	if (n) {
		struct vmap_area *first = NULL;

		do {
			struct vmap_area *tmp;
			tmp = rb_entry(n, struct vmap_area, rb_node);
			if (tmp->va_end >= addr) {
				if (!first && tmp->va_start < addr + size)
					first = tmp;
				n = n->rb_left;
			} else {
				first = tmp;
				n = n->rb_right;
			}
		} while (n);

		if (!first)
			goto found;

		if (first->va_end < addr) {
			n = rb_next(&first->rb_node);
			if (n)
				first = rb_entry(n, struct vmap_area, rb_node);
			else
				goto found;
		}

		while (addr + size > first->va_start && addr + size <= vend) {
			addr = ALIGN(first->va_end + PAGE_SIZE, align);
			if (addr + size - 1 < addr)
				goto overflow;

			n = rb_next(&first->rb_node);
			if (n)
				first = rb_entry(n, struct vmap_area, rb_node);
			else
				goto found;
		}
	}
found:
	if (addr + size > vend) {
overflow:
		spin_unlock(&vmap_area_lock);
		if (!purged) {
			purge_vmap_area_lazy();
			purged = 1;
			goto retry;
		}
		if (printk_ratelimit())
			printk(KERN_WARNING
				"vmap allocation for size %lu failed: "
				"use vmalloc=<size> to increase size.\n", size);
		return ERR_PTR(-EBUSY);
	}

	BUG_ON(addr & (align-1));

	va->va_start = addr;
	va->va_end = addr + size;
	va->flags = 0;
	__insert_vmap_area(va);
	spin_unlock(&vmap_area_lock);

	return va;
}

static void rcu_free_va(struct rcu_head *head)
{
	struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);

	kfree(va);
}

static void __free_vmap_area(struct vmap_area *va)
{
	BUG_ON(RB_EMPTY_NODE(&va->rb_node));
	rb_erase(&va->rb_node, &vmap_area_root);
	RB_CLEAR_NODE(&va->rb_node);
	list_del_rcu(&va->list);

	call_rcu(&va->rcu_head, rcu_free_va);
}

/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	spin_lock(&vmap_area_lock);
	__free_vmap_area(va);
	spin_unlock(&vmap_area_lock);
}

/*
 * Clear the pagetable entries of a given vmap_area
 */
static void unmap_vmap_area(struct vmap_area *va)
{
	vunmap_page_range(va->va_start, va->va_end);
}

static void vmap_debug_free_range(unsigned long start, unsigned long end)
{
	/*
	 * Unmap page tables and force a TLB flush immediately if
	 * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
	 * bugs similarly to those in linear kernel virtual address
	 * space after a page has been freed.
	 *
	 * All the lazy freeing logic is still retained, in order to
	 * minimise intrusiveness of this debugging feature.
	 *
	 * This is going to be *slow* (linear kernel virtual address
	 * debugging doesn't do a broadcast TLB flush so it is a lot
	 * faster).
	 */
#ifdef CONFIG_DEBUG_PAGEALLOC
	vunmap_page_range(start, end);
	flush_tlb_kernel_range(start, end);
#endif
}

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log;

	log = fls(num_online_cpus());

	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}

static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);

/*
 * Purges all lazily-freed vmap areas.
 *
 * If sync is 0 then don't purge if there is already a purge in progress.
 * If force_flush is 1, then flush kernel TLBs between *start and *end even
 * if we found no lazy vmap areas to unmap (callers can use this to optimise
 * their own TLB flushing).
 * Returns with *start = min(*start, lowest purged address)
 *              *end = max(*end, highest purged address)
 */
static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
					int sync, int force_flush)
{
	static DEFINE_SPINLOCK(purge_lock);
	LIST_HEAD(valist);
	struct vmap_area *va;
	struct vmap_area *n_va;
	int nr = 0;

	/*
	 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
	 * should not expect such behaviour. This just simplifies locking for
	 * the case that isn't actually used at the moment anyway.
	 */
	if (!sync && !force_flush) {
		if (!spin_trylock(&purge_lock))
			return;
	} else
		spin_lock(&purge_lock);

	rcu_read_lock();
	list_for_each_entry_rcu(va, &vmap_area_list, list) {
		if (va->flags & VM_LAZY_FREE) {
			if (va->va_start < *start)
				*start = va->va_start;
			if (va->va_end > *end)
				*end = va->va_end;
			nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
			unmap_vmap_area(va);
			list_add_tail(&va->purge_list, &valist);
			va->flags |= VM_LAZY_FREEING;
			va->flags &= ~VM_LAZY_FREE;
		}
	}
	rcu_read_unlock();

	if (nr) {
		BUG_ON(nr > atomic_read(&vmap_lazy_nr));
		atomic_sub(nr, &vmap_lazy_nr);
	}

	if (nr || force_flush)
		flush_tlb_kernel_range(*start, *end);

	if (nr) {
		spin_lock(&vmap_area_lock);
		list_for_each_entry_safe(va, n_va, &valist, purge_list)
			__free_vmap_area(va);
		spin_unlock(&vmap_area_lock);
	}
	spin_unlock(&purge_lock);
}

/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	unsigned long start = ULONG_MAX, end = 0;

	__purge_vmap_area_lazy(&start, &end, 0, 0);
}

/*
 * Kick off a purge of the outstanding lazy areas.
 */
static void purge_vmap_area_lazy(void)
{
	unsigned long start = ULONG_MAX, end = 0;

	__purge_vmap_area_lazy(&start, &end, 1, 0);
}

/*
 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
 * called for the correct range previously.
 */
static void free_unmap_vmap_area_noflush(struct vmap_area *va)
{
	va->flags |= VM_LAZY_FREE;
	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
		try_purge_vmap_area_lazy();
}

/*
 * Free and unmap a vmap area
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	free_unmap_vmap_area_noflush(va);
}

static struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr);
	spin_unlock(&vmap_area_lock);

	return va;
}

static void free_unmap_vmap_area_addr(unsigned long addr)
{
	struct vmap_area *va;

	va = find_vmap_area(addr);
	BUG_ON(!va);
	free_unmap_vmap_area(va);
}
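
To make the log scale used by lazy_max_pages() above concrete, here is a worked example of the arithmetic. The page size and CPU count are assumptions chosen only for illustration; they are not values taken from this file.

/*
 * Illustrative arithmetic only (assuming 4 KB pages and 8 online CPUs):
 *
 *   fls(8)            = 4
 *   32MB / PAGE_SIZE  = (32 * 1024 * 1024) / 4096 = 8192 pages
 *   lazy_max_pages()  = 4 * 8192 = 32768 pages, i.e. 128 MB of lazily
 *                       freed kernel virtual address space may accumulate
 *                       before free_unmap_vmap_area_noflush() triggers
 *                       try_purge_vmap_area_lazy() and a kernel TLB flush.
 */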
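
vmalloc_to_page() and vmalloc_to_pfn() are the only exported functions in this excerpt; everything else is static helper code for the kva allocator. The sketch below is illustrative only and is not part of vmalloc.c: a minimal module that allocates a vmalloc() buffer and walks each page-sized chunk back to its struct page and page frame number using those two helpers. The module name and the number of pages are arbitrary choices for the example.

/* vmap_walk_example.c -- illustrative sketch, not part of the kernel tree */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

static int __init vmap_walk_example_init(void)
{
	void *buf = vmalloc(4 * PAGE_SIZE);	/* virtually contiguous, physically scattered */
	unsigned long off;

	if (!buf)
		return -ENOMEM;

	for (off = 0; off < 4 * PAGE_SIZE; off += PAGE_SIZE) {
		/* Walk the kernel page tables for this vmalloc address. */
		struct page *page = vmalloc_to_page(buf + off);
		unsigned long pfn = vmalloc_to_pfn(buf + off);

		printk(KERN_INFO "va %p -> page %p, pfn %lu\n",
		       buf + off, page, pfn);
	}

	vfree(buf);
	return 0;
}

static void __exit vmap_walk_example_exit(void)
{
}

module_init(vmap_walk_example_init);
module_exit(vmap_walk_example_exit);
MODULE_LICENSE("GPL");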