vmalloc.c
/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE        (VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE           (128UL*1024*1024)
#else
#define VMALLOC_SPACE           (128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES           (VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC          BITS_PER_LONG  /* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX     1024           /* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN     (VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)          ((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)          ((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS         VMAP_MIN(VMAP_BBMAP_BITS_MAX,          \
                                        VMAP_MAX(VMAP_BBMAP_BITS_MIN,  \
                                                VMALLOC_PAGES / NR_CPUS / 16))

#define VMAP_BLOCK_SIZE         (VMAP_BBMAP_BITS * PAGE_SIZE)

static bool vmap_initialized __read_mostly = false;

struct vmap_block_queue {
        spinlock_t lock;
        struct list_head free;
        struct list_head dirty;
        unsigned int nr_dirty;
};

struct vmap_block {
        spinlock_t lock;
        struct vmap_area *va;
        struct vmap_block_queue *vbq;
        unsigned long free, dirty;
        DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
        DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
        union {
                struct {
                        struct list_head free_list;
                        struct list_head dirty_list;
                };
                struct rcu_head rcu_head;
        };
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_SPINLOCK(vmap_block_tree_lock);
static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);

/*
 * We should probably have a fallback mechanism to allocate virtual memory
 * out of partially filled vmap blocks. However vmap block sizing should be
 * fairly reasonable according to the vmalloc size, so it shouldn't be a
 * big problem.
 */
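/*
 * Worked example of the sizing above (illustrative only; assumes 4K pages
 * and NR_CPUS == 4 on a 32 bit kernel):
 *
 *      VMALLOC_PAGES      = 128MB / 4KB          = 32768
 *      per-CPU share      = 32768 / 4 / 16       = 512 pages
 *      VMAP_BBMAP_BITS    = clamp(512, 64, 1024) = 512
 *      VMAP_BLOCK_SIZE    = 512 * 4KB            = 2MB
 *
 * On 64 bit the 128GB guess makes the per-CPU share so large that
 * VMAP_BBMAP_BITS is simply clamped to VMAP_BBMAP_BITS_MAX, giving
 * 4MB blocks with 4K pages.
 */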
static unsigned long addr_to_vb_idx(unsigned long addr)
{
        addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
        addr /= VMAP_BLOCK_SIZE;
        return addr;
}

static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        struct vmap_area *va;
        unsigned long vb_idx;
        int node, err;

        node = numa_node_id();

        vb = kmalloc_node(sizeof(struct vmap_block),
                        gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!vb))
                return ERR_PTR(-ENOMEM);

        va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
                                        VMALLOC_START, VMALLOC_END,
                                        node, gfp_mask);
        if (unlikely(IS_ERR(va))) {
                kfree(vb);
                return ERR_PTR(PTR_ERR(va));
        }

        err = radix_tree_preload(gfp_mask);
        if (unlikely(err)) {
                kfree(vb);
                free_vmap_area(va);
                return ERR_PTR(err);
        }

        spin_lock_init(&vb->lock);
        vb->va = va;
        vb->free = VMAP_BBMAP_BITS;
        vb->dirty = 0;
        bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
        bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
        INIT_LIST_HEAD(&vb->free_list);
        INIT_LIST_HEAD(&vb->dirty_list);

        vb_idx = addr_to_vb_idx(va->va_start);
        spin_lock(&vmap_block_tree_lock);
        err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
        spin_unlock(&vmap_block_tree_lock);
        BUG_ON(err);
        radix_tree_preload_end();

        vbq = &get_cpu_var(vmap_block_queue);
        vb->vbq = vbq;
        spin_lock(&vbq->lock);
        list_add(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);
        put_cpu_var(vmap_block_queue);

        return vb;
}

static void rcu_free_vb(struct rcu_head *head)
{
        struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);

        kfree(vb);
}

static void free_vmap_block(struct vmap_block *vb)
{
        struct vmap_block *tmp;
        unsigned long vb_idx;

        spin_lock(&vb->vbq->lock);
        if (!list_empty(&vb->free_list))
                list_del(&vb->free_list);
        if (!list_empty(&vb->dirty_list))
                list_del(&vb->dirty_list);
        spin_unlock(&vb->vbq->lock);

        vb_idx = addr_to_vb_idx(vb->va->va_start);
        spin_lock(&vmap_block_tree_lock);
        tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
        spin_unlock(&vmap_block_tree_lock);
        BUG_ON(tmp != vb);

        free_unmap_vmap_area_noflush(vb->va);
        call_rcu(&vb->rcu_head, rcu_free_vb);
}

static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
        struct vmap_block_queue *vbq;
        struct vmap_block *vb;
        unsigned long addr = 0;
        unsigned int order;

        BUG_ON(size & ~PAGE_MASK);
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
        order = get_order(size);

again:
        rcu_read_lock();
        vbq = &get_cpu_var(vmap_block_queue);
        list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                int i;

                spin_lock(&vb->lock);
                i = bitmap_find_free_region(vb->alloc_map,
                                                VMAP_BBMAP_BITS, order);

                if (i >= 0) {
                        addr = vb->va->va_start + (i << PAGE_SHIFT);
                        BUG_ON(addr_to_vb_idx(addr) !=
                                        addr_to_vb_idx(vb->va->va_start));
                        vb->free -= 1UL << order;
                        if (vb->free == 0) {
                                spin_lock(&vbq->lock);
                                list_del_init(&vb->free_list);
                                spin_unlock(&vbq->lock);
                        }
                        spin_unlock(&vb->lock);
                        break;
                }
                spin_unlock(&vb->lock);
        }
        put_cpu_var(vmap_block_queue);
        rcu_read_unlock();

        if (!addr) {
                vb = new_vmap_block(gfp_mask);
                if (IS_ERR(vb))
                        return vb;
                goto again;
        }

        return (void *)addr;
}
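/*
 * Granularity note (illustrative): vb_alloc() hands out power-of-two runs
 * of pages. A request for, say, 3 pages is rounded up by get_order() to
 * order 2, so bitmap_find_free_region() carves a naturally aligned 4-bit
 * region out of alloc_map. The caller must pass the same size back to
 * vb_free() below, which recomputes the order to mark the region dirty.
 */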
static void vb_free(const void *addr, unsigned long size)
{
        unsigned long offset;
        unsigned long vb_idx;
        unsigned int order;
        struct vmap_block *vb;

        BUG_ON(size & ~PAGE_MASK);
        BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);

        flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);

        order = get_order(size);

        offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);

        vb_idx = addr_to_vb_idx((unsigned long)addr);
        rcu_read_lock();
        vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
        rcu_read_unlock();
        BUG_ON(!vb);

        spin_lock(&vb->lock);
        bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
        if (!vb->dirty) {
                spin_lock(&vb->vbq->lock);
                list_add(&vb->dirty_list, &vb->vbq->dirty);
                spin_unlock(&vb->vbq->lock);
        }
        vb->dirty += 1UL << order;
        if (vb->dirty == VMAP_BBMAP_BITS) {
                BUG_ON(vb->free || !list_empty(&vb->free_list));
                spin_unlock(&vb->lock);
                free_vmap_block(vb);
        } else
                spin_unlock(&vb->lock);
}

/**
 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
 *
 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 * to amortize TLB flushing overheads. What this means is that any page you
 * have now, may, in a former life, have been mapped into kernel virtual
 * address by the vmap layer and so there might be some CPUs with TLB entries
 * still referencing that page (additional to the regular 1:1 kernel mapping).
 *
 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
 * be sure that none of the pages we have control over will have any aliases
 * from the vmap layer.
 */
void vm_unmap_aliases(void)
{
        unsigned long start = ULONG_MAX, end = 0;
        int cpu;
        int flush = 0;

        if (unlikely(!vmap_initialized))
                return;

        for_each_possible_cpu(cpu) {
                struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
                struct vmap_block *vb;

                rcu_read_lock();
                list_for_each_entry_rcu(vb, &vbq->free, free_list) {
                        int i;

                        spin_lock(&vb->lock);
                        i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
                        while (i < VMAP_BBMAP_BITS) {
                                unsigned long s, e;
                                int j;
                                j = find_next_zero_bit(vb->dirty_map,
                                        VMAP_BBMAP_BITS, i);

                                s = vb->va->va_start + (i << PAGE_SHIFT);
                                e = vb->va->va_start + (j << PAGE_SHIFT);
                                vunmap_page_range(s, e);
                                flush = 1;

                                if (s < start)
                                        start = s;
                                if (e > end)
                                        end = e;

                                i = j;
                                i = find_next_bit(vb->dirty_map,
                                                        VMAP_BBMAP_BITS, i);
                        }
                        spin_unlock(&vb->lock);
                }
                rcu_read_unlock();
        }

        __purge_vmap_area_lazy(&start, &end, 1, flush);
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
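/*
 * Example (hypothetical caller, based on the guarantee documented above):
 * code about to hand pages it mapped through this layer back to the page
 * allocator, or to hardware that bypasses the MMU, can force the lazy
 * unmaps to be flushed first:
 *
 *      vm_unmap_ram(vaddr, npages);    (the unmap is only queued, lazily)
 *      vm_unmap_aliases();             (flush it, and all others, now)
 */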
/**
 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
 * @mem: the pointer returned by vm_map_ram
 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
 */
void vm_unmap_ram(const void *mem, unsigned int count)
{
        unsigned long size = count << PAGE_SHIFT;
        unsigned long addr = (unsigned long)mem;

        BUG_ON(!addr);
        BUG_ON(addr < VMALLOC_START);
        BUG_ON(addr > VMALLOC_END);
        BUG_ON(addr & (PAGE_SIZE-1));

        debug_check_no_locks_freed(mem, size);
        vmap_debug_free_range(addr, addr+size);

        if (likely(count <= VMAP_MAX_ALLOC))
                vb_free(mem, size);
        else
                free_unmap_vmap_area_addr(addr);
}
EXPORT_SYMBOL(vm_unmap_ram);

/**
 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
 * @pages: an array of pointers to the pages to be mapped
 * @count: number of pages
 * @node: prefer to allocate data structures on this node
 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
 *
 * Returns: a pointer to the address that has been mapped, or %NULL on failure
 */
void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
{
        unsigned long size = count << PAGE_SHIFT;
        unsigned long addr;
        void *mem;

        if (likely(count <= VMAP_MAX_ALLOC)) {
                mem = vb_alloc(size, GFP_KERNEL);
                if (IS_ERR(mem))
                        return NULL;
                addr = (unsigned long)mem;
        } else {
                struct vmap_area *va;
                va = alloc_vmap_area(size, PAGE_SIZE,
                                VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
                if (IS_ERR(va))
                        return NULL;

                addr = va->va_start;
                mem = (void *)addr;
        }
        if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
                vm_unmap_ram(mem, count);
                return NULL;
        }
        return mem;
}
EXPORT_SYMBOL(vm_map_ram);
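/*
 * Usage sketch (hypothetical caller; error handling trimmed): map two
 * already-allocated pages, use the linear window, then unmap with the
 * exact count that was mapped, as vm_unmap_ram() requires:
 *
 *      struct page *pages[2];
 *      void *va;
 *
 *      pages[0] = alloc_page(GFP_KERNEL);
 *      pages[1] = alloc_page(GFP_KERNEL);
 *      va = vm_map_ram(pages, 2, -1, PAGE_KERNEL);
 *      if (va) {
 *              memset(va, 0, 2 * PAGE_SIZE);
 *              vm_unmap_ram(va, 2);
 *      }
 *      __free_page(pages[0]);
 *      __free_page(pages[1]);
 */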
void __init vmalloc_init(void)
{
        struct vmap_area *va;
        struct vm_struct *tmp;
        int i;

        for_each_possible_cpu(i) {
                struct vmap_block_queue *vbq;

                vbq = &per_cpu(vmap_block_queue, i);
                spin_lock_init(&vbq->lock);
                INIT_LIST_HEAD(&vbq->free);
                INIT_LIST_HEAD(&vbq->dirty);
                vbq->nr_dirty = 0;
        }

        /* Import existing vmlist entries. */
        for (tmp = vmlist; tmp; tmp = tmp->next) {
                va = alloc_bootmem(sizeof(struct vmap_area));
                va->flags = tmp->flags | VM_VM_AREA;
                va->va_start = (unsigned long)tmp->addr;
                va->va_end = va->va_start + tmp->size;
                __insert_vmap_area(va);
        }
        vmap_initialized = true;
}

void unmap_kernel_range(unsigned long addr, unsigned long size)
{
        unsigned long end = addr + size;

        flush_cache_vunmap(addr, end);
        vunmap_page_range(addr, end);
        flush_tlb_kernel_range(addr, end);
}

int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
{
        unsigned long addr = (unsigned long)area->addr;
        unsigned long end = addr + area->size - PAGE_SIZE;
        int err;

        err = vmap_page_range(addr, end, prot, *pages);
        if (err > 0) {
                *pages += err;
                err = 0;
        }

        return err;
}
EXPORT_SYMBOL_GPL(map_vm_area);

/*** Old vmalloc interfaces ***/
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;

static struct vm_struct *__get_vm_area_node(unsigned long size,
                unsigned long flags, unsigned long start, unsigned long end,
                int node, gfp_t gfp_mask, void *caller)
{
        struct vmap_area *va;
        struct vm_struct *area;
        struct vm_struct *tmp, **p;
        unsigned long align = 1;

        BUG_ON(in_interrupt());
        if (flags & VM_IOREMAP) {
                int bit = fls(size);

                if (bit > IOREMAP_MAX_ORDER)
                        bit = IOREMAP_MAX_ORDER;
                else if (bit < PAGE_SHIFT)
                        bit = PAGE_SHIFT;

                align = 1ul << bit;
        }

        size = PAGE_ALIGN(size);
        if (unlikely(!size))
                return NULL;

        area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
        if (unlikely(!area))
                return NULL;

        /*
         * We always allocate a guard page.
         */
        size += PAGE_SIZE;

        va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
        if (IS_ERR(va)) {
                kfree(area);
                return NULL;
        }

        area->flags = flags;
        area->addr = (void *)va->va_start;
        area->size = size;
        area->pages = NULL;
        area->nr_pages = 0;
        area->phys_addr = 0;
        area->caller = caller;
        va->private = area;
        va->flags |= VM_VM_AREA;

        write_lock(&vmlist_lock);
        for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
                if (tmp->addr >= area->addr)
                        break;
        }
        area->next = *p;
        *p = area;
        write_unlock(&vmlist_lock);

        return area;
}

struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
                                unsigned long start, unsigned long end)
{
        return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
                                                __builtin_return_address(0));
}
EXPORT_SYMBOL_GPL(__get_vm_area);

struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
                                       unsigned long start, unsigned long end,
                                       void *caller)
{
        return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
                                  caller);
}

/**
 * get_vm_area - reserve a contiguous kernel virtual area
 * @size: size of the area
 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
 *
 * Search an area of @size in the kernel virtual mapping area,
 * and reserve it for our purposes. Returns the area descriptor
 * on success or %NULL on failure.
 */
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
        return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
                                -1, GFP_KERNEL, __builtin_return_address(0));
}

struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
                                void *caller)
{
        return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
                                                -1, GFP_KERNEL, caller);
}

struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
                                   int node, gfp_t gfp_mask)
{
        return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
                                  node, gfp_mask, __builtin_return_address(0));
}

static struct vm_struct *find_vm_area(const void *addr)
{
        struct vmap_area *va;

        va = find_vmap_area((unsigned long)addr);
        if (va && va->flags & VM_VM_AREA)
                return va->private;

        return NULL;
}

/**
 * remove_vm_area - find and remove a contiguous kernel virtual area
 * @addr: base address
 *
 * Search for the kernel VM area starting at @addr, and remove it.
 * This function returns the found VM area, but using it is NOT safe
 * on SMP machines, except for its size or flags.
 */
struct vm_struct *remove_vm_area(const void *addr)
{
        struct vmap_area *va;

        va = find_vmap_area((unsigned long)addr);
        if (va && va->flags & VM_VM_AREA) {
                struct vm_struct *vm = va->private;
                struct vm_struct *tmp, **p;

                vmap_debug_free_range(va->va_start, va->va_end);
                free_unmap_vmap_area(va);
                vm->size -= PAGE_SIZE;

                write_lock(&vmlist_lock);
                for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
                        ;
                *p = tmp->next;
                write_unlock(&vmlist_lock);

                return vm;
        }
        return NULL;
}

static void __vunmap(const void *addr, int deallocate_pages)
{
        struct vm_struct *area;

        if (!addr)
                return;

        if ((PAGE_SIZE-1) & (unsigned long)addr) {
                WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
                return;
        }

        area = remove_vm_area(addr);
        if (unlikely(!area)) {
                WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
                                addr);
                return;
        }

        debug_check_no_locks_freed(addr, area->size);
        debug_check_no_obj_freed(addr, area->size);

        if (deallocate_pages) {
                int i;

                for (i = 0; i < area->nr_pages; i++) {
                        struct page *page = area->pages[i];

                        BUG_ON(!page);
                        __free_page(page);
                }

                if (area->flags & VM_VPAGES)
                        vfree(area->pages);
                else
                        kfree(area->pages);
        }

        kfree(area);
        return;
}
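/*
 * Usage sketch for the old interfaces (hypothetical; ioremap-style code is
 * the typical caller). get_vm_area() only reserves kernel virtual address
 * space, including the implicit guard page; the caller then maps real pages
 * into it, e.g. via map_vm_area():
 *
 *      struct vm_struct *area = get_vm_area(len, VM_IOREMAP);
 *      if (!area)
 *              return NULL;
 *      ... establish page table entries covering area->addr ...
 *
 * Teardown normally goes through vunmap()/__vunmap() above, which call
 * remove_vm_area() and then free both the pages (if asked) and the
 * vm_struct itself.
 */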