/*
 * linux/mm/slab.c
 * Written by Mark Hemment, 1996/97.
 * (markhe@nextd.demon.co.uk)
 *
 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
 *
 * Major cleanup, different bufctl logic, per-cpu arrays
 *	(c) 2000 Manfred Spraul
 *
 * Cleanup, make the head arrays unconditional, preparation for NUMA
 *	(c) 2002 Manfred Spraul
 *
 * An implementation of the Slab Allocator as described in outline in;
 *	UNIX Internals: The New Frontiers by Uresh Vahalia
 *	Pub: Prentice Hall	ISBN 0-13-101908-2
 * or with a little more detail in;
 *	The Slab Allocator: An Object-Caching Kernel Memory Allocator
 *	Jeff Bonwick (Sun Microsystems).
 *	Presented at: USENIX Summer 1994 Technical Conference
 *
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then you must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise they come from empty slabs, or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array; most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *	are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
 *	and local interrupts are disabled so slab code is preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 *
 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
 * in 2000 - many ideas in the current implementation are derived from
 * his patch.
 *
 * Further notes from the original documentation:
 *
 * 11 April '97.  Started multi-threading - markhe
 *	The global cache-chain is protected by the mutex 'cache_chain_mutex'.
 *	The mutex is only needed when accessing/extending the cache-chain,
 *	which can never happen inside an interrupt (kmem_cache_create(),
 *	kmem_cache_shrink() and kmem_cache_reap()).
 *
 *	At present, each engine can be growing a cache.  This should be blocked.
 *
 * 15 March 2005. NUMA slab allocator.
 *	Shai Fultheim <shai@scalex86.org>.
 *	Shobhit Dayal <shobhit@calsoftinc.com>
 *	Alok N Kataria <alokk@calsoftinc.com>
 *	Christoph Lameter <christoph@lameter.com>
 *
 *	Modified the slab allocator to be node aware on NUMA systems.
 *	Each node has its own list of partial, free and full slabs.
 *	All object allocations for a node occur from node specific slab lists.
 */
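/*
 * Usage sketch (illustrative only; "struct foo", "foo_cache" and "foo_ctor"
 * are made-up names).  It shows the contract described above: the
 * constructor runs once per object when a new slab is populated, so every
 * object must be handed back to kmem_cache_free() in its constructed state.
 *
 *	static struct kmem_cache *foo_cache;
 *
 *	static void foo_ctor(void *obj)
 *	{
 *		struct foo *f = obj;
 *		spin_lock_init(&f->lock);
 *		INIT_LIST_HEAD(&f->list);
 *	}
 *
 *	foo_cache = kmem_cache_create("foo", sizeof(struct foo), 0,
 *				      SLAB_HWCACHE_ALIGN, foo_ctor);
 *	f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
 *	...
 *	kmem_cache_free(foo_cache, f);		(f still in constructed state)
 *	kmem_cache_destroy(foo_cache);		(no concurrent allocs allowed)
 */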
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/poison.h>
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/kallsyms.h>
#include <linux/cpu.h>
#include <linux/sysctl.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/nodemask.h>
#include <linux/mempolicy.h>
#include <linux/mutex.h>
#include <linux/fault-inject.h>
#include <linux/rtmutex.h>
#include <linux/reciprocal_div.h>
#include <linux/debugobjects.h>

#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>

/*
 * DEBUG	- 1 for kmem_cache_create() to honour SLAB_RED_ZONE & SLAB_POISON.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * STATS	- 1 to collect stats for /proc/slabinfo.
 *		  0 for faster, smaller code (especially in the critical paths).
 *
 * FORCED_DEBUG	- 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 */

#ifdef CONFIG_DEBUG_SLAB
#define	DEBUG		1
#define	STATS		1
#define	FORCED_DEBUG	1
#else
#define	DEBUG		0
#define	STATS		0
#define	FORCED_DEBUG	0
#endif

/* Shouldn't this be in a header file somewhere? */
#define	BYTES_PER_WORD		sizeof(void *)
#define	REDZONE_ALIGN		max(BYTES_PER_WORD, __alignof__(unsigned long long))

#ifndef ARCH_KMALLOC_MINALIGN
/*
 * Enforce a minimum alignment for the kmalloc caches.
 * Usually, the kmalloc caches are cache_line_size() aligned, except when
 * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
 * Some archs want to perform DMA into kmalloc caches and need a guaranteed
 * alignment larger than the alignment of a 64-bit integer.
 * ARCH_KMALLOC_MINALIGN allows that.
 * Note that increasing this value may disable some debug features.
 */
#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
#endif

#ifndef ARCH_SLAB_MINALIGN
/*
 * Enforce a minimum alignment for all caches.
 * Intended for archs that get misalignment faults even for BYTES_PER_WORD
 * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
 * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
 * some debug features.
 */
#define ARCH_SLAB_MINALIGN 0
#endif

#ifndef ARCH_KMALLOC_FLAGS
#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
#endif

/* Legal flag mask for kmem_cache_create(). */
#if DEBUG
# define CREATE_MASK	(SLAB_RED_ZONE | \
			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_STORE_USER | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS)
#else
# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
			 SLAB_CACHE_DMA | \
			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
			 SLAB_DEBUG_OBJECTS)
#endif

/*
 * kmem_bufctl_t:
 *
 * Bufctls are used for linking objs within a slab into a list of
 * linked offsets.
 *
 * This implementation relies on "struct page" for locating the cache &
 * slab an object belongs to.
 * This allows the bufctl structure to be small (one int), but limits
 * the number of objects a slab (not a cache) can contain when off-slab
 * bufctls are used.  The limit is the size of the largest general cache
 * that does not use off-slab slabs.
 * For 32bit archs with 4 kB pages, this is 56.
 * This is not serious, as it is only for large objects, when it is unwise
 * to have too many per slab.
 * Note: This limit can be raised by introducing a general cache whose size
 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 */
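/*
 * A simplified model of the bufctl scheme described above: the free objects
 * of a slab form a singly linked list of object indices.  The index array
 * lives right behind struct slab, and slabp->free holds the head of the
 * chain (the real helpers further down in slab.c are slightly more involved):
 *
 *	kmem_bufctl_t *bufctl = (kmem_bufctl_t *)(slabp + 1);
 *
 *	alloc:	objnr = slabp->free;
 *		slabp->free = bufctl[objnr];
 *		objp = slabp->s_mem + objnr * cachep->buffer_size;
 *
 *	free:	objnr = (objp - slabp->s_mem) / cachep->buffer_size;
 *		bufctl[objnr] = slabp->free;
 *		slabp->free = objnr;
 *
 * BUFCTL_END terminates the chain; the real code avoids the division by
 * using reciprocal_divide() with the precomputed reciprocal_buffer_size.
 */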
typedef unsigned int kmem_bufctl_t;
#define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
#define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)

/*
 * struct slab
 *
 * Manages the objs in a slab. Placed either at the beginning of mem allocated
 * for a slab, or allocated from a general cache.
 * Slabs are chained into three lists: fully used, partial, fully free slabs.
 */
struct slab {
	struct list_head list;
	unsigned long colouroff;
	void *s_mem;		/* including colour offset */
	unsigned int inuse;	/* num of objs active in slab */
	kmem_bufctl_t free;
	unsigned short nodeid;
};

/*
 * struct slab_rcu
 *
 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 * arrange for kmem_freepages to be called via RCU.  This is useful if
 * we need to approach a kernel structure obliquely, from its address
 * obtained without the usual locking.  We can lock the structure to
 * stabilize it and check it's still at the given address, only if we
 * can be sure that the memory has not been meanwhile reused for some
 * other kind of object (which our subsystem's lock might corrupt).
 *
 * rcu_read_lock before reading the address, then rcu_read_unlock after
 * taking the spinlock within the structure expected at that address.
 *
 * We assume struct slab_rcu can overlay struct slab when destroying.
 */
struct slab_rcu {
	struct rcu_head head;
	struct kmem_cache *cachep;
	void *addr;
};

/*
 * struct array_cache
 *
 * Purpose:
 * - LIFO ordering, to hand out cache-warm objects from _alloc
 * - reduce the number of linked list operations
 * - reduce spinlock operations
 *
 * The limit is stored in the per-cpu structure to reduce the data cache
 * footprint.
 */
struct array_cache {
	unsigned int avail;
	unsigned int limit;
	unsigned int batchcount;
	unsigned int touched;
	spinlock_t lock;
	void *entry[];	/*
			 * Must have this definition in here for the proper
			 * alignment of array_cache. Also simplifies accessing
			 * the entries.
			 */
};

/*
 * bootstrap: The caches do not work without cpuarrays anymore, but the
 * cpuarrays are allocated from the generic caches...
 */
#define BOOT_CPUCACHE_ENTRIES	1
struct arraycache_init {
	struct array_cache cache;
	void *entries[BOOT_CPUCACHE_ENTRIES];
};

/*
 * The slab lists for all objects.
 */
struct kmem_list3 {
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	spinlock_t list_lock;
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
};

/*
 * Need this for bootstrapping a per node allocator.
 */
#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
#define	CACHE_CACHE 0
#define	SIZE_AC MAX_NUMNODES
#define	SIZE_L3 (2 * MAX_NUMNODES)

static int drain_freelist(struct kmem_cache *cache,
			struct kmem_list3 *l3, int tofree);
static void free_block(struct kmem_cache *cachep, void **objpp, int len,
			int node);
static int enable_cpucache(struct kmem_cache *cachep);
static void cache_reap(struct work_struct *unused);

/*
 * This function must be completely optimized away if a constant is passed to
 * it.  Mostly the same as what is in linux/slab.h except it returns an index.
 */
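/*
 * Worked example of the CACHE() trick used below: <linux/kmalloc_sizes.h> is
 * just a list of CACHE(<size>) entries, so with the local CACHE() definition
 * the #include expands roughly to (the actual sizes depend on PAGE_SIZE and
 * the configuration):
 *
 *	if (size <= 32)  return i; else i++;
 *	if (size <= 64)  return i; else i++;
 *	if (size <= 128) return i; else i++;
 *	...
 *
 * For a compile-time constant size the cascade folds into a single constant
 * index.  Otherwise the call to __bad_size() survives and, since that
 * function is never defined, the build fails at link time.
 */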
static __always_inline int index_of(const size_t size)
{
	extern void __bad_size(void);

	if (__builtin_constant_p(size)) {
		int i = 0;

#define CACHE(x) \
	if (size <= x) \
		return i; \
	else \
		i++;
#include <linux/kmalloc_sizes.h>
#undef CACHE
		__bad_size();
	} else
		__bad_size();
	return 0;
}

static int slab_early_init = 1;

#define INDEX_AC index_of(sizeof(struct arraycache_init))
#define INDEX_L3 index_of(sizeof(struct kmem_list3))

static void kmem_list3_init(struct kmem_list3 *parent)
{
	INIT_LIST_HEAD(&parent->slabs_full);
	INIT_LIST_HEAD(&parent->slabs_partial);
	INIT_LIST_HEAD(&parent->slabs_free);
	parent->shared = NULL;
	parent->alien = NULL;
	parent->colour_next = 0;
	spin_lock_init(&parent->list_lock);
	parent->free_objects = 0;
	parent->free_touched = 0;
}

#define MAKE_LIST(cachep, listp, slab, nodeid)				\
	do {								\
		INIT_LIST_HEAD(listp);					\
		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
	} while (0)

#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
	do {								\
	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
	} while (0)

/*
 * struct kmem_cache
 *
 * manages a cache.
 */

struct kmem_cache {
/* 1) per-cpu data, touched during every alloc/free */
	struct array_cache *array[NR_CPUS];
/* 2) Cache tunables. Protected by cache_chain_mutex */
	unsigned int batchcount;
	unsigned int limit;
	unsigned int shared;

	unsigned int buffer_size;
	u32 reciprocal_buffer_size;
/* 3) touched by every alloc & free from the backend */

	unsigned int flags;		/* constant flags */
	unsigned int num;		/* # of objs per slab */

/* 4) cache_grow/shrink */
	/* order of pgs per slab (2^n) */
	unsigned int gfporder;

	/* force GFP flags, e.g. GFP_DMA */
	gfp_t gfpflags;

	size_t colour;			/* cache colouring range */
	unsigned int colour_off;	/* colour offset */
	struct kmem_cache *slabp_cache;
	unsigned int slab_size;
	unsigned int dflags;		/* dynamic flags */

	/* constructor func */
	void (*ctor)(void *obj);

/* 5) cache creation/removal */
	const char *name;
	struct list_head next;

/* 6) statistics */
#if STATS
	unsigned long num_active;
	unsigned long num_allocations;
	unsigned long high_mark;
	unsigned long grown;
	unsigned long reaped;
	unsigned long errors;
	unsigned long max_freeable;
	unsigned long node_allocs;
	unsigned long node_frees;
	unsigned long node_overflow;
	atomic_t allochit;
	atomic_t allocmiss;
	atomic_t freehit;
	atomic_t freemiss;
#endif
#if DEBUG
	/*
	 * If debugging is enabled, then the allocator can add additional
	 * fields and/or padding to every object.  buffer_size contains the
	 * total object size including these internal fields; the following
	 * two variables contain the offset to the user object and its size.
	 */
	int obj_offset;
	int obj_size;
#endif
	/*
	 * We put nodelists[] at the end of kmem_cache, because we want to size
	 * this array to nr_node_ids slots instead of MAX_NUMNODES
	 * (see kmem_cache_init()).
	 * We still use [MAX_NUMNODES] and not [1] or [0] because cache_cache
	 * is statically defined, so we reserve the max number of nodes.
	 */
	struct kmem_list3 *nodelists[MAX_NUMNODES];
	/*
	 * Do not add fields after nodelists[]
	 */
};

#define CFLGS_OFF_SLAB		(0x80000000UL)
#define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)

#define BATCHREFILL_LIMIT	16
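/*
 * Rough sketch of how the per-cpu array and the tunables above interact on
 * the fast paths (the real alloc/free paths come later in slab.c):
 *
 *	ac = cachep->array[smp_processor_id()];
 *
 *	alloc:	if (ac->avail)
 *			objp = ac->entry[--ac->avail];	LIFO, cache-warm
 *		else
 *			refill up to "batchcount" objects from the node lists
 *
 *	free:	if (ac->avail < ac->limit)
 *			ac->entry[ac->avail++] = objp;
 *		else
 *			flush "batchcount" entries back to the node lists,
 *			then store objp
 *
 * Both paths run with local interrupts disabled, so the per-cpu array itself
 * needs no locking.
 */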
/*
 * Optimization question: fewer reaps mean a lower probability of unnecessary
 * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
 */
#define REAPTIMEOUT_CPUC	(2*HZ)
#define REAPTIMEOUT_LIST3	(4*HZ)

#if STATS
#define	STATS_INC_ACTIVE(x)	((x)->num_active++)
#define	STATS_DEC_ACTIVE(x)	((x)->num_active--)
#define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
#define	STATS_INC_GROWN(x)	((x)->grown++)
#define	STATS_ADD_REAPED(x,y)	((x)->reaped += (y))
#define	STATS_SET_HIGH(x)						\
	do {								\
		if ((x)->num_active > (x)->high_mark)			\