cpuset.c
/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <asm/atomic.h>
#include <linux/mutex.h>

#define CPUSET_SUPER_MAGIC	0x27e0eb

/*
 * Tracks how many cpusets are currently defined in the system.
 * When there is only one cpuset (the root cpuset) we can
 * short circuit some hooks.
 */
int number_of_cpusets __read_mostly;

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

	/*
	 * Count is atomic so can incr (fork) or decr (exit) without a lock.
	 */
	atomic_t count;			/* count tasks using this cpuset */

	/*
	 * We link our 'sibling' struct into our parent's 'children'.
	 * Our children link their 'sibling' into our 'children'.
	 */
	struct list_head sibling;	/* my parent's children */
	struct list_head children;	/* my children */

	struct cpuset *parent;		/* my parent */
	struct dentry *dentry;		/* cpuset fs entry */

	/*
	 * Copy of global cpuset_mems_generation as of the most
	 * recent time this cpuset changed its mems_allowed.
	 */
	int mems_generation;

	struct fmeter fmeter;		/* memory_pressure filter */
};

/* bits in struct cpuset flags field */
typedef enum {
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEMORY_MIGRATE,
	CS_REMOVED,
	CS_NOTIFY_ON_RELEASE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_removed(const struct cpuset *cs)
{
	return test_bit(CS_REMOVED, &cs->flags);
}

static inline int notify_on_release(const struct cpuset *cs)
{
	return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

/*
 * Increment this integer every time any cpuset changes its
 * mems_allowed value.  Users of cpusets can track this generation
 * number, and avoid having to lock and reload mems_allowed unless
 * the cpuset they're using changes generation.
 *
 * A single, global generation is needed because attach_task() could
 * reattach a task to a different cpuset, which must not have its
 * generation numbers aliased with those of that task's previous cpuset.
 *
 * Generations are needed for mems_allowed because one task cannot
 * modify another's memory placement.  So we must enable every task,
 * on every visit to __alloc_pages(), to efficiently check whether
 * its current->cpuset->mems_allowed has changed, requiring an update
 * of its current->mems_allowed.
 *
 * Since cpuset_mems_generation is guarded by manage_mutex,
 * there is no need to mark it atomic.
 */
static int cpuset_mems_generation;
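/*
 * Illustrative sketch, not part of cpuset.c: the generation-number idiom
 * described above, reduced to its essentials.  A reader keeps the
 * generation it last saw and only takes the lock to re-copy state when
 * the global counter has moved on.  All example_* names are hypothetical.
 */
#if 0	/* example only, compiled out */
static DEFINE_MUTEX(example_mutex);
static int example_gen;		/* bumped under example_mutex on each change */
static int example_state;	/* the state the generation count guards */

struct example_cache {
	int cached_gen;		/* generation at which the copy was taken */
	int cached_state;	/* private copy of example_state */
};

static void example_refresh(struct example_cache *c)
{
	if (c->cached_gen == example_gen)
		return;			/* fast path: nothing changed, no lock */
	mutex_lock(&example_mutex);	/* slow path: re-copy under the lock */
	c->cached_state = example_state;
	c->cached_gen = example_gen;
	mutex_unlock(&example_mutex);
}
#endif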
static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
	.cpus_allowed = CPU_MASK_ALL,
	.mems_allowed = NODE_MASK_ALL,
	.count = ATOMIC_INIT(0),
	.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
	.children = LIST_HEAD_INIT(top_cpuset.children),
};

static struct vfsmount *cpuset_mount;
static struct super_block *cpuset_sb;

/*
 * We have two global cpuset mutexes below.  They can nest.
 * It is ok to first take manage_mutex, then nest callback_mutex.  We also
 * require taking task_lock() when dereferencing a task's cpuset pointer.
 * See "The task_lock() exception", at the end of this comment.
 *
 * A task must hold both mutexes to modify cpusets.  If a task
 * holds manage_mutex, then it blocks others wanting that mutex,
 * ensuring that it is the only task able to also acquire callback_mutex
 * and be able to modify cpusets.  It can perform various checks on
 * the cpuset structure first, knowing nothing will change.  It can
 * also allocate memory while just holding manage_mutex.  While it is
 * performing these checks, various callback routines can briefly
 * acquire callback_mutex to query cpusets.  Once it is ready to make
 * the changes, it takes callback_mutex, blocking everyone else.
 *
 * Calls to the kernel memory allocator cannot be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_mutex, then it has read-only
 * access to cpusets.
 *
 * The task_struct fields mems_allowed and mems_generation may only
 * be accessed in the context of that task, so they require no locks.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding manage_mutex or callback_mutex can't rely
 * on the count field not changing.  However, if the count goes to
 * zero, then only attach_task(), which holds both mutexes, can
 * increment it again.  A count of zero means that no tasks are
 * currently attached, so there is no way a task attached to that
 * cpuset can fork (the other way to increment the count).
 * So code holding manage_mutex or callback_mutex can safely assume that
 * if the count is zero, it will stay zero.  Similarly, if a task
 * holds manage_mutex or callback_mutex on a cpuset with zero count, it
 * knows that the cpuset won't be removed, as cpuset_rmdir() needs
 * both of those mutexes.
 *
 * The cpuset_common_file_write() handler for operations that modify
 * the cpuset hierarchy holds manage_mutex across the entire operation,
 * single-threading all such cpuset modifications across the system.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * The fork and exit callbacks, cpuset_fork() and cpuset_exit(), don't
 * (usually) take either mutex.  These are the two most performance
 * critical pieces of code here.  The exception occurs on cpuset_exit(),
 * when a task in a notify_on_release cpuset exits.  Then manage_mutex
 * is taken, and if the cpuset count is zero, a usermode call is made
 * to /sbin/cpuset_release_agent with the name of the cpuset (path
 * relative to the root of the cpuset file system) as the argument.
 *
 * A cpuset can only be deleted if both its 'count' of using tasks
 * is zero, and its list of 'children' cpusets is empty.  Since all
 * tasks in the system use _some_ cpuset, and since there is always at
 * least one task in the system (init), top_cpuset always has either
 * child cpusets or using tasks (or both).  So we don't need a special
 * hack to ensure that top_cpuset cannot be deleted.
 *
 * The above "Tale of Two Mutexes" would be complete, but for:
 *
 *	The task_lock() exception
 *
 * The need for this exception arises from the action of attach_task(),
 * which overwrites one task's cpuset pointer with another.  It does
 * so using both mutexes, however there are several performance
 * critical places that need to reference task->cpuset without the
 * expense of grabbing a system global mutex.  Therefore except as
 * noted below, when dereferencing or, as in attach_task(), modifying
 * a task's cpuset pointer we use task_lock(), which acts on a spinlock
 * (task->alloc_lock) already in the task_struct routinely used for
 * such matters.
 *
 * P.S.  One more locking exception.  RCU is used to guard the
 * update of a task's cpuset pointer by attach_task() and the
 * access of task->cpuset->mems_generation via that pointer in
 * the routine cpuset_update_task_memory_state().
 */

static DEFINE_MUTEX(manage_mutex);
static DEFINE_MUTEX(callback_mutex);
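/*
 * Illustrative sketch, not part of cpuset.c: the nesting order that the
 * comment above prescribes.  A writer takes manage_mutex first, may
 * allocate while holding only that, and nests callback_mutex around the
 * actual update; readers take callback_mutex alone for short sections.
 * example_modify_cpuset() is a hypothetical name.
 */
#if 0	/* example only, compiled out */
static void example_modify_cpuset(void)
{
	mutex_lock(&manage_mutex);	/* excludes all other modifiers */
	/* ... validate the request, allocate any needed memory ... */
	mutex_lock(&callback_mutex);	/* briefly excludes readers too */
	/* ... apply the change to the cpuset hierarchy ... */
	mutex_unlock(&callback_mutex);
	mutex_unlock(&manage_mutex);
}
#endif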
/*
 * A couple of forward declarations required, due to cyclic reference loop:
 * cpuset_mkdir -> cpuset_create -> cpuset_populate_dir -> cpuset_add_file
 * -> cpuset_create_file -> cpuset_dir_inode_operations -> cpuset_mkdir.
 */

static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode);
static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry);

static struct backing_dev_info cpuset_backing_dev_info = {
	.ra_pages = 0,		/* No readahead */
	.capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
};

static struct inode *cpuset_new_inode(mode_t mode)
{
	struct inode *inode = new_inode(cpuset_sb);

	if (inode) {
		inode->i_mode = mode;
		inode->i_uid = current->fsuid;
		inode->i_gid = current->fsgid;
		inode->i_blocks = 0;
		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
		inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
	}
	return inode;
}

static void cpuset_diput(struct dentry *dentry, struct inode *inode)
{
	/* is dentry a directory? if so, kfree() associated cpuset */
	if (S_ISDIR(inode->i_mode)) {
		struct cpuset *cs = dentry->d_fsdata;
		BUG_ON(!(is_removed(cs)));
		kfree(cs);
	}
	iput(inode);
}

static struct dentry_operations cpuset_dops = {
	.d_iput = cpuset_diput,
};

static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
{
	struct dentry *d = lookup_one_len(name, parent, strlen(name));

	if (!IS_ERR(d))
		d->d_op = &cpuset_dops;
	return d;
}

static void remove_dir(struct dentry *d)
{
	struct dentry *parent = dget(d->d_parent);

	d_delete(d);
	simple_rmdir(parent->d_inode, d);
	dput(parent);
}

/*
 * NOTE: the dentry must have been dget()'ed
 */
static void cpuset_d_remove_dir(struct dentry *dentry)
{
	struct list_head *node;

	spin_lock(&dcache_lock);
	node = dentry->d_subdirs.next;
	while (node != &dentry->d_subdirs) {
		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
		list_del_init(node);
		if (d->d_inode) {
			d = dget_locked(d);
			spin_unlock(&dcache_lock);
			d_delete(d);
			simple_unlink(dentry->d_inode, d);
			dput(d);
			spin_lock(&dcache_lock);
		}
		node = dentry->d_subdirs.next;
	}
	list_del_init(&dentry->d_u.d_child);
	spin_unlock(&dcache_lock);
	remove_dir(dentry);
}

static struct super_operations cpuset_ops = {
	.statfs = simple_statfs,
	.drop_inode = generic_delete_inode,
};

static int cpuset_fill_super(struct super_block *sb, void *unused_data,
							int unused_silent)
{
	struct inode *inode;
	struct dentry *root;

	sb->s_blocksize = PAGE_CACHE_SIZE;
	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
	sb->s_magic = CPUSET_SUPER_MAGIC;
	sb->s_op = &cpuset_ops;
	cpuset_sb = sb;

	inode = cpuset_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR);
	if (inode) {
		inode->i_op = &simple_dir_inode_operations;
		inode->i_fop = &simple_dir_operations;
		/* directories start off with i_nlink == 2 (for "." entry) */
		inc_nlink(inode);
	} else {
		return -ENOMEM;
	}

	root = d_alloc_root(inode);
	if (!root) {
		iput(inode);
		return -ENOMEM;
	}
	sb->s_root = root;
	return 0;
}

static int cpuset_get_sb(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data, struct vfsmount *mnt)
{
	return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.get_sb = cpuset_get_sb,
	.kill_sb = kill_litter_super,
};
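/*
 * Illustrative usage, not part of cpuset.c: once cpuset_fs_type is
 * registered, userspace reaches the hierarchy by mounting it, e.g.
 *
 *	mount -t cpuset cpuset /dev/cpuset
 *
 * after which mkdir/rmdir under the mount point create and remove
 * cpusets via the cpuset_mkdir()/cpuset_rmdir() hooks declared above.
 */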
/* struct cftype:
 *
 * The files in the cpuset filesystem mostly have a very simple read/write
 * handling; some common function will take care of it.  Nevertheless some
 * cases (e.g. reading the tasks file) are special, and therefore this
 * structure is defined for every kind of file.
 *
 * When reading/writing to a file:
 *	- the cpuset to use is in file->f_path.dentry->d_parent->d_fsdata
 *	- the 'cftype' of the file is in file->f_path.dentry->d_fsdata
 */

struct cftype {
	char *name;
	int private;
	int (*open) (struct inode *inode, struct file *file);
	ssize_t (*read) (struct file *file, char __user *buf, size_t nbytes,
							loff_t *ppos);
	int (*write) (struct file *file, const char __user *buf, size_t nbytes,
							loff_t *ppos);
	int (*release) (struct inode *inode, struct file *file);
};

static inline struct cpuset *__d_cs(struct dentry *dentry)
{
	return dentry->d_fsdata;
}

static inline struct cftype *__d_cft(struct dentry *dentry)
{
	return dentry->d_fsdata;
}
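/*
 * Illustrative sketch, not part of cpuset.c: what a table entry for one
 * control file could look like using struct cftype above.  The names
 * example_read and cft_example are hypothetical; the 'private' field is
 * a tag that shared handlers can use to tell the files apart.
 */
#if 0	/* example only, compiled out */
static ssize_t example_read(struct file *file, char __user *buf,
				size_t nbytes, loff_t *ppos)
{
	return 0;		/* would format this cpuset's data for userspace */
}

static struct cftype cft_example = {
	.name = "example",	/* file name within each cpuset directory */
	.private = 0,		/* tag telling a shared handler which file */
	.read = example_read,
};
#endif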
/*
 * Call with manage_mutex held.  Writes path of cpuset into buf.
 * Returns 0 on success, -errno on error.
 */
static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
{
	char *start;