cpuset.c
/*
 *  kernel/cpuset.c
 *
 *  Processor and Memory placement constraints for sets of tasks.
 *
 *  Copyright (C) 2003 BULL SA.
 *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
 *  Copyright (C) 2006 Google, Inc
 *
 *  Portions derived from Patrick Mochel's sysfs code.
 *  sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 *  2003-10-10 Written by Simon Derr.
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
 *  2008 Rework of the scheduler domains and CPU hotplug handling
 *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
 *  distribution for more details.
 */

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/cpuset.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/security.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/time.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>

#include <asm/uaccess.h>
#include <asm/atomic.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>

/*
 * Tracks how many cpusets are currently defined in system.
 * When there is only one cpuset (the root cpuset) we can
 * short circuit some hooks.
 */
int number_of_cpusets __read_mostly;

/* Forward declare cgroup structures */
struct cgroup_subsys cpuset_subsys;
struct cpuset;

/* See "Frequency meter" comments, below. */

struct fmeter {
	int cnt;		/* unprocessed events count */
	int val;		/* most recent output value */
	time_t time;		/* clock (secs) when val computed */
	spinlock_t lock;	/* guards read or write of above */
};

struct cpuset {
	struct cgroup_subsys_state css;

	unsigned long flags;		/* "unsigned long" so bitops work */
	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */

	struct cpuset *parent;		/* my parent */

	/*
	 * Copy of global cpuset_mems_generation as of the most
	 * recent time this cpuset changed its mems_allowed.
	 */
	int mems_generation;

	struct fmeter fmeter;		/* memory_pressure filter */

	/* partition number for rebuild_sched_domains() */
	int pn;

	/* for custom sched domain */
	int relax_domain_level;

	/* used for walking a cpuset hierarchy */
	struct list_head stack_list;
};

/* Retrieve the cpuset for a cgroup */
static inline struct cpuset *cgroup_cs(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont, cpuset_subsys_id),
			    struct cpuset, css);
}

/* Retrieve the cpuset for a task */
static inline struct cpuset *task_cs(struct task_struct *task)
{
	return container_of(task_subsys_state(task, cpuset_subsys_id),
			    struct cpuset, css);
}

struct cpuset_hotplug_scanner {
	struct cgroup_scanner scan;
	struct cgroup *to;
};

/* bits in struct cpuset flags field */
typedef enum {
	CS_CPU_EXCLUSIVE,
	CS_MEM_EXCLUSIVE,
	CS_MEM_HARDWALL,
	CS_MEMORY_MIGRATE,
	CS_SCHED_LOAD_BALANCE,
	CS_SPREAD_PAGE,
	CS_SPREAD_SLAB,
} cpuset_flagbits_t;

/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_exclusive(const struct cpuset *cs)
{
	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
}

static inline int is_mem_hardwall(const struct cpuset *cs)
{
	return test_bit(CS_MEM_HARDWALL, &cs->flags);
}

static inline int is_sched_load_balance(const struct cpuset *cs)
{
	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}

static inline int is_memory_migrate(const struct cpuset *cs)
{
	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
}

static inline int is_spread_page(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_PAGE, &cs->flags);
}

static inline int is_spread_slab(const struct cpuset *cs)
{
	return test_bit(CS_SPREAD_SLAB, &cs->flags);
}

/*
 * Increment this integer every time any cpuset changes its
 * mems_allowed value.  Users of cpusets can track this generation
 * number, and avoid having to lock and reload mems_allowed unless
 * the cpuset they're using changes generation.
 *
 * A single, global generation is needed because cpuset_attach_task() could
 * reattach a task to a different cpuset, which must not have its
 * generation numbers aliased with those of that task's previous cpuset.
 *
 * Generations are needed for mems_allowed because one task cannot
 * modify another's memory placement.  So we must enable every task,
 * on every visit to __alloc_pages(), to efficiently check whether
 * its current->cpuset->mems_allowed has changed, requiring an update
 * of its current->mems_allowed.
 *
 * Since writes to cpuset_mems_generation are guarded by the cgroup lock
 * there is no need to mark it atomic.
 */
static int cpuset_mems_generation;

static struct cpuset top_cpuset = {
	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
	.cpus_allowed = CPU_MASK_ALL,
	.mems_allowed = NODE_MASK_ALL,
};

/*
 * There are two global mutexes guarding cpuset structures.  The first
 * is the main control groups cgroup_mutex, accessed via
 * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific
 * callback_mutex, below.  They can nest.  It is ok to first take
 * cgroup_mutex, then nest callback_mutex.  We also require taking
 * task_lock() when dereferencing a task's cpuset pointer.  See "The
 * task_lock() exception", at the end of this comment.
 *
 * A task must hold both mutexes to modify cpusets.  If a task
 * holds cgroup_mutex, then it blocks others wanting that mutex,
 * ensuring that it is the only task able to also acquire callback_mutex
 * and be able to modify cpusets.  It can perform various checks on
 * the cpuset structure first, knowing nothing will change.
 * It can
 * also allocate memory while just holding cgroup_mutex.  While it is
 * performing these checks, various callback routines can briefly
 * acquire callback_mutex to query cpusets.  Once it is ready to make
 * the changes, it takes callback_mutex, blocking everyone else.
 *
 * Calls to the kernel memory allocator can not be made while holding
 * callback_mutex, as that would risk double tripping on callback_mutex
 * from one of the callbacks into the cpuset code from within
 * __alloc_pages().
 *
 * If a task is only holding callback_mutex, then it has read-only
 * access to cpusets.
 *
 * The task_struct fields mems_allowed and mems_generation may only
 * be accessed in the context of that task, so require no locks.
 *
 * The cpuset_common_file_read() handlers only hold callback_mutex across
 * small pieces of code, such as when reading out possibly multi-word
 * cpumasks and nodemasks.
 *
 * Accessing a task's cpuset should be done in accordance with the
 * guidelines for accessing subsystem state in kernel/cgroup.c
 */

static DEFINE_MUTEX(callback_mutex);

/*
 * This is ugly, but preserves the userspace API for existing cpuset
 * users.  If someone tries to mount the "cpuset" filesystem, we
 * silently switch it to mount "cgroup" instead
 */
static int cpuset_get_sb(struct file_system_type *fs_type,
			 int flags, const char *unused_dev_name,
			 void *data, struct vfsmount *mnt)
{
	struct file_system_type *cgroup_fs = get_fs_type("cgroup");
	int ret = -ENODEV;

	if (cgroup_fs) {
		char mountopts[] =
			"cpuset,noprefix,"
			"release_agent=/sbin/cpuset_release_agent";
		ret = cgroup_fs->get_sb(cgroup_fs, flags,
					unused_dev_name, mountopts, mnt);
		put_filesystem(cgroup_fs);
	}
	return ret;
}

static struct file_system_type cpuset_fs_type = {
	.name = "cpuset",
	.get_sb = cpuset_get_sb,
};

/*
 * Return in *pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  If we get
 * all the way to the top and still haven't found any online cpus,
 * return cpu_online_map.  Or if passed a NULL cs from an exit'ing
 * task, return cpu_online_map.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_map.
 *
 * Call with callback_mutex held.
 */

static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
{
	while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
		cs = cs->parent;
	if (cs)
		cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
	else
		*pmask = cpu_online_map;
	BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online, with memory.  If none are online with memory, walk
 * up the cpuset hierarchy until we find one that does have some
 * online mems.  If we get all the way to the top and still haven't
 * found any online mems, return node_states[N_HIGH_MEMORY].
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_HIGH_MEMORY].
 *
 * Call with callback_mutex held.
 */

static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	while (cs && !nodes_intersects(cs->mems_allowed,
					node_states[N_HIGH_MEMORY]))
		cs = cs->parent;
	if (cs)
		nodes_and(*pmask, cs->mems_allowed,
				node_states[N_HIGH_MEMORY]);
	else
		*pmask = node_states[N_HIGH_MEMORY];
	BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY]));
}

/**
 * cpuset_update_task_memory_state - update task memory placement
 *
 * If the current task's cpuset's mems_allowed changed behind our
 * backs, update current->mems_allowed, mems_generation and task NUMA
 * mempolicy to the new value.
 *
 * Task mempolicy is updated by rebinding it relative to the
 * current->cpuset if a task has its memory placement changed.
 * Do not call this routine if in_interrupt().
 *
 * Call without callback_mutex or task_lock() held.  May be
 * called with or without cgroup_mutex held.  Thanks in part to
 * 'the_top_cpuset_hack', the task's cpuset pointer will never
 * be NULL.  This routine also might acquire callback_mutex during
 * call.
 *
 * Reading current->cpuset->mems_generation doesn't need task_lock
 * to guard the current->cpuset dereference, because it is guarded
 * from concurrent freeing of current->cpuset using RCU.
 *
 * The rcu_dereference() is technically probably not needed,
 * as I don't actually mind if I see a new cpuset pointer but
 * an old value of mems_generation.  However this really only
 * matters on alpha systems using cpusets heavily.  If I dropped
 * that rcu_dereference(), it would save them a memory barrier.
 * For all other arch's, rcu_dereference is a no-op anyway, and for
 * alpha systems not using cpusets, another planned optimization,
 * avoiding the rcu critical section for tasks in the root cpuset
 * which is statically allocated, so can't vanish, will make this
 * irrelevant.  Better to use RCU as intended, than to engage in
 * some cute trick to save a memory barrier that is impossible to
 * test, for alpha systems using cpusets heavily, which might not
 * even exist.
 *
 * This routine is needed to update the per-task mems_allowed data,
 * within the task's context, when it is trying to allocate memory
 * (in various mm/mempolicy.c routines) and notices that some other
 * task has been modifying its cpuset.
 */

void cpuset_update_task_memory_state(void)
{
	int my_cpusets_mem_gen;
	struct task_struct *tsk = current;
	struct cpuset *cs;

	if (task_cs(tsk) == &top_cpuset) {
		/* Don't need rcu for top_cpuset.  It's never freed. */
		my_cpusets_mem_gen = top_cpuset.mems_generation;
	} else {
		rcu_read_lock();
		my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
		rcu_read_unlock();
	}

	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
		mutex_lock(&callback_mutex);
		task_lock(tsk);
		cs = task_cs(tsk); /* Maybe changed when task not locked */
		guarantee_online_mems(cs, &tsk->mems_allowed);
		tsk->cpuset_mems_generation = cs->mems_generation;
		if (is_spread_page(cs))
			tsk->flags |= PF_SPREAD_PAGE;
		else
			tsk->flags &= ~PF_SPREAD_PAGE;
		if (is_spread_slab(cs))
			tsk->flags |= PF_SPREAD_SLAB;
		else
			tsk->flags &= ~PF_SPREAD_SLAB;
		task_unlock(tsk);
		mutex_unlock(&callback_mutex);
		mpol_rebind_task(tsk, &tsk->mems_allowed);
	}
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cgroup_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 *		       follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * cgroup_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */

static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cgroup *cont;
	struct cpuset *c, *par;

	/* Each of our child cpusets must be a subset of us */
	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
		if (!is_cpuset_subset(cgroup_cs(cont), trial))
			return -EBUSY;
	}

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		return 0;

	par = cur->parent;

	/* We must be a subset of our parent cpuset */
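/*
 * Standalone, user-space sketch (not part of cpuset.c): the generation-
 * number pattern that the cpuset_mems_generation comment above describes
 * for mems_allowed.  A reader caches the generation it last saw and only
 * takes the lock to re-copy the mask when the global generation has moved
 * on.  All names here (placement_gen, toy_task, refresh_if_stale, ...) are
 * illustrative, not kernel APIs, and this single-threaded demo omits the
 * RCU details discussed above.
 */
#include <assert.h>
#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t placement_lock = PTHREAD_MUTEX_INITIALIZER;
static int placement_gen = 1;			/* bumped on every change */
static uint64_t placement_mask = 0x0f;		/* currently allowed nodes */

struct toy_task {
	int cached_gen;				/* generation last copied */
	uint64_t cached_mask;			/* private copy of the mask */
};

/* Fast path on every "allocation": lock and re-copy only when stale. */
static void refresh_if_stale(struct toy_task *t)
{
	if (t->cached_gen == placement_gen)
		return;
	pthread_mutex_lock(&placement_lock);
	t->cached_mask = placement_mask;
	t->cached_gen = placement_gen;
	pthread_mutex_unlock(&placement_lock);
}

/* Writer side: change the mask, then advance the generation. */
static void change_placement(uint64_t new_mask)
{
	pthread_mutex_lock(&placement_lock);
	placement_mask = new_mask;
	placement_gen++;
	pthread_mutex_unlock(&placement_lock);
}

int main(void)
{
	struct toy_task t = { 0, 0 };

	refresh_if_stale(&t);
	assert(t.cached_mask == 0x0f);

	change_placement(0x03);		/* placement changes elsewhere */
	refresh_if_stale(&t);		/* stale generation forces a re-copy */
	assert(t.cached_mask == 0x03);
	return 0;
}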
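/*
 * Standalone, user-space sketch (illustrative only) of the walk-up-the-
 * hierarchy fallback used by guarantee_online_cpus()/guarantee_online_mems()
 * above: climb towards the root until some ancestor intersects the online
 * mask, otherwise fall back to the full online mask.  toy_node and
 * toy_guarantee_online are hypothetical names; plain bitmasks stand in
 * for cpumask_t/nodemask_t.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

struct toy_node {
	uint64_t allowed;
	struct toy_node *parent;
};

static uint64_t toy_guarantee_online(const struct toy_node *n, uint64_t online)
{
	while (n && (n->allowed & online) == 0)
		n = n->parent;
	return n ? (n->allowed & online) : online;
}

int main(void)
{
	struct toy_node root  = { .allowed = 0xff, .parent = NULL };
	struct toy_node child = { .allowed = 0x30, .parent = &root };

	/* CPUs 4-5 offline: the child's mask is empty, so the root's is used. */
	assert(toy_guarantee_online(&child, 0x0f) == 0x0f);
	/* CPU 4 online: the intersection with the child's own mask is used. */
	assert(toy_guarantee_online(&child, 0x1f) == 0x10);
	return 0;
}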
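/*
 * Standalone, user-space sketch (not part of cpuset.c) of the structural
 * rule that is_cpuset_subset() and validate_change() enforce above: masks
 * must be contained, and a cpuset may only be exclusive if the cpuset it is
 * compared against is exclusive too (hence the integer "<=" on the 0/1 flag
 * values).  toy_cpuset and toy_is_subset are hypothetical names modelling
 * cpus/mems as plain bitmasks.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_cpuset {
	uint64_t cpus;		/* stands in for cpus_allowed */
	uint64_t mems;		/* stands in for mems_allowed */
	bool cpu_exclusive;
	bool mem_exclusive;
};

/* p is a subset of q: masks contained, exclusive flags only set if q's are */
static bool toy_is_subset(const struct toy_cpuset *p, const struct toy_cpuset *q)
{
	return (p->cpus & ~q->cpus) == 0 &&
	       (p->mems & ~q->mems) == 0 &&
	       p->cpu_exclusive <= q->cpu_exclusive &&
	       p->mem_exclusive <= q->mem_exclusive;
}

int main(void)
{
	struct toy_cpuset parent = { .cpus = 0x0f, .mems = 0x3 };
	struct toy_cpuset child  = { .cpus = 0x03, .mems = 0x1 };

	assert(toy_is_subset(&child, &parent));

	/* A child may not claim cpu_exclusive when its parent does not. */
	child.cpu_exclusive = true;
	assert(!toy_is_subset(&child, &parent));
	return 0;
}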