cpuset.c
		if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
			return -EINVAL;
	}
	retval = validate_change(cs, &trialcs);
	if (retval < 0)
		return retval;

	/* Nothing to do if the cpus didn't change */
	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
		return 0;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	is_load_balanced = is_sched_load_balance(&trialcs);

	mutex_lock(&callback_mutex);
	cs->cpus_allowed = trialcs.cpus_allowed;
	mutex_unlock(&callback_mutex);

	/*
	 * Scan tasks in the cpuset, and update the cpumasks of any
	 * that need an update.
	 */
	update_tasks_cpumask(cs, &heap);

	heap_free(&heap);

	if (is_load_balanced)
		async_rebuild_sched_domains();
	return 0;
}

/*
 * cpuset_migrate_mm
 *
 * Migrate memory region from one set of nodes to another.
 *
 * Temporarily set the task's mems_allowed to the target nodes of the
 * migration, so that the migration code can allocate pages on these nodes.
 *
 * Call holding cgroup_mutex, so current's cpuset won't change
 * during this call, as manage_mutex holds off any cpuset_attach()
 * calls.  Therefore we don't need to take task_lock around the
 * call to guarantee_online_mems(), as we know no one is changing
 * our task's cpuset.
 *
 * Hold callback_mutex around the two modifications of our task's
 * mems_allowed to synchronize with cpuset_mems_allowed().
 *
 * While the mm_struct we are migrating is typically from some
 * other task, the task_struct mems_allowed that we are hacking
 * is for our current task, which must allocate new pages for that
 * migrating memory region.
 *
 * We call cpuset_update_task_memory_state() before hacking
 * our task's mems_allowed, so that we are assured of being in
 * sync with our task's cpuset, and in particular, callbacks to
 * cpuset_update_task_memory_state() from nested page allocations
 * won't see any mismatch of our cpuset and task mems_generation
 * values, so won't overwrite our hacked task's mems_allowed
 * nodemask.
 */

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	cpuset_update_task_memory_state();

	mutex_lock(&callback_mutex);
	tsk->mems_allowed = *to;
	mutex_unlock(&callback_mutex);

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	mutex_lock(&callback_mutex);
	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
	mutex_unlock(&callback_mutex);
}

static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 * @oldmem: old mems_allowed of cpuset cs
 *
 * Called with cgroup_mutex held
 * Return 0 if successful, -errno if not.
 */
static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
{
	struct task_struct *p;
	struct mm_struct **mmarray;
	int i, n, ntasks;
	int migrate;
	int fudge;
	struct cgroup_iter it;
	int retval;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	fudge = 10;				/* spare mmarray[] slots */
	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
	retval = -ENOMEM;

	/*
	 * Allocate mmarray[] to hold mm reference for each task
	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
	 * few more lines of code, we can retry until we get a big
	 * enough mmarray[] w/o using GFP_ATOMIC.
	 */
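	/*
	 * Strategy for the loop below: guess the task count, pad it with
	 * 'fudge', allocate, then re-check the count under tasklist_lock.
	 * If the cpuset grew past our guess in the meantime, drop the
	 * lock, free the too-small array and try again with the larger
	 * estimate.
	 */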
	while (1) {
		ntasks = cgroup_task_count(cs->css.cgroup);	/* guess */
		ntasks += fudge;
		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
		if (!mmarray)
			goto done;
		read_lock(&tasklist_lock);		/* block fork */
		if (cgroup_task_count(cs->css.cgroup) <= ntasks)
			break;				/* got enough */
		read_unlock(&tasklist_lock);		/* try again */
		kfree(mmarray);
	}

	n = 0;

	/* Load up mmarray[] with mm reference for each task in cpuset. */
	cgroup_iter_start(cs->css.cgroup, &it);
	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
		struct mm_struct *mm;

		if (n >= ntasks) {
			printk(KERN_WARNING
				"Cpuset mempolicy rebind incomplete.\n");
			break;
		}
		mm = get_task_mm(p);
		if (!mm)
			continue;
		mmarray[n++] = mm;
	}
	cgroup_iter_end(cs->css.cgroup, &it);
	read_unlock(&tasklist_lock);

	/*
	 * Now that we've dropped the tasklist spinlock, we can
	 * rebind the vma mempolicies of each mm in mmarray[] to their
	 * new cpuset, and release that mm.  The mpol_rebind_mm()
	 * call takes mmap_sem, which we couldn't take while holding
	 * tasklist_lock.  Forks can happen again now - the mpol_dup()
	 * cpuset_being_rebound check will catch such forks, and rebind
	 * their vma mempolicies too.  Because we still hold the global
	 * cgroup_mutex, we know that no other rebind effort will
	 * be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	migrate = is_memory_migrate(cs);
	for (i = 0; i < n; i++) {
		struct mm_struct *mm = mmarray[i];

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
		mmput(mm);
	}

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	kfree(mmarray);
	cpuset_being_rebound = NULL;
	retval = 0;
done:
	return retval;
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpuset's mems_allowed and mems_generation, and for each
 * task in the cpuset, rebind any vma mempolicies and if
 * the cpuset is marked 'memory_migrate', migrate the tasks'
 * pages to the new memory.
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, const char *buf)
{
	struct cpuset trialcs;
	nodemask_t oldmem;
	int retval;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset)
		return -EACCES;

	trialcs = *cs;

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs.mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs.mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs.mems_allowed,
				node_states[N_HIGH_MEMORY]))
			return -EINVAL;
	}
	oldmem = cs->mems_allowed;
	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, &trialcs);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs.mems_allowed;
	cs->mems_generation = cpuset_mems_generation++;
	mutex_unlock(&callback_mutex);

	retval = update_tasks_nodemask(cs, &oldmem);
done:
	return retval;
}

int current_cpuset_is_being_rebound(void)
{
	return task_cs(current) == cpuset_being_rebound;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
	if (val < -1 || val >= SD_LV_MAX)
		return -EINVAL;

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpus_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			async_rebuild_sched_domains();
	}

	return 0;
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cgroup_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset trialcs;
	int err;
	int cpus_nonempty, balance_flag_changed;

	trialcs = *cs;
	if (turning_on)
		set_bit(bit, &trialcs.flags);
	else
		clear_bit(bit, &trialcs.flags);

	err = validate_change(cs, &trialcs);
	if (err < 0)
		return err;
	cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
	balance_flag_changed = (is_sched_load_balance(cs) !=
					is_sched_load_balance(&trialcs));

	mutex_lock(&callback_mutex);
	cs->flags = trialcs.flags;
	mutex_unlock(&callback_mutex);

	if (cpus_nonempty && balance_flag_changed)
		async_rebuild_sched_domains();

	return 0;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *    fmeter_init() - initialize a frequency meter.
 *    fmeter_markevent() - called each time the event happens.
 *    fmeter_getrate() - returns the recent rate of such events.
 *    fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */
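/*
 * Decay check for the half-life claim above: with FM_COEF = 933 and
 * FM_SCALE = 1000, ten idle one-second ticks scale the value by
 * (933/1000)^10, which is approximately 0.50 - hence the 10 second
 * half-life.
 */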
#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss,
			     struct cgroup *cont, struct task_struct *tsk)
{
	struct cpuset *cs = cgroup_cs(cont);

	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;
	if (tsk->flags & PF_THREAD_BOUND) {
		cpumask_t mask;

		mutex_lock(&callback_mutex);
		mask = cs->cpus_allowed;
		mutex_unlock(&callback_mutex);
		if (!cpus_equal(tsk->cpus_allowed, mask))
			return -EINVAL;
	}

	return security_task_setscheduler(tsk, 0, NULL);
}

static void cpuset_attach(struct cgroup_subsys *ss,
			  struct cgroup *cont, struct cgroup *oldcont,
			  struct task_struct *tsk)
{
	cpumask_t cpus;
	nodemask_t from, to;
	struct mm_struct *mm;
	struct cpuset *cs = cgroup_cs(cont);
	struct cpuset *oldcs = cgroup_cs(oldcont);
	int err;

	mutex_lock(&callback_mutex);
	guarantee_online_cpus(cs, &cpus);
	err = set_cpus_allowed_ptr(tsk, &cpus);
	mutex_unlock(&callback_mutex);
	if (err)
		return;

	from = oldcs->mems_allowed;
	to = cs->mems_allowed;
	mm = get_task_mm(tsk);
	if (mm) {
		mpol_rebind_mm(mm, &to);
		if (is_memory_migrate(cs))
			cpuset_migrate_mm(mm, &from, &to);
		mmput(mm);
	}
}
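/*
 * Standalone sketch (not part of cpuset.c): a minimal userspace model of the
 * fmeter arithmetic above, handy for watching the filter behave without
 * instrumenting a kernel.  It mirrors the FM_COEF/FM_SCALE constants and the
 * decay-then-fold-in-events order of fmeter_update(); the per-second loop
 * stands in for get_seconds(), and locking is omitted.  All names here
 * (demo_fmeter, demo_tick, DEMO_*) are illustrative, not kernel symbols.
 */
#include <stdio.h>

#define DEMO_COEF	933	/* same half-life coefficient as FM_COEF */
#define DEMO_SCALE	1000	/* same faux fixed point scale as FM_SCALE */

struct demo_fmeter {
	int cnt;	/* unprocessed events this second, scaled by DEMO_SCALE */
	int val;	/* filtered rate: events/sec, scaled by DEMO_SCALE */
};

/* Advance the filter by one simulated second, then fold in pending events. */
static void demo_tick(struct demo_fmeter *f)
{
	f->val = (DEMO_COEF * f->val) / DEMO_SCALE;
	f->val += ((DEMO_SCALE - DEMO_COEF) * f->cnt) / DEMO_SCALE;
	f->cnt = 0;
}

int main(void)
{
	struct demo_fmeter f = { 0, 0 };
	int sec;

	/* 100 seconds at a steady 5 events/sec: val settles just under 5000. */
	for (sec = 0; sec < 100; sec++) {
		f.cnt = 5 * DEMO_SCALE;		/* five marked events */
		demo_tick(&f);
	}
	printf("steady 5/sec: val = %d\n", f.val);

	/* Stop the events: val roughly halves every 10 idle seconds. */
	for (sec = 1; sec <= 20; sec++) {
		demo_tick(&f);
		if (sec % 10 == 0)
			printf("%2d idle seconds: val = %d\n", sec, f.val);
	}
	return 0;
}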