cpuset.c
		if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
			return -EINVAL;
	}
	retval = validate_change(cs, &trialcs);
	if (retval < 0)
		return retval;

	/* Nothing to do if the cpus didn't change */
	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
		return 0;

	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
	if (retval)
		return retval;

	is_load_balanced = is_sched_load_balance(&trialcs);

	mutex_lock(&callback_mutex);
	cs->cpus_allowed = trialcs.cpus_allowed;
	mutex_unlock(&callback_mutex);

	/*
	 * Scan tasks in the cpuset, and update the cpumasks of any
	 * that need an update.
	 */
	update_tasks_cpumask(cs, &heap);

	heap_free(&heap);

	if (is_load_balanced)
		async_rebuild_sched_domains();
	return 0;
}

/*
 * cpuset_migrate_mm
 *
 * Migrate memory region from one set of nodes to another.
 *
 * Temporarily set the task's mems_allowed to the target nodes of the
 * migration, so that the migration code can allocate pages on these nodes.
 *
 * Call holding cgroup_mutex, so current's cpuset won't change
 * during this call, as manage_mutex holds off any cpuset_attach()
 * calls.  Therefore we don't need to take task_lock around the
 * call to guarantee_online_mems(), as we know no one is changing
 * our task's cpuset.
 *
 * Hold callback_mutex around the two modifications of our task's
 * mems_allowed to synchronize with cpuset_mems_allowed().
 *
 * While the mm_struct we are migrating is typically from some
 * other task, the task_struct mems_allowed that we are hacking
 * is for our current task, which must allocate new pages for that
 * migrating memory region.
 *
 * We call cpuset_update_task_memory_state() before hacking
 * our task's mems_allowed, so that we are assured of being in
 * sync with our task's cpuset, and in particular, callbacks to
 * cpuset_update_task_memory_state() from nested page allocations
 * won't see any mismatch of our cpuset and task mems_generation
 * values, so won't overwrite our hacked task's mems_allowed
 * nodemask.
 */

static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	cpuset_update_task_memory_state();

	mutex_lock(&callback_mutex);
	tsk->mems_allowed = *to;
	mutex_unlock(&callback_mutex);

	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	mutex_lock(&callback_mutex);
	guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
	mutex_unlock(&callback_mutex);
}

static void *cpuset_being_rebound;

/**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 * @oldmem: old mems_allowed of cpuset cs
 *
 * Called with cgroup_mutex held
 * Return 0 if successful, -errno if not.
 */
static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
{
	struct task_struct *p;
	struct mm_struct **mmarray;
	int i, n, ntasks;
	int migrate;
	int fudge;
	struct cgroup_iter it;
	int retval;

	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */

	fudge = 10;				/* spare mmarray[] slots */
	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
	retval = -ENOMEM;

	/*
	 * Allocate mmarray[] to hold mm reference for each task
	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
	 * few more lines of code, we can retry until we get a big
	 * enough mmarray[] w/o using GFP_ATOMIC.
	 */
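	/*
	 * Strategy for the loop below: guess the task count, pad it with
	 * 'fudge', allocate, then re-check the count under tasklist_lock.
	 * If the cpuset grew past our guess in the meantime, drop the
	 * lock, free the too-small array and try again with the larger
	 * estimate.
	 */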
	while (1) {
		ntasks = cgroup_task_count(cs->css.cgroup);	/* guess */
		ntasks += fudge;
		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
		if (!mmarray)
			goto done;
		read_lock(&tasklist_lock);		/* block fork */
		if (cgroup_task_count(cs->css.cgroup) <= ntasks)
			break;				/* got enough */
		read_unlock(&tasklist_lock);		/* try again */
		kfree(mmarray);
	}

	n = 0;

	/* Load up mmarray[] with mm reference for each task in cpuset. */
	cgroup_iter_start(cs->css.cgroup, &it);
	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
		struct mm_struct *mm;

		if (n >= ntasks) {
			printk(KERN_WARNING
				"Cpuset mempolicy rebind incomplete.\n");
			break;
		}
		mm = get_task_mm(p);
		if (!mm)
			continue;
		mmarray[n++] = mm;
	}
	cgroup_iter_end(cs->css.cgroup, &it);
	read_unlock(&tasklist_lock);

	/*
	 * Now that we've dropped the tasklist spinlock, we can
	 * rebind the vma mempolicies of each mm in mmarray[] to their
	 * new cpuset, and release that mm.  The mpol_rebind_mm()
	 * call takes mmap_sem, which we couldn't take while holding
	 * tasklist_lock.  Forks can happen again now - the mpol_dup()
	 * cpuset_being_rebound check will catch such forks, and rebind
	 * their vma mempolicies too.  Because we still hold the global
	 * cgroup_mutex, we know that no other rebind effort will
	 * be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	migrate = is_memory_migrate(cs);
	for (i = 0; i < n; i++) {
		struct mm_struct *mm = mmarray[i];

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
		mmput(mm);
	}

	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
	kfree(mmarray);
	cpuset_being_rebound = NULL;
	retval = 0;
done:
	return retval;
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpuset's mems_allowed and mems_generation, and for each
 * task in the cpuset, rebind any vma mempolicies and if
 * the cpuset is marked 'memory_migrate', migrate the tasks'
 * pages to the new memory.
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */
static int update_nodemask(struct cpuset *cs, const char *buf)
{
	struct cpuset trialcs;
	nodemask_t oldmem;
	int retval;

	/*
	 * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
	 * it's read-only
	 */
	if (cs == &top_cpuset)
		return -EACCES;

	trialcs = *cs;

	/*
	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
	 * Since nodelist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have memory.
	 */
	if (!*buf) {
		nodes_clear(trialcs.mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs.mems_allowed);
		if (retval < 0)
			goto done;

		if (!nodes_subset(trialcs.mems_allowed,
				node_states[N_HIGH_MEMORY]))
			return -EINVAL;
	}
	oldmem = cs->mems_allowed;
	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	retval = validate_change(cs, &trialcs);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs.mems_allowed;
	cs->mems_generation = cpuset_mems_generation++;
	mutex_unlock(&callback_mutex);

	retval = update_tasks_nodemask(cs, &oldmem);
done:
	return retval;
}

int current_cpuset_is_being_rebound(void)
{
	return task_cs(current) == cpuset_being_rebound;
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
	if (val < -1 || val >= SD_LV_MAX)
		return -EINVAL;

	if (val != cs->relax_domain_level) {
		cs->relax_domain_level = val;
		if (!cpus_empty(cs->cpus_allowed) &&
		    is_sched_load_balance(cs))
			async_rebuild_sched_domains();
	}

	return 0;
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:		the bit to update (see cpuset_flagbits_t)
 * cs:		the cpuset to update
 * turning_on:	whether the flag is being set or cleared
 *
 * Call with cgroup_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
		       int turning_on)
{
	struct cpuset trialcs;
	int err;
	int cpus_nonempty, balance_flag_changed;

	trialcs = *cs;
	if (turning_on)
		set_bit(bit, &trialcs.flags);
	else
		clear_bit(bit, &trialcs.flags);

	err = validate_change(cs, &trialcs);
	if (err < 0)
		return err;
	cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
	balance_flag_changed = (is_sched_load_balance(cs) !=
					is_sched_load_balance(&trialcs));

	mutex_lock(&callback_mutex);
	cs->flags = trialcs.flags;
	mutex_unlock(&callback_mutex);

	if (cpus_nonempty && balance_flag_changed)
		async_rebuild_sched_domains();

	return 0;
}

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *    fmeter_init() - initialize a frequency meter.
 *    fmeter_markevent() - called each time the event happens.
 *    fmeter_getrate() - returns the recent rate of such events.
 *    fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */
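/*
 * Decay check for the half-life claim above: with FM_COEF = 933 and
 * FM_SCALE = 1000, ten idle one-second ticks scale the value by
 * (933/1000)^10, which is approximately 0.50 - hence the 10 second
 * half-life.
 */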
#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}

/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss,
			     struct cgroup *cont, struct task_struct *tsk)
{
	struct cpuset *cs = cgroup_cs(cont);

	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;
	if (tsk->flags & PF_THREAD_BOUND) {
		cpumask_t mask;

		mutex_lock(&callback_mutex);
		mask = cs->cpus_allowed;
		mutex_unlock(&callback_mutex);
		if (!cpus_equal(tsk->cpus_allowed, mask))
			return -EINVAL;
	}

	return security_task_setscheduler(tsk, 0, NULL);
}

static void cpuset_attach(struct cgroup_subsys *ss,
			  struct cgroup *cont, struct cgroup *oldcont,
			  struct task_struct *tsk)
{
	cpumask_t cpus;
	nodemask_t from, to;
	struct mm_struct *mm;
	struct cpuset *cs = cgroup_cs(cont);
	struct cpuset *oldcs = cgroup_cs(oldcont);
	int err;

	mutex_lock(&callback_mutex);
	guarantee_online_cpus(cs, &cpus);
	err = set_cpus_allowed_ptr(tsk, &cpus);
	mutex_unlock(&callback_mutex);
	if (err)
		return;

	from = oldcs->mems_allowed;
	to = cs->mems_allowed;
	mm = get_task_mm(tsk);
	if (mm) {
		mpol_rebind_mm(mm, &to);
		if (is_memory_migrate(cs))
			cpuset_migrate_mm(mm, &from, &to);
		mmput(mm);
	}
}
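/*
 * Standalone sketch (not part of cpuset.c): a minimal userspace model of the
 * fmeter arithmetic above, handy for watching the filter behave without
 * instrumenting a kernel.  It mirrors the FM_COEF/FM_SCALE constants and the
 * decay-then-fold-in-events order of fmeter_update(); the per-second loop
 * stands in for get_seconds(), and locking is omitted.  All names here
 * (demo_fmeter, demo_tick, DEMO_*) are illustrative, not kernel symbols.
 */
#include <stdio.h>

#define DEMO_COEF	933	/* same half-life coefficient as FM_COEF */
#define DEMO_SCALE	1000	/* same faux fixed point scale as FM_SCALE */

struct demo_fmeter {
	int cnt;	/* unprocessed events this second, scaled by DEMO_SCALE */
	int val;	/* filtered rate: events/sec, scaled by DEMO_SCALE */
};

/* Advance the filter by one simulated second, then fold in pending events. */
static void demo_tick(struct demo_fmeter *f)
{
	f->val = (DEMO_COEF * f->val) / DEMO_SCALE;
	f->val += ((DEMO_SCALE - DEMO_COEF) * f->cnt) / DEMO_SCALE;
	f->cnt = 0;
}

int main(void)
{
	struct demo_fmeter f = { 0, 0 };
	int sec;

	/* 100 seconds at a steady 5 events/sec: val settles just under 5000. */
	for (sec = 0; sec < 100; sec++) {
		f.cnt = 5 * DEMO_SCALE;		/* five marked events */
		demo_tick(&f);
	}
	printf("steady 5/sec: val = %d\n", f.val);

	/* Stop the events: val roughly halves every 10 idle seconds. */
	for (sec = 1; sec <= 20; sec++) {
		demo_tick(&f);
		if (sec % 10 == 0)
			printf("%2d idle seconds: val = %d\n", sec, f.val);
	}
	return 0;
}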