cpuset.c
	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	mutex_lock(&callback_mutex);
	guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
	mutex_unlock(&callback_mutex);
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpuset's mems_allowed and mems_generation, and for each
 * task in the cpuset, rebind any vma mempolicies and if
 * the cpuset is marked 'memory_migrate', migrate the task's
 * pages to the new memory.
 *
 * Call with manage_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */

static int update_nodemask(struct cpuset *cs, char *buf)
{
	struct cpuset trialcs;
	nodemask_t oldmem;
	struct task_struct *g, *p;
	struct mm_struct **mmarray;
	int i, n, ntasks;
	int migrate;
	int fudge;
	int retval;

	/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	trialcs = *cs;

	/*
	 * We allow a cpuset's mems_allowed to be empty; if it has attached
	 * tasks, we'll catch it later when we validate the change and return
	 * -ENOSPC.
	 */
	if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
		nodes_clear(trialcs.mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs.mems_allowed);
		if (retval < 0)
			goto done;
	}
	nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
	oldmem = cs->mems_allowed;
	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	/* mems_allowed cannot be empty for a cpuset with attached tasks. */
	if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
		retval = -ENOSPC;
		goto done;
	}
	retval = validate_change(cs, &trialcs);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs.mems_allowed;
	cs->mems_generation = cpuset_mems_generation++;
	mutex_unlock(&callback_mutex);

	set_cpuset_being_rebound(cs);		/* causes mpol_copy() rebind */

	fudge = 10;				/* spare mmarray[] slots */
	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
	retval = -ENOMEM;

	/*
	 * Allocate mmarray[] to hold mm reference for each task
	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
	 * few more lines of code, we can retry until we get a big
	 * enough mmarray[] w/o using GFP_ATOMIC.
	 */
	while (1) {
		ntasks = atomic_read(&cs->count);	/* guess */
		ntasks += fudge;
		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
		if (!mmarray)
			goto done;
		write_lock_irq(&tasklist_lock);		/* block fork */
		if (atomic_read(&cs->count) <= ntasks)
			break;				/* got enough */
		write_unlock_irq(&tasklist_lock);	/* try again */
		kfree(mmarray);
	}

	n = 0;

	/* Load up mmarray[] with mm reference for each task in cpuset. */
	do_each_thread(g, p) {
		struct mm_struct *mm;

		if (n >= ntasks) {
			printk(KERN_WARNING
				"Cpuset mempolicy rebind incomplete.\n");
			continue;
		}
		if (p->cpuset != cs)
			continue;
		mm = get_task_mm(p);
		if (!mm)
			continue;
		mmarray[n++] = mm;
	} while_each_thread(g, p);
	write_unlock_irq(&tasklist_lock);

	/*
	 * Now that we've dropped the tasklist spinlock, we can
	 * rebind the vma mempolicies of each mm in mmarray[] to their
	 * new cpuset, and release that mm.  The mpol_rebind_mm()
	 * call takes mmap_sem, which we couldn't take while holding
	 * tasklist_lock.  Forks can happen again now - the mpol_copy()
	 * cpuset_being_rebound check will catch such forks, and rebind
	 * their vma mempolicies too.  Because we still hold the global
	 * cpuset manage_mutex, we know that no other rebind effort will
	 * be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	migrate = is_memory_migrate(cs);
	for (i = 0; i < n; i++) {
		struct mm_struct *mm = mmarray[i];

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
		mmput(mm);
	}

	/* We're done rebinding vma's to this cpuset's new mems_allowed. */
	kfree(mmarray);
	set_cpuset_being_rebound(NULL);
	retval = 0;
done:
	return retval;
}
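/*
 * Illustrative sketch (not part of the kernel source): the
 * allocate-then-recheck-under-lock retry loop in update_nodemask()
 * above, reduced to a self-contained userspace skeleton.  All names
 * here (item_count, lock_items, alloc_item_array) are hypothetical;
 * the point is the pattern: allocate while blocking allocations are
 * still permitted, then take the lock and verify the guessed size
 * still covers the population, retrying if it grew in the meantime.
 * Guarded out with #if 0 so it is never compiled.
 */
#if 0
#include <stdlib.h>

static int item_count;			/* may grow until locked */
static void lock_items(void)   { /* e.g. pthread_mutex_lock() */ }
static void unlock_items(void) { /* e.g. pthread_mutex_unlock() */ }

static void **alloc_item_array(int *nslots)
{
	void **arr;
	int n;

	for (;;) {
		n = item_count + 10;		/* guess, plus spare slots */
		arr = malloc(n * sizeof(*arr));	/* may block; lock not held */
		if (!arr)
			return NULL;
		lock_items();			/* freeze item_count */
		if (item_count <= n)
			break;			/* guess was big enough */
		unlock_items();			/* population grew: retry */
		free(arr);
	}
	*nslots = n;
	return arr;				/* caller fills, then unlocks */
}
#endif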
/*
 * Call with manage_mutex held.
 */

static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
{
	if (simple_strtoul(buf, NULL, 10) != 0)
		cpuset_memory_pressure_enabled = 1;
	else
		cpuset_memory_pressure_enabled = 0;
	return 0;
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
 *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
 *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
 * cs:	the cpuset to update
 * buf:	the buffer where we read the 0 or 1
 *
 * Call with manage_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
{
	int turning_on;
	struct cpuset trialcs;
	int err, cpu_exclusive_changed;

	turning_on = (simple_strtoul(buf, NULL, 10) != 0);

	trialcs = *cs;
	if (turning_on)
		set_bit(bit, &trialcs.flags);
	else
		clear_bit(bit, &trialcs.flags);

	err = validate_change(cs, &trialcs);
	if (err < 0)
		return err;
	cpu_exclusive_changed =
		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
	mutex_lock(&callback_mutex);
	cs->flags = trialcs.flags;
	mutex_unlock(&callback_mutex);

	if (cpu_exclusive_changed)
		update_cpu_domains(cs);

	return 0;
}
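/*
 * Illustrative sketch (not part of the kernel source): the trial-copy
 * update pattern shared by update_flag() and update_nodemask() above.
 * The change is staged in a stack copy, validated, and only then
 * committed under the lock, so a failed validation never perturbs the
 * live structure and readers never observe a half-applied change.
 * All names here (struct conf, validate_conf, conf_lock) are
 * hypothetical.  Guarded out with #if 0.
 */
#if 0
struct conf { unsigned long flags; };

static struct conf live_conf;

static int validate_conf(struct conf *c) { (void)c; return 0; /* stub */ }
static void conf_lock(void)   { /* e.g. mutex_lock() */ }
static void conf_unlock(void) { /* e.g. mutex_unlock() */ }

static int update_conf_flags(unsigned long new_flags)
{
	struct conf trial = live_conf;	/* stage the change in a copy */
	int err;

	trial.flags = new_flags;
	err = validate_conf(&trial);	/* reject invalid combinations */
	if (err < 0)
		return err;		/* live_conf untouched on failure */

	conf_lock();
	live_conf = trial;		/* commit, serialized w.r.t. readers */
	conf_unlock();
	return 0;
}
#endif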
/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}
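/*
 * Illustrative sketch (not part of the kernel source): a userspace
 * simulation of the idle-tick decay step in fmeter_update() above.
 * With FM_COEF == 933 the level is multiplied by 933/1000 once per
 * second, and 0.933^10 ~= 0.50 -- the "half-life of 10 seconds"
 * described in the comment block.  Starting from the steady state
 * for one event per second (val == 1000), this prints roughly 495,
 * 244 and 117 at t = 10, 20 and 30 seconds.  Guarded out with #if 0.
 */
#if 0
#include <stdio.h>

int main(void)
{
	int val = 1000;			/* steady state: 1 event/sec */
	int t;

	for (t = 1; t <= 30; t++) {
		val = (933 * val) / 1000;	/* one idle 1-second tick */
		if (t % 10 == 0)		/* value halves each 10 ticks */
			printf("t=%2ds val=%d\n", t, val);
	}
	return 0;
}
#endif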
/*
 * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly
 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
 * notified on release.
 *
 * Call holding manage_mutex.  May take callback_mutex and task_lock of
 * the task 'pid' during call.
 */

static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
{
	pid_t pid;
	struct task_struct *tsk;
	struct cpuset *oldcs;
	cpumask_t cpus;
	nodemask_t from, to;
	struct mm_struct *mm;
	int retval;

	if (sscanf(pidbuf, "%d", &pid) != 1)
		return -EIO;
	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;

	if (pid) {
		read_lock(&tasklist_lock);

		tsk = find_task_by_pid(pid);
		if (!tsk || tsk->flags & PF_EXITING) {
			read_unlock(&tasklist_lock);
			return -ESRCH;
		}

		get_task_struct(tsk);
		read_unlock(&tasklist_lock);

		if ((current->euid) && (current->euid != tsk->uid)
		    && (current->euid != tsk->suid)) {
			put_task_struct(tsk);
			return -EACCES;
		}
	} else {
		tsk = current;
		get_task_struct(tsk);
	}

	retval = security_task_setscheduler(tsk, 0, NULL);
	if (retval) {
		put_task_struct(tsk);
		return retval;
	}

	mutex_lock(&callback_mutex);

	task_lock(tsk);
	oldcs = tsk->cpuset;
	/*
	 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
	 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
	 * then fail this attach_task(), to avoid breaking top_cpuset.count.
	 */
	if (tsk->flags & PF_EXITING) {
		task_unlock(tsk);
		mutex_unlock(&callback_mutex);
		put_task_struct(tsk);
		return -ESRCH;
	}
	atomic_inc(&cs->count);
	rcu_assign_pointer(tsk->cpuset, cs);
	task_unlock(tsk);

	guarantee_online_cpus(cs, &cpus);
	set_cpus_allowed(tsk, cpus);

	from = oldcs->mems_allowed;
	to = cs->mems_allowed;

	mutex_unlock(&callback_mutex);

	mm = get_task_mm(tsk);
	if (mm) {
		mpol_rebind_mm(mm, &to);
		if (is_memory_migrate(cs))
			cpuset_migrate_mm(mm, &from, &to);
		mmput(mm);
	}

	put_task_struct(tsk);
	synchronize_rcu();
	if (atomic_dec_and_test(&oldcs->count))
		check_for_release(oldcs, ppathbuf);
	return 0;
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_ROOT,
	FILE_DIR,
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_NOTIFY_ON_RELEASE,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
	FILE_TASKLIST,
} cpuset_filetype_t;

static ssize_t cpuset_common_file_write(struct file *file,
					const char __user *userbuf,
					size_t nbytes, loff_t *unused_ppos)
{
	struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
	struct cftype *cft = __d_cft(file->f_path.dentry);
	cpuset_filetype_t type = cft->private;
	char *buffer;
	char *pathbuf = NULL;
	int retval = 0;

	/* Crude upper limit on largest legitimate cpulist user might write. */
	if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
		return -E2BIG;

	/* +1 for nul-terminator */
	if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
		return -ENOMEM;

	if (copy_from_user(buffer, userbuf, nbytes)) {
		retval = -EFAULT;
		goto out1;
	}
	buffer[nbytes] = 0;	/* nul-terminate */

	mutex_lock(&manage_mutex);

	if (is_removed(cs)) {
		retval = -ENODEV;
		goto out2;
	}

	switch (type) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, buffer);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, buffer);