📄 cpuset.c
/*
 * cpuset_init_early - just enough so that the calls to
 * cpuset_update_task_memory_state() in early init code
 * are harmless.
 */

int __init cpuset_init_early(void)
{
	top_cpuset.mems_generation = cpuset_mems_generation++;
	return 0;
}

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system,
 **/

int __init cpuset_init(void)
{
	int err = 0;

	cpus_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);

	fmeter_init(&top_cpuset.fmeter);
	top_cpuset.mems_generation = cpuset_mems_generation++;
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	err = register_filesystem(&cpuset_fs_type);
	if (err < 0)
		return err;

	number_of_cpusets = 1;
	return 0;
}

/**
 * cpuset_do_move_task - move a given task to another cpuset
 * @tsk: pointer to task_struct of the task to move
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 * Return nonzero to stop the walk through the tasks.
 */
static void cpuset_do_move_task(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	struct cpuset_hotplug_scanner *chsp;

	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
	cgroup_attach_task(chsp->to, tsk);
}

/**
 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
 * @from: cpuset in which the tasks currently reside
 * @to: cpuset to which the tasks will be moved
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 */
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
	struct cpuset_hotplug_scanner scan;

	scan.scan.cg = from->css.cgroup;
	scan.scan.test_task = NULL; /* select all tasks in cgroup */
	scan.scan.process_task = cpuset_do_move_task;
	scan.scan.heap = NULL;
	scan.to = to->css.cgroup;

	if (cgroup_scan_tasks(&scan.scan))
		printk(KERN_ERR "move_member_tasks_to_cpuset: "
				"cgroup_scan_tasks failed\n");
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * The cgroup's css_sets list is in use if there are tasks
	 * in the cpuset; the list is empty if there are none;
	 * the cs->css.refcnt seems always 0.
	 */
	if (list_empty(&cs->css.cgroup->css_sets))
		return;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = cs->parent;
	while (cpus_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent->parent;

	move_member_tasks_to_cpuset(cs, parent);
}

/*
 * Walk the specified cpuset subtree and look for empty cpusets.
 * The tasks of such cpuset must be moved to a parent cpuset.
 *
 * Called with cgroup_mutex held.  We take callback_mutex to modify
 * cpus_allowed and mems_allowed.
 *
 * This walk processes the tree from top to bottom, completing one layer
 * before dropping down to the next.  It always processes a node before
 * any of its children.
 *
 * For now, since we lack memory hot unplug, we'll never see a cpuset
 * that has tasks along with an empty 'mems'.  But if we did see such
 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
 */
static void scan_for_empty_cpusets(const struct cpuset *root)
{
	LIST_HEAD(queue);
	struct cpuset *cp;	/* scans cpusets being updated */
	struct cpuset *child;	/* scans child cpusets of cp */
	struct cgroup *cont;
	nodemask_t oldmems;

	list_add_tail((struct list_head *)&root->stack_list, &queue);

	while (!list_empty(&queue)) {
		cp = list_first_entry(&queue, struct cpuset, stack_list);
		list_del(queue.next);
		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &queue);
		}

		/* Continue past cpusets with all cpus, mems online */
		if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
			continue;

		oldmems = cp->mems_allowed;

		/* Remove offline cpus and mems from this cpuset. */
		mutex_lock(&callback_mutex);
		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
		nodes_and(cp->mems_allowed, cp->mems_allowed,
						node_states[N_HIGH_MEMORY]);
		mutex_unlock(&callback_mutex);

		/* Move tasks from the empty cpuset to a parent */
		if (cpus_empty(cp->cpus_allowed) ||
		    nodes_empty(cp->mems_allowed))
			remove_tasks_in_empty_cpuset(cp);
		else {
			update_tasks_cpumask(cp, NULL);
			update_tasks_nodemask(cp, &oldmems);
		}
	}
}

/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no effect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_online_map on each CPU hotplug (cpuhp) event.
 *
 * Called within get_online_cpus().  Needs to call cgroup_lock()
 * before calling generate_sched_domains().
 */
static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
				unsigned long phase, void *unused_cpu)
{
	struct sched_domain_attr *attr;
	cpumask_t *doms;
	int ndoms;

	switch (phase) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		break;

	default:
		return NOTIFY_DONE;
	}

	cgroup_lock();
	top_cpuset.cpus_allowed = cpu_online_map;
	scan_for_empty_cpusets(&top_cpuset);
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	return NOTIFY_OK;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
 * See also the previous routine cpuset_track_online_cpus().
 */
void cpuset_track_online_nodes(void)
{
	cgroup_lock();
	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
	scan_for_empty_cpusets(&top_cpuset);
	cgroup_unlock();
}
#endif

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 **/
void __init cpuset_init_smp(void)
{
	top_cpuset.cpus_allowed = cpu_online_map;
	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

	hotcpu_notifier(cpuset_track_online_cpus, 0);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_map, even if this means going outside the
 * tasks cpuset.
 **/

void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
{
	mutex_lock(&callback_mutex);
	cpuset_cpus_allowed_locked(tsk, pmask);
	mutex_unlock(&callback_mutex);
}

/**
 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
 * Must be called with callback_mutex held.
 **/
void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask)
{
	task_lock(tsk);
	guarantee_online_cpus(task_cs(tsk), pmask);
	task_unlock(tsk);
}

void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;

	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_mems(task_cs(tsk), &mask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);

	return mask;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
		cs = cs->parent;
	return cs;
}

/**
 * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
 * @z: is this zone on an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If
 * __GFP_THISNODE is set, yes, we can always allocate.  If zone
 * z's node is in our tasks mems_allowed, yes.  If it's not a
 * __GFP_HARDWALL request and this zone's node is in the nearest
 * hardwalled cpuset ancestor to this tasks cpuset, yes.
 * If the task has been OOM killed and has access to memory reserves
 * as specified by the TIF_MEMDIE flag, yes.
 * Otherwise, no.
 *
 * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
 * reduces to cpuset_zone_allowed_hardwall().  Otherwise,
 * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
 * from an enclosing cpuset.
 *
 * cpuset_zone_allowed_hardwall() only handles the simpler case of
 * hardwall cpusets, and never sleeps.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
 * any node on the zonelist except the first.  By the time any such
 * calls get to this routine, we should just shut up and say 'yes'.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed and is marked TIF_MEMDIE.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_mutex.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_mutex
 * mutex.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	TIF_MEMDIE   - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok.
 *
 * Rule:
 *	Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
 *	pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
 *	the code that might scan up ancestor cpusets and sleep.
 */

int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
{
	int node;			/* node that zone z is on */
	const struct cpuset *cs;	/* current cpuset ancestors */
	int allowed;			/* is allocation in zone z allowed? */

	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	node = zone_to_nid(z);
	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return 0;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return 1;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	mutex_lock(&callback_mutex);

	task_lock(current);
	cs = nearest_hardwall_ancestor(task_cs(current));
	task_unlock(current);

	allowed = node_isset(node, cs->mems_allowed);
	mutex_unlock(&callback_mutex);
	return allowed;
}

/*
 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
 * @z: is this zone on an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.
 * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
 * z's node is in our tasks mems_allowed, yes.  If the task has been
 * OOM killed and has access to memory reserves as specified by the
 * TIF_MEMDIE flag, yes.  Otherwise, no.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
 * any node on the zonelist except the first.  By the time any such
 * calls get to this routine, we should just shut up and say 'yes'.
 *
 * Unlike the cpuset_zone_allowed_softwall() variant, above,
 * this variant requires that the zone be
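/*
 * Editor's note (not part of cpuset.c): the comments above describe how
 * mm/page_alloc.c's get_page_from_freelist() consults these checks while
 * scanning the zonelist.  The sketch below is a hypothetical, simplified
 * rendering of that caller side -- the function name sketch_zone_scan() is
 * invented, the allocation step is elided, and ALLOC_CPUSET stands for the
 * allocator-private flag that is left clear on the GFP_ATOMIC pass.
 */
static struct page *sketch_zone_scan(struct zonelist *zonelist,
				     gfp_t gfp_mask, int alloc_flags)
{
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		/*
		 * Skip zones whose node this task's cpuset forbids.  With
		 * __GFP_HARDWALL set this reduces to the hardwall check;
		 * without it, the softwall variant may scan up ancestor
		 * cpusets and may sleep, as documented above.
		 */
		if ((alloc_flags & ALLOC_CPUSET) &&
		    !cpuset_zone_allowed_softwall(zone, gfp_mask))
			continue;

		/* ... attempt the allocation from this zone here ... */
	}
	return NULL;
}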