📄 cpuset.c
/*
 * cpuset_init_early - just enough so that the calls to
 * cpuset_update_task_memory_state() in early init code
 * are harmless.
 */

int __init cpuset_init_early(void)
{
	top_cpuset.mems_generation = cpuset_mems_generation++;
	return 0;
}

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system,
 **/

int __init cpuset_init(void)
{
	int err = 0;

	cpus_setall(top_cpuset.cpus_allowed);
	nodes_setall(top_cpuset.mems_allowed);

	fmeter_init(&top_cpuset.fmeter);
	top_cpuset.mems_generation = cpuset_mems_generation++;
	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
	top_cpuset.relax_domain_level = -1;

	err = register_filesystem(&cpuset_fs_type);
	if (err < 0)
		return err;

	number_of_cpusets = 1;
	return 0;
}

/**
 * cpuset_do_move_task - move a given task to another cpuset
 * @tsk: pointer to task_struct of the task to move
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup.
 * Return nonzero to stop the walk through the tasks.
 */
static void cpuset_do_move_task(struct task_struct *tsk,
				struct cgroup_scanner *scan)
{
	struct cpuset_hotplug_scanner *chsp;

	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
	cgroup_attach_task(chsp->to, tsk);
}

/**
 * move_member_tasks_to_cpuset - move tasks from one cpuset to another
 * @from: cpuset in which the tasks currently reside
 * @to: cpuset to which the tasks will be moved
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 */
static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
{
	struct cpuset_hotplug_scanner scan;

	scan.scan.cg = from->css.cgroup;
	scan.scan.test_task = NULL; /* select all tasks in cgroup */
	scan.scan.process_task = cpuset_do_move_task;
	scan.scan.heap = NULL;
	scan.to = to->css.cgroup;

	if (cgroup_scan_tasks(&scan.scan))
		printk(KERN_ERR "move_member_tasks_to_cpuset: "
				"cgroup_scan_tasks failed\n");
}

/*
 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 *
 * Called with cgroup_mutex held
 * callback_mutex must not be held, as cpuset_attach() will take it.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
	struct cpuset *parent;

	/*
	 * The cgroup's css_sets list is in use if there are tasks
	 * in the cpuset; the list is empty if there are none;
	 * the cs->css.refcnt seems always 0.
	 */
	if (list_empty(&cs->css.cgroup->css_sets))
		return;

	/*
	 * Find its next-highest non-empty parent, (top cpuset
	 * has online cpus, so can't be empty).
	 */
	parent = cs->parent;
	while (cpus_empty(parent->cpus_allowed) ||
			nodes_empty(parent->mems_allowed))
		parent = parent->parent;

	move_member_tasks_to_cpuset(cs, parent);
}

/*
 * Walk the specified cpuset subtree and look for empty cpusets.
 * The tasks of such cpuset must be moved to a parent cpuset.
 *
 * Called with cgroup_mutex held.  We take callback_mutex to modify
 * cpus_allowed and mems_allowed.
 *
 * This walk processes the tree from top to bottom, completing one layer
 * before dropping down to the next.  It always processes a node before
 * any of its children.
 *
 * For now, since we lack memory hot unplug, we'll never see a cpuset
 * that has tasks along with an empty 'mems'.  But if we did see such
 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
 */
static void scan_for_empty_cpusets(const struct cpuset *root)
{
	LIST_HEAD(queue);
	struct cpuset *cp;	/* scans cpusets being updated */
	struct cpuset *child;	/* scans child cpusets of cp */
	struct cgroup *cont;
	nodemask_t oldmems;

	list_add_tail((struct list_head *)&root->stack_list, &queue);

	while (!list_empty(&queue)) {
		cp = list_first_entry(&queue, struct cpuset, stack_list);
		list_del(queue.next);
		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &queue);
		}

		/* Continue past cpusets with all cpus, mems online */
		if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
			continue;

		oldmems = cp->mems_allowed;

		/* Remove offline cpus and mems from this cpuset. */
		mutex_lock(&callback_mutex);
		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
		nodes_and(cp->mems_allowed, cp->mems_allowed,
						node_states[N_HIGH_MEMORY]);
		mutex_unlock(&callback_mutex);

		/* Move tasks from the empty cpuset to a parent */
		if (cpus_empty(cp->cpus_allowed) ||
		    nodes_empty(cp->mems_allowed))
			remove_tasks_in_empty_cpuset(cp);
		else {
			update_tasks_cpumask(cp, NULL);
			update_tasks_nodemask(cp, &oldmems);
		}
	}
}

/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no effect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_online_map on each CPU hotplug (cpuhp) event.
 *
 * Called within get_online_cpus().  Needs to call cgroup_lock()
 * before calling generate_sched_domains().
 */
static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
				unsigned long phase, void *unused_cpu)
{
	struct sched_domain_attr *attr;
	cpumask_t *doms;
	int ndoms;

	switch (phase) {
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		break;

	default:
		return NOTIFY_DONE;
	}

	cgroup_lock();
	top_cpuset.cpus_allowed = cpu_online_map;
	scan_for_empty_cpusets(&top_cpuset);
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	return NOTIFY_OK;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
 * See also the previous routine cpuset_track_online_cpus().
 */
void cpuset_track_online_nodes(void)
{
	cgroup_lock();
	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
	scan_for_empty_cpusets(&top_cpuset);
	cgroup_unlock();
}
#endif

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 **/
void __init cpuset_init_smp(void)
{
	top_cpuset.cpus_allowed = cpu_online_map;
	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];

	hotcpu_notifier(cpuset_track_online_cpus, 0);
}

/**
 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
 *
 * Description: Returns the cpumask_t cpus_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of cpu_online_map, even if this means going outside the
 * tasks cpuset.
 **/

void cpuset_cpus_allowed(struct task_struct *tsk, cpumask_t *pmask)
{
	mutex_lock(&callback_mutex);
	cpuset_cpus_allowed_locked(tsk, pmask);
	mutex_unlock(&callback_mutex);
}

/**
 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
 * Must be called with callback_mutex held.
 **/
void cpuset_cpus_allowed_locked(struct task_struct *tsk, cpumask_t *pmask)
{
	task_lock(tsk);
	guarantee_online_cpus(task_cs(tsk), pmask);
	task_unlock(tsk);
}

void cpuset_init_current_mems_allowed(void)
{
	nodes_setall(current->mems_allowed);
}

/**
 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
 *
 * Description: Returns the nodemask_t mems_allowed of the cpuset
 * attached to the specified @tsk.  Guaranteed to return some non-empty
 * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
 * tasks cpuset.
 **/
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
	nodemask_t mask;

	mutex_lock(&callback_mutex);
	task_lock(tsk);
	guarantee_online_mems(task_cs(tsk), &mask);
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);

	return mask;
}

/**
 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
 * @nodemask: the nodemask to be checked
 *
 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
 */
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
	return nodes_intersects(*nodemask, current->mems_allowed);
}

/*
 * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
 * mem_hardwall ancestor to the specified cpuset.  Call holding
 * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
 * (an unusual configuration), then returns the root cpuset.
 */
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
		cs = cs->parent;
	return cs;
}

/**
 * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
 * @z: is this zone on an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.  If
 * __GFP_THISNODE is set, yes, we can always allocate.  If zone
 * z's node is in our tasks mems_allowed, yes.  If it's not a
 * __GFP_HARDWALL request and this zone's node is in the nearest
 * hardwalled cpuset ancestor to this tasks cpuset, yes.
 * If the task has been OOM killed and has access to memory reserves
 * as specified by the TIF_MEMDIE flag, yes.
 * Otherwise, no.
 *
 * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
 * reduces to cpuset_zone_allowed_hardwall().  Otherwise,
 * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
 * from an enclosing cpuset.
 *
 * cpuset_zone_allowed_hardwall() only handles the simpler case of
 * hardwall cpusets, and never sleeps.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
 * any node on the zonelist except the first.  By the time any such
 * calls get to this routine, we should just shut up and say 'yes'.
 *
 * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
 * and do not allow allocations outside the current tasks cpuset
 * unless the task has been OOM killed and is marked TIF_MEMDIE.
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
 * Scanning up parent cpusets requires callback_mutex.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
 * cpuset are short of memory, might require taking the callback_mutex
 * mutex.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
 * so no allocation on a node outside the cpuset is allowed (unless
 * in interrupt, of course).
 *
 * The second pass through get_page_from_freelist() doesn't even call
 * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
 * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
 * in alloc_flags.  That logic and the checks below have the combined
 * effect that:
 *	in_interrupt - any node ok (current task context irrelevant)
 *	GFP_ATOMIC   - any node ok
 *	TIF_MEMDIE   - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok.
 *
 * Rule:
 *	Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
 *	pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
 *	the code that might scan up ancestor cpusets and sleep.
 */

int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
{
	int node;			/* node that zone z is on */
	const struct cpuset *cs;	/* current cpuset ancestors */
	int allowed;			/* is allocation in zone z allowed? */

	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
		return 1;
	node = zone_to_nid(z);
	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
	if (node_isset(node, current->mems_allowed))
		return 1;
	/*
	 * Allow tasks that have access to memory reserves because they have
	 * been OOM killed to get memory anywhere.
	 */
	if (unlikely(test_thread_flag(TIF_MEMDIE)))
		return 1;
	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
		return 0;

	if (current->flags & PF_EXITING) /* Let dying task have memory */
		return 1;

	/* Not hardwall and node outside mems_allowed: scan up cpusets */
	mutex_lock(&callback_mutex);

	task_lock(current);
	cs = nearest_hardwall_ancestor(task_cs(current));
	task_unlock(current);

	allowed = node_isset(node, cs->mems_allowed);
	mutex_unlock(&callback_mutex);
	return allowed;
}

/*
 * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
 * @z: is this zone on an allowed node?
 * @gfp_mask: memory allocation flags
 *
 * If we're in interrupt, yes, we can always allocate.
 * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
 * z's node is in our tasks mems_allowed, yes.  If the task has been
 * OOM killed and has access to memory reserves as specified by the
 * TIF_MEMDIE flag, yes.  Otherwise, no.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
 * any node on the zonelist except the first.  By the time any such
 * calls get to this routine, we should just shut up and say 'yes'.
 *
 * Unlike the cpuset_zone_allowed_softwall() variant, above,
 * this variant requires that the zone be
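/*
 * Editor's note (not part of cpuset.c): the comments above describe how
 * mm/page_alloc.c's get_page_from_freelist() consults these checks while
 * scanning the zonelist.  The sketch below is a hypothetical, simplified
 * rendering of that caller side -- the function name sketch_zone_scan() is
 * invented, the allocation step is elided, and ALLOC_CPUSET stands for the
 * allocator-private flag that is left clear on the GFP_ATOMIC pass.
 */
static struct page *sketch_zone_scan(struct zonelist *zonelist,
				     gfp_t gfp_mask, int alloc_flags)
{
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
		/*
		 * Skip zones whose node this task's cpuset forbids.  With
		 * __GFP_HARDWALL set this reduces to the hardwall check;
		 * without it, the softwall variant may scan up ancestor
		 * cpusets and may sleep, as documented above.
		 */
		if ((alloc_flags & ALLOC_CPUSET) &&
		    !cpuset_zone_allowed_softwall(zone, gfp_mask))
			continue;

		/* ... attempt the allocation from this zone here ... */
	}
	return NULL;
}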