📄 cpuset.c
	if (!is_cpuset_subset(trial, par))
		return -EACCES;

	/*
	 * If either I or some sibling (!= me) is exclusive, we can't
	 * overlap
	 */
	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
		c = cgroup_cs(cont);
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
			return -EINVAL;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			return -EINVAL;
	}

	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
	if (cgroup_task_count(cur->css.cgroup)) {
		if (cpus_empty(trial->cpus_allowed) ||
		    nodes_empty(trial->mems_allowed)) {
			return -ENOSPC;
		}
	}

	return 0;
}

/*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
	return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
	if (dattr->relax_domain_level < c->relax_domain_level)
		dattr->relax_domain_level = c->relax_domain_level;
	return;
}

static void
update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
{
	LIST_HEAD(q);

	list_add(&c->stack_list, &q);
	while (!list_empty(&q)) {
		struct cpuset *cp;
		struct cgroup *cont;
		struct cpuset *child;

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		if (cpus_empty(cp->cpus_allowed))
			continue;

		if (is_sched_load_balance(cp))
			update_domain_attr(dattr, cp);

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}
}
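The exclusivity rule enforced above reduces to a simple bitmask test. Below is a minimal userspace sketch of the sibling check, assuming a toy struct cs and a 64-bit mask_t in place of the kernel's struct cpuset and cpumask_t, and omitting the c != cur self-skip and the memory-node half of the check; the names here are illustrative, not kernel API.

#include <stdio.h>

typedef unsigned long long mask_t;	/* one bit per CPU, up to 64 CPUs */

struct cs {
	mask_t cpus;
	int cpu_exclusive;
};

/*
 * Sketch of the sibling test in validate_change(): a trial mask may
 * not intersect any sibling's mask when either side is exclusive.
 */
static int check_siblings(const struct cs *trial,
			  const struct cs *sibs, int nsibs)
{
	int i;

	for (i = 0; i < nsibs; i++) {
		if ((trial->cpu_exclusive || sibs[i].cpu_exclusive) &&
		    (trial->cpus & sibs[i].cpus))
			return -1;	/* stands in for -EINVAL */
	}
	return 0;
}

int main(void)
{
	struct cs sibs[] = { { 0x3ULL, 1 }, { 0xcULL, 0 } };
	struct cs trial = { 0x6ULL, 0 };	/* overlaps exclusive sibling */

	printf("%d\n", check_siblings(&trial, sibs, 2));	/* prints -1 */
	return 0;
}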
/*
 * generate_sched_domains()
 *
 * This function builds a partial partition of the system's CPUs.
 * A 'partial partition' is a set of non-overlapping subsets whose
 * union is a subset of that set.
 * The output of this function needs to be passed to the kernel/sched.c
 * partition_sched_domains() routine, which will rebuild the scheduler's
 * load balancing domains (sched domains) as specified by that partial
 * partition.
 *
 * See "What is sched_load_balance" in Documentation/cpusets.txt
 * for a background explanation of this.
 *
 * Does not return errors, on the theory that the callers of this
 * routine would rather not worry about failures to rebuild sched
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
 * Must be called with cgroup_lock held.
 *
 * The three key local variables below are:
 *    q    - a linked-list queue of cpuset pointers, used to implement a
 *           top-down scan of all cpusets.  This scan loads a pointer
 *           to each cpuset marked is_sched_load_balance into the
 *           array 'csa'.  For our purposes, rebuilding the scheduler's
 *           sched domains, we can ignore !is_sched_load_balance cpusets.
 *    csa  - (for CpuSet Array) Array of pointers to all the cpusets
 *           that need to be load balanced, for convenient iterative
 *           access by the subsequent code that finds the best partition,
 *           i.e. the set of domains (subsets) of CPUs such that the
 *           cpus_allowed of every cpuset marked is_sched_load_balance
 *           is a subset of one of these domains, while there are as
 *           many such domains as possible, each as small as possible.
 *    doms - Conversion of 'csa' to an array of cpumasks, for passing to
 *           the kernel/sched.c routine partition_sched_domains() in a
 *           convenient format, that can be easily compared to the prior
 *           value to determine what partition elements (sched domains)
 *           were changed (added or removed).
 *
 * Finding the best partition (set of domains):
 *    The triple nested loops below over i, j, k scan over the
 *    load balanced cpusets (using the array of cpuset pointers in
 *    csa[]) looking for pairs of cpusets that have overlapping
 *    cpus_allowed but don't have the same 'pn' partition number,
 *    and merges them into the same partition number.  It keeps
 *    looping on the 'restart' label until it can no longer find
 *    any such pairs.
 *
 *    The union of the cpus_allowed masks from the set of
 *    all cpusets having the same 'pn' value then forms the one
 *    element of the partition (one sched domain) to be passed to
 *    partition_sched_domains().
 */
static int generate_sched_domains(cpumask_t **domains,
			struct sched_domain_attr **attributes)
{
	LIST_HEAD(q);		/* queue of cpusets to be scanned */
	struct cpuset *cp;	/* scans q */
	struct cpuset **csa;	/* array of all cpuset ptrs */
	int csn;		/* how many cpuset ptrs in csa so far */
	int i, j, k;		/* indices for partition finding loops */
	cpumask_t *doms;	/* resulting partition; i.e. sched domains */
	struct sched_domain_attr *dattr;  /* attributes for custom domains */
	int ndoms;		/* number of sched domains in result */
	int nslot;		/* next empty doms[] cpumask_t slot */

	ndoms = 0;
	doms = NULL;
	dattr = NULL;
	csa = NULL;

	/* Special case for the 99% of systems with one, full, sched domain */
	if (is_sched_load_balance(&top_cpuset)) {
		doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
		if (!doms)
			goto done;

		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
		if (dattr) {
			*dattr = SD_ATTR_INIT;
			update_domain_attr_tree(dattr, &top_cpuset);
		}
		*doms = top_cpuset.cpus_allowed;

		ndoms = 1;
		goto done;
	}

	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
	if (!csa)
		goto done;
	csn = 0;

	list_add(&top_cpuset.stack_list, &q);
	while (!list_empty(&q)) {
		struct cgroup *cont;
		struct cpuset *child;	/* scans child cpusets of cp */

		cp = list_first_entry(&q, struct cpuset, stack_list);
		list_del(q.next);

		if (cpus_empty(cp->cpus_allowed))
			continue;

		/*
		 * All child cpusets contain a subset of the parent's cpus,
		 * so just skip them, and then update_domain_attr_tree()
		 * calculates the relax_domain_level of the corresponding
		 * sched domain.
		 */
		if (is_sched_load_balance(cp)) {
			csa[csn++] = cp;
			continue;
		}

		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
			child = cgroup_cs(cont);
			list_add_tail(&child->stack_list, &q);
		}
	}

	for (i = 0; i < csn; i++)
		csa[i]->pn = i;
	ndoms = csn;

restart:
	/* Find the best partition (set of sched domains) */
	for (i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		int apn = a->pn;

		for (j = 0; j < csn; j++) {
			struct cpuset *b = csa[j];
			int bpn = b->pn;

			if (apn != bpn && cpusets_overlap(a, b)) {
				for (k = 0; k < csn; k++) {
					struct cpuset *c = csa[k];

					if (c->pn == bpn)
						c->pn = apn;
				}
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/*
	 * Now we know how many domains to create.
	 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
	 */
	doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
	if (!doms) {
		ndoms = 0;
		goto done;
	}

	/*
	 * The rest of the code, including the scheduler, can deal with
	 * the dattr==NULL case.  No need to abort if the alloc fails.
	 */
	dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);

	for (nslot = 0, i = 0; i < csn; i++) {
		struct cpuset *a = csa[i];
		cpumask_t *dp;
		int apn = a->pn;

		if (apn < 0) {
			/* Skip completed partitions */
			continue;
		}

		dp = doms + nslot;

		if (nslot == ndoms) {
			static int warnings = 10;
			if (warnings) {
				printk(KERN_WARNING
					"rebuild_sched_domains confused:"
					" nslot %d, ndoms %d, csn %d, i %d,"
					" apn %d\n",
					nslot, ndoms, csn, i, apn);
				warnings--;
			}
			continue;
		}

		cpus_clear(*dp);
		if (dattr)
			*(dattr + nslot) = SD_ATTR_INIT;
		for (j = i; j < csn; j++) {
			struct cpuset *b = csa[j];

			if (apn == b->pn) {
				cpus_or(*dp, *dp, b->cpus_allowed);
				if (dattr)
					update_domain_attr_tree(dattr + nslot, b);

				/* Done with this partition */
				b->pn = -1;
			}
		}
		nslot++;
	}
	BUG_ON(nslot != ndoms);

done:
	kfree(csa);

	*domains    = doms;
	*attributes = dattr;
	return ndoms;
}
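The restart loop is the heart of generate_sched_domains(). The standalone sketch below reproduces just that merging step on plain 64-bit masks; find_partition and mask_t are hypothetical names standing in for the kernel types. Overlapping masks collapse into one partition number, and the number of surviving labels is the number of sched domains.

#include <stdio.h>

typedef unsigned long long mask_t;

/*
 * Mirror of the 'restart' loop above: whenever two masks with
 * different partition numbers overlap, relabel the whole second
 * class and start over.  Returns the resulting domain count.
 */
static int find_partition(const mask_t *cpus, int *pn, int n)
{
	int i, j, k, ndoms = n;

	for (i = 0; i < n; i++)
		pn[i] = i;
restart:
	for (i = 0; i < n; i++) {
		for (j = 0; j < n; j++) {
			if (pn[i] != pn[j] && (cpus[i] & cpus[j])) {
				int bpn = pn[j];

				for (k = 0; k < n; k++)
					if (pn[k] == bpn)
						pn[k] = pn[i];
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}
	return ndoms;
}

int main(void)
{
	/* cpusets on CPUs {0,1}, {1,2}, {4,5}: first two share CPU 1 */
	mask_t cpus[] = { 0x3ULL, 0x6ULL, 0x30ULL };
	int pn[3];

	printf("ndoms = %d\n", find_partition(cpus, pn, 3));	/* ndoms = 2 */
	return 0;
}

Note the result is a partial partition in the sense of the comment above: the two merged cpusets form one domain, the third forms another, and any CPU in no load-balanced cpuset belongs to no domain at all.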
/*
 * Rebuild scheduler domains.
 *
 * Call with neither cgroup_mutex held nor within get_online_cpus().
 * Takes both cgroup_mutex and get_online_cpus().
 *
 * Cannot be directly called from cpuset code handling changes
 * to the cpuset pseudo-filesystem, because it cannot be called
 * from code that already holds cgroup_mutex.
 */
static void do_rebuild_sched_domains(struct work_struct *unused)
{
	struct sched_domain_attr *attr;
	cpumask_t *doms;
	int ndoms;

	get_online_cpus();

	/* Generate domain masks and attrs */
	cgroup_lock();
	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);

	put_online_cpus();
}

static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);

/*
 * Rebuild scheduler domains, asynchronously via workqueue.
 *
 * If the flag 'sched_load_balance' of any cpuset with non-empty
 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
 * which has that flag enabled, or if any cpuset with a non-empty
 * 'cpus' is removed, then call this routine to rebuild the
 * scheduler's dynamic sched domains.
 *
 * The rebuild_sched_domains() and partition_sched_domains()
 * routines must nest cgroup_lock() inside get_online_cpus(),
 * but such cpuset changes as these must nest that locking the
 * other way, holding cgroup_lock() for much of the code.
 *
 * So in order to avoid an ABBA deadlock, the cpuset code handling
 * these user changes delegates the actual sched domain rebuilding
 * to a separate workqueue thread, which ends up processing the
 * above do_rebuild_sched_domains() function.
 */
static void async_rebuild_sched_domains(void)
{
	schedule_work(&rebuild_sched_domains_work);
}

/*
 * Accomplishes the same scheduler domain rebuild as the above
 * async_rebuild_sched_domains(), however it directly calls the
 * rebuild routine synchronously rather than calling it via an
 * asynchronous work thread.
 *
 * This can only be called from code that is not holding
 * cgroup_mutex (not nested in a cgroup_lock() call.)
 */
void rebuild_sched_domains(void)
{
	do_rebuild_sched_domains(NULL);
}
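The ABBA-avoidance pattern described above is not cpuset-specific: the path that already holds lock B never takes lock A itself; it only queues work, and a single worker takes the locks in the fixed A-then-B order. Below is a minimal userspace analogue using pthreads; all names are hypothetical, and the one-shot flag-and-condvar pair is only a stand-in for schedule_work() and the kernel workqueue, not an equivalent.

#include <pthread.h>
#include <stdio.h>

/* One pending unit of work, protected by wq_lock. */
static pthread_mutex_t wq_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wq_cond = PTHREAD_COND_INITIALIZER;
static int wq_pending;

/* Stands in for the A-then-B path (get_online_cpus -> cgroup_lock). */
static void rebuild(void)
{
	printf("rebuilding domains with locks taken A-then-B\n");
}

/* Worker: the only context that ever runs rebuild(). */
static void *worker(void *arg)
{
	pthread_mutex_lock(&wq_lock);
	while (!wq_pending)
		pthread_cond_wait(&wq_cond, &wq_lock);
	wq_pending = 0;
	pthread_mutex_unlock(&wq_lock);

	rebuild();
	return NULL;
}

/* Called while the B lock is held: just queue, never take A here. */
static void async_rebuild(void)
{
	pthread_mutex_lock(&wq_lock);
	wq_pending = 1;
	pthread_cond_signal(&wq_cond);
	pthread_mutex_unlock(&wq_lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	async_rebuild();
	pthread_join(t, NULL);
	return 0;
}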
/**
 * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
 * @tsk: task to test
 * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
 *
 * Call with cgroup_mutex held.  May take callback_mutex during call.
 * Called for each task in a cgroup by cgroup_scan_tasks().
 * Return nonzero if this task's cpus_allowed mask should be changed (in other
 * words, if its mask is not equal to its cpuset's mask).
 */
static int cpuset_test_cpumask(struct task_struct *tsk,
			       struct cgroup_scanner *scan)
{
	return !cpus_equal(tsk->cpus_allowed,
			(cgroup_cs(scan->cg))->cpus_allowed);
}

/**
 * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's
 * @tsk: task to test
 * @scan: struct cgroup_scanner containing the cgroup of the task
 *
 * Called by cgroup_scan_tasks() for each task in a cgroup whose
 * cpus_allowed mask needs to be changed.
 *
 * We don't need to re-check for the cgroup/cpuset membership, since we're
 * holding cgroup_lock() at this point.
 */
static void cpuset_change_cpumask(struct task_struct *tsk,
				  struct cgroup_scanner *scan)
{
	set_cpus_allowed_ptr(tsk, &((cgroup_cs(scan->cg))->cpus_allowed));
}

/**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held.
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 *
 * No return value.  It's guaranteed that cgroup_scan_tasks() always returns 0
 * if @heap != NULL.
 */
static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
{
	struct cgroup_scanner scan;

	scan.cg = cs->css.cgroup;
	scan.test_task = cpuset_test_cpumask;
	scan.process_task = cpuset_change_cpumask;
	scan.heap = heap;
	cgroup_scan_tasks(&scan);
}

/**
 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
 * @cs: the cpuset to consider
 * @buf: buffer of cpu numbers written to this cpuset
 */
static int update_cpumask(struct cpuset *cs, const char *buf)
{
	struct ptr_heap heap;
	struct cpuset trialcs;
	int retval;
	int is_load_balanced;

	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	trialcs = *cs;

	/*
	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
	 * Since cpulist_parse() fails on an empty mask, we special case
	 * that parsing.  The validate_change() call ensures that cpusets
	 * with tasks have cpus.
	 */
	if (!*buf) {
		cpus_clear(trialcs.cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs.cpus_allowed);
		if (retval < 0)
			return retval;
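cpulist_parse() above turns a CPU-list string such as "0-3,6" (the format written to a cpuset's 'cpus' file) into a cpumask. For reference, a minimal userspace sketch of that parsing, assuming a hypothetical parse_cpulist() over a 64-bit mask rather than the kernel's cpulist_parse()/cpumask_t:

#include <stdio.h>
#include <stdlib.h>

typedef unsigned long long mask_t;

/*
 * Userspace analogue of cpulist_parse(): turn "0-3,6" into a bitmask.
 * Returns 0 on success, -1 on malformed input or a CPU >= 64.
 */
static int parse_cpulist(const char *s, mask_t *mask)
{
	*mask = 0;
	while (*s) {
		char *end;
		long lo = strtol(s, &end, 10), hi = lo;

		if (end == s || lo < 0 || lo > 63)
			return -1;
		if (*end == '-') {		/* "lo-hi" range */
			s = end + 1;
			hi = strtol(s, &end, 10);
			if (end == s || hi < lo || hi > 63)
				return -1;
		}
		while (lo <= hi)
			*mask |= 1ULL << lo++;
		if (*end == ',')
			end++;
		else if (*end)
			return -1;
		s = end;
	}
	return 0;
}

int main(void)
{
	mask_t m;

	if (parse_cpulist("0-3,6", &m) == 0)
		printf("mask = 0x%llx\n", m);	/* mask = 0x4f */
	return 0;
}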