📄 cpuset.c
	start = buf + buflen;
	*--start = '\0';
	for (;;) {
		int len = cs->dentry->d_name.len;
		if ((start -= len) < buf)
			return -ENAMETOOLONG;
		memcpy(start, cs->dentry->d_name.name, len);
		cs = cs->parent;
		if (!cs)
			break;
		if (!cs->parent)
			continue;
		if (--start < buf)
			return -ENAMETOOLONG;
		*start = '/';
	}
	memmove(buf, start, buf + buflen - start);
	return 0;
}

/*
 * Notify userspace when a cpuset is released, by running
 * /sbin/cpuset_release_agent with the name of the cpuset (path
 * relative to the root of cpuset file system) as the argument.
 *
 * Most likely, this user command will try to rmdir this cpuset.
 *
 * This races with the possibility that some other task will be
 * attached to this cpuset before it is removed, or that some other
 * user task will 'mkdir' a child cpuset of this cpuset.  That's ok.
 * The presumed 'rmdir' will fail quietly if this cpuset is no longer
 * unused, and this cpuset will be reprieved from its death sentence,
 * to continue to serve a useful existence.  Next time it's released,
 * we will get notified again, if it still has 'notify_on_release' set.
 *
 * The final arg to call_usermodehelper() is 0, which means don't
 * wait.  The separate /sbin/cpuset_release_agent task is forked by
 * call_usermodehelper(), then control in this thread returns here,
 * without waiting for the release agent task.  We don't bother to
 * wait because the caller of this routine has no use for the exit
 * status of the /sbin/cpuset_release_agent task, so no sense holding
 * our caller up for that.
 *
 * When we had only one cpuset mutex, we had to call this
 * without holding it, to avoid deadlock when call_usermodehelper()
 * allocated memory.  With two locks, we could now call this while
 * holding manage_mutex, but we still don't, so as to minimize
 * the time manage_mutex is held.
 */
static void cpuset_release_agent(const char *pathbuf)
{
	char *argv[3], *envp[3];
	int i;

	if (!pathbuf)
		return;

	i = 0;
	argv[i++] = "/sbin/cpuset_release_agent";
	argv[i++] = (char *)pathbuf;
	argv[i] = NULL;

	i = 0;
	/* minimal command environment */
	envp[i++] = "HOME=/";
	envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
	envp[i] = NULL;

	call_usermodehelper(argv[0], argv, envp, 0);
	kfree(pathbuf);
}

/*
 * Either cs->count of using tasks transitioned to zero, or the
 * cs->children list of child cpusets just became empty.  If this
 * cs is notify_on_release() and now both the user count is zero and
 * the list of children is empty, prepare cpuset path in a kmalloc'd
 * buffer, to be returned via ppathbuf, so that the caller can invoke
 * cpuset_release_agent() with it later on, once manage_mutex is dropped.
 * Call here with manage_mutex held.
 *
 * This check_for_release() routine is responsible for kmalloc'ing
 * pathbuf.  The above cpuset_release_agent() is responsible for
 * kfree'ing pathbuf.  The caller of these routines is responsible
 * for providing a pathbuf pointer, initialized to NULL, then
 * calling check_for_release() with manage_mutex held and the address
 * of the pathbuf pointer, then dropping manage_mutex, then calling
 * cpuset_release_agent() with pathbuf, as set by check_for_release().
 */
static void check_for_release(struct cpuset *cs, char **ppathbuf)
{
	if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
	    list_empty(&cs->children)) {
		char *buf;

		buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
		if (!buf)
			return;
		if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
			kfree(buf);
		else
			*ppathbuf = buf;
	}
}
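/*
 * [Editor's note -- illustrative sketch, not part of cpuset.c.]  The
 * comment above spells out the caller protocol for the release-agent
 * machinery.  A hypothetical caller (the function name and the
 * atomic_dec() on cs->count are assumptions for illustration) would
 * follow that protocol roughly like this:
 */
static void example_drop_cpuset_user(struct cpuset *cs)
{
	char *pathbuf = NULL;			/* must start out NULL */

	mutex_lock(&manage_mutex);
	atomic_dec(&cs->count);			/* cs may now be releasable */
	check_for_release(cs, &pathbuf);	/* may kmalloc path into pathbuf */
	mutex_unlock(&manage_mutex);		/* drop before invoking the agent */

	cpuset_release_agent(pathbuf);		/* kfree's pathbuf; NULL is a no-op */
}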
/*
 * Return in *pmask the portion of a cpuset's cpus_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online cpus.  If we get
 * all the way to the top and still haven't found any online cpus,
 * return cpu_online_map.  Or if passed a NULL cs from an exit'ing
 * task, return cpu_online_map.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_map.
 *
 * Call with callback_mutex held.
 */
static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
{
	while (cs && !cpus_intersects(cs->cpus_allowed, cpu_online_map))
		cs = cs->parent;
	if (cs)
		cpus_and(*pmask, cs->cpus_allowed, cpu_online_map);
	else
		*pmask = cpu_online_map;
	BUG_ON(!cpus_intersects(*pmask, cpu_online_map));
}

/*
 * Return in *pmask the portion of a cpuset's mems_allowed that
 * are online.  If none are online, walk up the cpuset hierarchy
 * until we find one that does have some online mems.  If we get
 * all the way to the top and still haven't found any online mems,
 * return node_online_map.
 *
 * One way or another, we guarantee to return some non-empty subset
 * of node_online_map.
 *
 * Call with callback_mutex held.
 */
static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{
	while (cs && !nodes_intersects(cs->mems_allowed, node_online_map))
		cs = cs->parent;
	if (cs)
		nodes_and(*pmask, cs->mems_allowed, node_online_map);
	else
		*pmask = node_online_map;
	BUG_ON(!nodes_intersects(*pmask, node_online_map));
}
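/*
 * [Editor's note -- illustrative sketch, not part of cpuset.c.]  Example
 * of how a caller could use guarantee_online_cpus() to (re)bind a task
 * to its cpuset's CPUs.  The function name here is made up; the locking
 * follows the "Call with callback_mutex held" rule stated above:
 */
static void example_rebind_task_cpus(struct task_struct *tsk, struct cpuset *cs)
{
	cpumask_t cpus;

	mutex_lock(&callback_mutex);
	guarantee_online_cpus(cs, &cpus);	/* never returns an empty mask */
	mutex_unlock(&callback_mutex);

	set_cpus_allowed(tsk, cpus);		/* safe: mask contains online CPUs */
}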
/**
 * cpuset_update_task_memory_state - update task memory placement
 *
 * If the current task's cpuset's mems_allowed changed behind our
 * backs, update current->mems_allowed, mems_generation and task NUMA
 * mempolicy to the new value.
 *
 * Task mempolicy is updated by rebinding it relative to the
 * current->cpuset if a task has its memory placement changed.
 * Do not call this routine if in_interrupt().
 *
 * Call without callback_mutex or task_lock() held.  May be
 * called with or without manage_mutex held.  Thanks in part to
 * 'the_top_cpuset_hack', the task's cpuset pointer will never
 * be NULL.  This routine also might acquire callback_mutex and
 * current->mm->mmap_sem during call.
 *
 * Reading current->cpuset->mems_generation doesn't need task_lock
 * to guard the current->cpuset dereference, because it is guarded
 * from concurrent freeing of current->cpuset by attach_task(),
 * using RCU.
 *
 * The rcu_dereference() is technically probably not needed,
 * as I don't actually mind if I see a new cpuset pointer but
 * an old value of mems_generation.  However this really only
 * matters on alpha systems using cpusets heavily.  If I dropped
 * that rcu_dereference(), it would save them a memory barrier.
 * For all other arch's, rcu_dereference is a no-op anyway, and for
 * alpha systems not using cpusets, another planned optimization,
 * avoiding the rcu critical section for tasks in the root cpuset
 * which is statically allocated, so can't vanish, will make this
 * irrelevant.  Better to use RCU as intended, than to engage in
 * some cute trick to save a memory barrier that is impossible to
 * test, for alpha systems using cpusets heavily, which might not
 * even exist.
 *
 * This routine is needed to update the per-task mems_allowed data,
 * within the task's context, when it is trying to allocate memory
 * (in various mm/mempolicy.c routines) and notices that some other
 * task has been modifying its cpuset.
 */
void cpuset_update_task_memory_state(void)
{
	int my_cpusets_mem_gen;
	struct task_struct *tsk = current;
	struct cpuset *cs;

	if (tsk->cpuset == &top_cpuset) {
		/* Don't need rcu for top_cpuset.  It's never freed. */
		my_cpusets_mem_gen = top_cpuset.mems_generation;
	} else {
		rcu_read_lock();
		cs = rcu_dereference(tsk->cpuset);
		my_cpusets_mem_gen = cs->mems_generation;
		rcu_read_unlock();
	}

	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
		mutex_lock(&callback_mutex);
		task_lock(tsk);
		cs = tsk->cpuset;	/* Maybe changed when task not locked */
		guarantee_online_mems(cs, &tsk->mems_allowed);
		tsk->cpuset_mems_generation = cs->mems_generation;
		if (is_spread_page(cs))
			tsk->flags |= PF_SPREAD_PAGE;
		else
			tsk->flags &= ~PF_SPREAD_PAGE;
		if (is_spread_slab(cs))
			tsk->flags |= PF_SPREAD_SLAB;
		else
			tsk->flags &= ~PF_SPREAD_SLAB;
		task_unlock(tsk);
		mutex_unlock(&callback_mutex);
		mpol_rebind_task(tsk, &tsk->mems_allowed);
	}
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding manage_mutex.
 */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
	return	cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
		nodes_subset(p->mems_allowed, q->mems_allowed) &&
		is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
		is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/*
 * validate_change() - Used to validate that any proposed cpuset change
 * follows the structural rules for cpusets.
 *
 * If we replaced the flag and mask values of the current cpuset
 * (cur) with those values in the trial cpuset (trial), would
 * our various subset and exclusive rules still be valid?  Presumes
 * manage_mutex held.
 *
 * 'cur' is the address of an actual, in-use cpuset.  Operations
 * such as list traversal that depend on the actual address of the
 * cpuset in the list must use cur below, not trial.
 *
 * 'trial' is the address of bulk structure copy of cur, with
 * perhaps one or more of the fields cpus_allowed, mems_allowed,
 * or flags changed to new, trial values.
 *
 * Return 0 if valid, -errno if not.
 */
static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
{
	struct cpuset *c, *par;

	/* Each of our child cpusets must be a subset of us */
	list_for_each_entry(c, &cur->children, sibling) {
		if (!is_cpuset_subset(c, trial))
			return -EBUSY;
	}

	/* Remaining checks don't apply to root cpuset */
	if (cur == &top_cpuset)
		return 0;

	par = cur->parent;

	/* We must be a subset of our parent cpuset */
	if (!is_cpuset_subset(trial, par))
		return -EACCES;

	/* If either I or some sibling (!= me) is exclusive, we can't overlap */
	list_for_each_entry(c, &par->children, sibling) {
		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
		    c != cur &&
		    cpus_intersects(trial->cpus_allowed, c->cpus_allowed))
			return -EINVAL;
		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
		    c != cur &&
		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
			return -EINVAL;
	}

	return 0;
}
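/*
 * [Editor's note -- illustrative sketch, not part of cpuset.c.]
 * validate_change() expects 'trial' to be a bulk copy of the live cpuset
 * with the proposed change already applied.  A hypothetical flag update
 * (the function name is made up; CS_MEM_EXCLUSIVE is one of the cpuset
 * flag bits) would therefore look roughly like:
 */
static int example_make_mem_exclusive(struct cpuset *cs)
{
	struct cpuset trialcs;
	int err;

	trialcs = *cs;				/* bulk structure copy */
	set_bit(CS_MEM_EXCLUSIVE, &trialcs.flags);	/* propose change on the copy */

	err = validate_change(cs, &trialcs);	/* manage_mutex held by caller */
	if (err < 0)
		return err;

	mutex_lock(&callback_mutex);
	set_bit(CS_MEM_EXCLUSIVE, &cs->flags);	/* commit only if still valid */
	mutex_unlock(&callback_mutex);
	return 0;
}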
/*
 * For a given cpuset cur, partition the system as follows
 * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
 *    exclusive child cpusets
 * b. All cpus in the current cpuset's cpus_allowed that are not part of any
 *    exclusive child cpusets
 * Build these two partitions by calling partition_sched_domains
 *
 * Call with manage_mutex held.  May nest a call to the
 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
 * Must not be called holding callback_mutex, because we must
 * not call lock_cpu_hotplug() while holding callback_mutex.
 */
static void update_cpu_domains(struct cpuset *cur)
{
	struct cpuset *c, *par = cur->parent;
	cpumask_t pspan, cspan;

	if (par == NULL || cpus_empty(cur->cpus_allowed))
		return;

	/*
	 * Get all cpus from parent's cpus_allowed not part of exclusive
	 * children
	 */
	pspan = par->cpus_allowed;
	list_for_each_entry(c, &par->children, sibling) {
		if (is_cpu_exclusive(c))
			cpus_andnot(pspan, pspan, c->cpus_allowed);
	}
	if (!is_cpu_exclusive(cur)) {
		cpus_or(pspan, pspan, cur->cpus_allowed);
		if (cpus_equal(pspan, cur->cpus_allowed))
			return;
		cspan = CPU_MASK_NONE;
	} else {
		if (cpus_empty(pspan))
			return;
		cspan = cur->cpus_allowed;
		/*
		 * Get all cpus from current cpuset's cpus_allowed not part
		 * of exclusive children
		 */
		list_for_each_entry(c, &cur->children, sibling) {
			if (is_cpu_exclusive(c))
				cpus_andnot(cspan, cspan, c->cpus_allowed);
		}
	}

	lock_cpu_hotplug();
	partition_sched_domains(&pspan, &cspan);
	unlock_cpu_hotplug();
}

/*
 * Call with manage_mutex held.  May take callback_mutex during call.
 */
static int update_cpumask(struct cpuset *cs, char *buf)
{
	struct cpuset trialcs;
	int retval, cpus_unchanged;

	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	trialcs = *cs;

	/*
	 * We allow a cpuset's cpus_allowed to be empty; if it has attached
	 * tasks, we'll catch it later when we validate the change and return
	 * -ENOSPC.
	 */
	if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
		cpus_clear(trialcs.cpus_allowed);
	} else {
		retval = cpulist_parse(buf, trialcs.cpus_allowed);
		if (retval < 0)
			return retval;
	}
	cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
	/* cpus_allowed cannot be empty for a cpuset with attached tasks. */
	if (atomic_read(&cs->count) && cpus_empty(trialcs.cpus_allowed))
		return -ENOSPC;
	retval = validate_change(cs, &trialcs);
	if (retval < 0)
		return retval;
	cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
	mutex_lock(&callback_mutex);
	cs->cpus_allowed = trialcs.cpus_allowed;
	mutex_unlock(&callback_mutex);
	if (is_cpu_exclusive(cs) && !cpus_unchanged)
		update_cpu_domains(cs);
	return 0;
}
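/*
 * [Editor's note -- illustrative sketch, not part of cpuset.c.]
 * update_cpumask() parses buf with cpulist_parse(), so it accepts a
 * comma-separated list of CPU ranges.  A hypothetical direct caller
 * (the function name is made up; manage_mutex must be held, per the
 * comment above) might look like:
 */
static int example_set_cpus(struct cpuset *cs)
{
	char buf[] = "0-3,6";		/* cpulist format: ranges and singles */

	return update_cpumask(cs, buf);	/* manage_mutex held by caller */
}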
/*
 * cpuset_migrate_mm
 *
 * Migrate memory region from one set of nodes to another.
 *
 * Temporarily set task's mems_allowed to target nodes of migration,
 * so that the migration code can allocate pages on these nodes.
 *
 * Call holding manage_mutex, so our current->cpuset won't change
 * during this call, as manage_mutex holds off any attach_task()
 * calls.  Therefore we don't need to take task_lock around the
 * call to guarantee_online_mems(), as we know no one is changing
 * our task's cpuset.
 *
 * Hold callback_mutex around the two modifications of our task's
 * mems_allowed to synchronize with cpuset_mems_allowed().
 *
 * While the mm_struct we are migrating is typically from some
 * other task, the task_struct mems_allowed that we are hacking
 * is for our current task, which must allocate new pages for that
 * migrating memory region.
 *
 * We call cpuset_update_task_memory_state() before hacking
 * our task's mems_allowed, so that we are assured of being in
 * sync with our task's cpuset, and in particular, callbacks to
 * cpuset_update_task_memory_state() from nested page allocations
 * won't see any mismatch of our cpuset and task mems_generation
 * values, so won't overwrite our hacked task's mems_allowed
 * nodemask.
 */
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
							const nodemask_t *to)
{
	struct task_struct *tsk = current;

	cpuset_update_task_memory_state();

	mutex_lock(&callback_mutex);
	tsk->mems_allowed = *to;
	mutex_unlock(&callback_mutex);
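	/*
	 * [Editor's note]  The pasted listing breaks off at this point,
	 * mid-function.  In the upstream kernel of this vintage the function
	 * continues roughly as below -- treat this as a reconstruction guided
	 * by the comment above (migrate, then restore mems_allowed under
	 * callback_mutex), not as text recovered from the pasted file:
	 */
	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	mutex_lock(&callback_mutex);
	guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
	mutex_unlock(&callback_mutex);
}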