📄 cpuset.c
/*
 * for the common functions, 'private' gives the type of file
 */

static struct cftype cft_tasks = {
        .name = "tasks",
        .open = cpuset_tasks_open,
        .read = cpuset_tasks_read,
        .release = cpuset_tasks_release,
        .private = FILE_TASKLIST,
};

static struct cftype cft_cpus = {
        .name = "cpus",
        .private = FILE_CPULIST,
};

static struct cftype cft_mems = {
        .name = "mems",
        .private = FILE_MEMLIST,
};

static struct cftype cft_cpu_exclusive = {
        .name = "cpu_exclusive",
        .private = FILE_CPU_EXCLUSIVE,
};

static struct cftype cft_mem_exclusive = {
        .name = "mem_exclusive",
        .private = FILE_MEM_EXCLUSIVE,
};

static struct cftype cft_notify_on_release = {
        .name = "notify_on_release",
        .private = FILE_NOTIFY_ON_RELEASE,
};

static struct cftype cft_memory_migrate = {
        .name = "memory_migrate",
        .private = FILE_MEMORY_MIGRATE,
};

static struct cftype cft_memory_pressure_enabled = {
        .name = "memory_pressure_enabled",
        .private = FILE_MEMORY_PRESSURE_ENABLED,
};

static struct cftype cft_memory_pressure = {
        .name = "memory_pressure",
        .private = FILE_MEMORY_PRESSURE,
};

static struct cftype cft_spread_page = {
        .name = "memory_spread_page",
        .private = FILE_SPREAD_PAGE,
};

static struct cftype cft_spread_slab = {
        .name = "memory_spread_slab",
        .private = FILE_SPREAD_SLAB,
};

static int cpuset_populate_dir(struct dentry *cs_dentry)
{
        int err;

        if ((err = cpuset_add_file(cs_dentry, &cft_cpus)) < 0)
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_mems)) < 0)
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
                return err;
        return 0;
}

/*
 * cpuset_create - create a cpuset
 * parent: cpuset that will be parent of the new cpuset.
 * name: name of the new cpuset. Will be strcpy'ed.
 * mode: mode to set on new inode
 *
 * Must be called with the mutex on the parent inode held
 */

static long cpuset_create(struct cpuset *parent, const char *name, int mode)
{
        struct cpuset *cs;
        int err;

        cs = kmalloc(sizeof(*cs), GFP_KERNEL);
        if (!cs)
                return -ENOMEM;

        mutex_lock(&manage_mutex);
        cpuset_update_task_memory_state();
        cs->flags = 0;
        if (notify_on_release(parent))
                set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
        if (is_spread_page(parent))
                set_bit(CS_SPREAD_PAGE, &cs->flags);
        if (is_spread_slab(parent))
                set_bit(CS_SPREAD_SLAB, &cs->flags);
        cs->cpus_allowed = CPU_MASK_NONE;
        cs->mems_allowed = NODE_MASK_NONE;
        atomic_set(&cs->count, 0);
        INIT_LIST_HEAD(&cs->sibling);
        INIT_LIST_HEAD(&cs->children);
        cs->mems_generation = cpuset_mems_generation++;
        fmeter_init(&cs->fmeter);

        cs->parent = parent;

        mutex_lock(&callback_mutex);
        list_add(&cs->sibling, &cs->parent->children);
        number_of_cpusets++;
        mutex_unlock(&callback_mutex);

        err = cpuset_create_dir(cs, name, mode);
        if (err < 0)
                goto err;

        /*
         * Release manage_mutex before cpuset_populate_dir() because it
         * will down() this new directory's i_mutex and if we race with
         * another mkdir, we might deadlock.
         */
        mutex_unlock(&manage_mutex);

        err = cpuset_populate_dir(cs->dentry);
        /* If err < 0, we have a half-filled directory - oh well ;) */
        return 0;
err:
        list_del(&cs->sibling);
        mutex_unlock(&manage_mutex);
        kfree(cs);
        return err;
}

static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
        struct cpuset *c_parent = dentry->d_parent->d_fsdata;

        /* the vfs holds inode->i_mutex already */
        return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
}

/*
 * Locking note on the strange update_flag() call below:
 *
 * If the cpuset being removed is marked cpu_exclusive, then simulate
 * turning cpu_exclusive off, which will call update_cpu_domains().
 * The lock_cpu_hotplug() call in update_cpu_domains() must not be
 * made while holding callback_mutex.  Elsewhere the kernel nests
 * callback_mutex inside lock_cpu_hotplug() calls.  So the reverse
 * nesting would risk an ABBA deadlock.
 */

static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
        struct cpuset *cs = dentry->d_fsdata;
        struct dentry *d;
        struct cpuset *parent;
        char *pathbuf = NULL;

        /* the vfs holds both inode->i_mutex already */

        mutex_lock(&manage_mutex);
        cpuset_update_task_memory_state();
        if (atomic_read(&cs->count) > 0) {
                mutex_unlock(&manage_mutex);
                return -EBUSY;
        }
        if (!list_empty(&cs->children)) {
                mutex_unlock(&manage_mutex);
                return -EBUSY;
        }
        if (is_cpu_exclusive(cs)) {
                int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
                if (retval < 0) {
                        mutex_unlock(&manage_mutex);
                        return retval;
                }
        }
        parent = cs->parent;
        mutex_lock(&callback_mutex);
        set_bit(CS_REMOVED, &cs->flags);
        list_del(&cs->sibling); /* delete my sibling from parent->children */
        spin_lock(&cs->dentry->d_lock);
        d = dget(cs->dentry);
        cs->dentry = NULL;
        spin_unlock(&d->d_lock);
        cpuset_d_remove_dir(d);
        dput(d);
        number_of_cpusets--;
        mutex_unlock(&callback_mutex);
        if (list_empty(&parent->children))
                check_for_release(parent, &pathbuf);
        mutex_unlock(&manage_mutex);
        cpuset_release_agent(pathbuf);
        return 0;
}

/*
 * cpuset_init_early - just enough so that the calls to
 * cpuset_update_task_memory_state() in early init code
 * are harmless.
 */

int __init cpuset_init_early(void)
{
        struct task_struct *tsk = current;

        tsk->cpuset = &top_cpuset;
        tsk->cpuset->mems_generation = cpuset_mems_generation++;
        return 0;
}

/**
 * cpuset_init - initialize cpusets at system boot
 *
 * Description: Initialize top_cpuset and the cpuset internal file system.
 **/

int __init cpuset_init(void)
{
        struct dentry *root;
        int err;

        top_cpuset.cpus_allowed = CPU_MASK_ALL;
        top_cpuset.mems_allowed = NODE_MASK_ALL;

        fmeter_init(&top_cpuset.fmeter);
        top_cpuset.mems_generation = cpuset_mems_generation++;

        init_task.cpuset = &top_cpuset;

        err = register_filesystem(&cpuset_fs_type);
        if (err < 0)
                goto out;
        cpuset_mount = kern_mount(&cpuset_fs_type);
        if (IS_ERR(cpuset_mount)) {
                printk(KERN_ERR "cpuset: could not mount!\n");
                err = PTR_ERR(cpuset_mount);
                cpuset_mount = NULL;
                goto out;
        }
        root = cpuset_mount->mnt_sb->s_root;
        root->d_fsdata = &top_cpuset;
        inc_nlink(root->d_inode);
        top_cpuset.dentry = root;
        root->d_inode->i_op = &cpuset_dir_inode_operations;
        number_of_cpusets = 1;
        err = cpuset_populate_dir(root);
        /* memory_pressure_enabled is in root cpuset only */
        if (err == 0)
                err = cpuset_add_file(root, &cft_memory_pressure_enabled);
out:
        return err;
}

/*
 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.
 * If this removes the last CPU or node from a cpuset, then the
 * guarantee_online_cpus() or guarantee_online_mems() code will use
 * that emptied cpuset's parent's online CPUs or nodes.  Cpusets that
 * were already empty of CPUs or nodes are left empty.
 *
 * This routine is intentionally inefficient in a couple of regards.
 * It will check all cpusets in a subtree even if the top cpuset of
 * the subtree has no offline CPUs or nodes.  It checks both CPUs and
 * nodes, even though the caller could have been coded to know that
 * only one of CPUs or nodes needed to be checked on a given call.
 * This was done to minimize text size rather than cpu cycles.
 *
 * Call with both manage_mutex and callback_mutex held.
 *
 * Recursive, on depth of cpuset subtree.
 */

static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
{
        struct cpuset *c;

        /* Each of our child cpusets' mems must be online */
        list_for_each_entry(c, &cur->children, sibling) {
                guarantee_online_cpus_mems_in_subtree(c);
                if (!cpus_empty(c->cpus_allowed))
                        guarantee_online_cpus(c, &c->cpus_allowed);
                if (!nodes_empty(c->mems_allowed))
                        guarantee_online_mems(c, &c->mems_allowed);
        }
}

/*
 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
 * cpu_online_map and node_online_map.  Force the top cpuset to track
 * what's online after any CPU or memory node hotplug or unplug event.
 *
 * To ensure that we don't remove a CPU or node from the top cpuset
 * that is currently in use by a child cpuset (which would violate
 * the rule that cpusets must be subsets of their parent), we first
 * call the recursive routine guarantee_online_cpus_mems_in_subtree().
 *
 * Since there are two callers of this routine, one for CPU hotplug
 * events and one for memory node hotplug events, we could have coded
 * two separate routines here.  We code it as a single common routine
 * in order to minimize text size.
 */

static void common_cpu_mem_hotplug_unplug(void)
{
        mutex_lock(&manage_mutex);
        mutex_lock(&callback_mutex);

        guarantee_online_cpus_mems_in_subtree(&top_cpuset);
        top_cpuset.cpus_allowed = cpu_online_map;
        top_cpuset.mems_allowed = node_online_map;

        mutex_unlock(&callback_mutex);
        mutex_unlock(&manage_mutex);
}

/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no effect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_online_map on each CPU hotplug (cpuhp) event.
 */

static int cpuset_handle_cpuhp(struct notifier_block *nb,
                                unsigned long phase, void *cpu)
{
        common_cpu_mem_hotplug_unplug();
        return 0;
}

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Keep top_cpuset.mems_allowed tracking node_online_map.
 * Call this routine anytime after you change node_online_map.
 * See also the previous routine cpuset_handle_cpuhp().
 */

void cpuset_track_online_nodes(void)
{
        common_cpu_mem_hotplug_unplug();
}
#endif

/**
 * cpuset_init_smp - initialize cpus_allowed
 *
 * Description: Finish top cpuset after cpu, node maps are initialized
 **/

void __init cpuset_init_smp(void)
{
        top_cpuset.cpus_allowed = cpu_online_map;
        top_cpuset.mems_allowed = node_online_map;

        hotcpu_notifier(cpuset_handle_cpuhp, 0);
}

/**
 * cpuset_fork - attach newly forked task to its parent's cpuset.
 * @child: pointer to task_struct of the newly forked child process.
 *
 * Description: A task inherits its parent's cpuset at fork().
 *
 * A pointer to the shared cpuset was automatically copied in fork.c
 * by dup_task_struct().
 * However, we ignore that copy, since it was not made under the
 * protection of task_lock(), so might no longer be a valid cpuset
 * pointer.  attach_task() might have already changed current->cpuset,
 * allowing the previously referenced cpuset to be removed and freed.
 * Instead, we task_lock(current) and copy its present value of
 * current->cpuset for our freshly forked child.
 *
 * At the point that cpuset_fork() is called, 'current' is the parent
 * task, and the passed argument 'child' points to the child task.
 **/

void cpuset_fork(struct task_struct *child)
{
        task_lock(current);
        child->cpuset = current->cpuset;
        atomic_inc(&child->cpuset->count);
        task_unlock(current);
}

/**
 * cpuset_exit - detach cpuset from exiting task
 * @tsk: pointer to task_struct of exiting process
 *
 * Description: Detach cpuset from @tsk and release it.
 *
 * Note that cpusets marked notify_on_release force every task in
 * them to take the global manage_mutex mutex when exiting.
 * This could impact scaling on very large systems.  Be reluctant to
 * use notify_on_release cpusets where very high task exit scaling
 * is required on large systems.
 *
 * Don't even think about dereferencing 'cs' after the cpuset use count
 * goes to zero, except inside a critical section guarded by manage_mutex
 * or callback_mutex.  Otherwise a zero cpuset use count is a license to
 * any other task to nuke the cpuset immediately, via cpuset_rmdir().
 *
 * This routine has to take manage_mutex, not callback_mutex, because
 * it is holding that mutex while calling check_for_release(),
 * which calls kmalloc(), so can't be called holding callback_mutex.
 *
 * the_top_cpuset_hack:
 *
 * Set the exiting task's cpuset to the root cpuset (top_cpuset).
 *
 * Don't leave a task unable to allocate memory, as that is an
 * accident waiting to happen should someone add a callout in
 * do_exit() after the cpuset_exit() call that might allocate.
 * If a task tries to allocate memory with an invalid cpuset,
 * it will oops in cpuset_update_task_memory_state().
 *
 * We call cpuset_exit() while the task is still competent to
 * handle notify_on_release(), then leave the task attached to
 * the root cpuset (top_cpuset) for the remainder of its exit.
 *
 * To do this properly, we would increment the reference count on
 * top_cpuset, and near the very end of the kernel/exit.c do_exit()
 * code we would add a second cpuse
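The control files named in the cft_* table above ("cpus", "mems", "tasks", and so on) show up as ordinary files inside each cpuset directory, and cpuset_mkdir()/cpuset_rmdir() back the mkdir/rmdir calls on that filesystem. As an illustrative sketch only, not part of cpuset.c, the userspace program below shows how those entry points might be driven; the /dev/cpuset mount point, the cpuset name "example", and the CPU/node values are assumptions made for the example, and error handling is minimal.

/*
 * Illustrative sketch only -- not part of cpuset.c.  Assumes the cpuset
 * filesystem is already mounted at /dev/cpuset (a conventional but
 * arbitrary mount point) and that CPUs 0-1 and memory node 0 exist.
 */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);
        ssize_t n;

        if (fd < 0)
                return -1;
        n = write(fd, val, strlen(val));
        close(fd);
        return n < 0 ? -1 : 0;
}

int main(void)
{
        char pid[32];

        /* mkdir() reaches cpuset_mkdir() -> cpuset_create() above ... */
        if (mkdir("/dev/cpuset/example", 0755) && errno != EEXIST)
                return 1;

        /* ... and cpuset_populate_dir() created these control files. */
        write_str("/dev/cpuset/example/cpus", "0-1");
        write_str("/dev/cpuset/example/mems", "0");

        /* Attaching a task goes through the "tasks" file (cft_tasks). */
        snprintf(pid, sizeof(pid), "%d", getpid());
        write_str("/dev/cpuset/example/tasks", pid);

        /*
         * rmdir("/dev/cpuset/example") would reach cpuset_rmdir(), but
         * only succeeds once no tasks and no child cpusets remain in it.
         */
        return 0;
}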