cpuset.c
	do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);

	mutex_lock(&callback_mutex);
	guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
	mutex_unlock(&callback_mutex);
}

/*
 * Handle user request to change the 'mems' memory placement
 * of a cpuset.  Needs to validate the request, update the
 * cpuset's mems_allowed and mems_generation, and for each
 * task in the cpuset, rebind any vma mempolicies and if
 * the cpuset is marked 'memory_migrate', migrate the task's
 * pages to the new memory.
 *
 * Call with manage_mutex held.  May take callback_mutex during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such task's mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpuset's new mems_allowed.
 */

static int update_nodemask(struct cpuset *cs, char *buf)
{
	struct cpuset trialcs;
	nodemask_t oldmem;
	struct task_struct *g, *p;
	struct mm_struct **mmarray;
	int i, n, ntasks;
	int migrate;
	int fudge;
	int retval;

	/* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
	if (cs == &top_cpuset)
		return -EACCES;

	trialcs = *cs;

	/*
	 * We allow a cpuset's mems_allowed to be empty; if it has attached
	 * tasks, we'll catch it later when we validate the change and return
	 * -ENOSPC.
	 */
	if (!buf[0] || (buf[0] == '\n' && !buf[1])) {
		nodes_clear(trialcs.mems_allowed);
	} else {
		retval = nodelist_parse(buf, trialcs.mems_allowed);
		if (retval < 0)
			goto done;
	}
	nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
	oldmem = cs->mems_allowed;
	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
		retval = 0;		/* Too easy - nothing to do */
		goto done;
	}
	/* mems_allowed cannot be empty for a cpuset with attached tasks. */
	if (atomic_read(&cs->count) && nodes_empty(trialcs.mems_allowed)) {
		retval = -ENOSPC;
		goto done;
	}
	retval = validate_change(cs, &trialcs);
	if (retval < 0)
		goto done;

	mutex_lock(&callback_mutex);
	cs->mems_allowed = trialcs.mems_allowed;
	cs->mems_generation = cpuset_mems_generation++;
	mutex_unlock(&callback_mutex);

	set_cpuset_being_rebound(cs);		/* causes mpol_copy() rebind */

	fudge = 10;				/* spare mmarray[] slots */
	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
	retval = -ENOMEM;

	/*
	 * Allocate mmarray[] to hold mm reference for each task
	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
	 * few more lines of code, we can retry until we get a big
	 * enough mmarray[] w/o using GFP_ATOMIC.
	 */
	while (1) {
		ntasks = atomic_read(&cs->count);	/* guess */
		ntasks += fudge;
		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
		if (!mmarray)
			goto done;
		write_lock_irq(&tasklist_lock);		/* block fork */
		if (atomic_read(&cs->count) <= ntasks)
			break;				/* got enough */
		write_unlock_irq(&tasklist_lock);	/* try again */
		kfree(mmarray);
	}

	n = 0;

	/* Load up mmarray[] with mm reference for each task in cpuset. */
	do_each_thread(g, p) {
		struct mm_struct *mm;

		if (n >= ntasks) {
			printk(KERN_WARNING
				"Cpuset mempolicy rebind incomplete.\n");
			continue;
		}
		if (p->cpuset != cs)
			continue;
		mm = get_task_mm(p);
		if (!mm)
			continue;
		mmarray[n++] = mm;
	} while_each_thread(g, p);
	write_unlock_irq(&tasklist_lock);

	/*
	 * Now that we've dropped the tasklist spinlock, we can
	 * rebind the vma mempolicies of each mm in mmarray[] to their
	 * new cpuset, and release that mm.  The mpol_rebind_mm()
	 * call takes mmap_sem, which we couldn't take while holding
	 * tasklist_lock.  Forks can happen again now - the mpol_copy()
	 * cpuset_being_rebound check will catch such forks, and rebind
	 * their vma mempolicies too.  Because we still hold the global
	 * cpuset manage_mutex, we know that no other rebind effort will
	 * be contending for the global variable cpuset_being_rebound.
	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
	 * is idempotent.  Also migrate pages in each mm to new nodes.
	 */
	migrate = is_memory_migrate(cs);
	for (i = 0; i < n; i++) {
		struct mm_struct *mm = mmarray[i];

		mpol_rebind_mm(mm, &cs->mems_allowed);
		if (migrate)
			cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
		mmput(mm);
	}

	/* We're done rebinding vma's to this cpuset's new mems_allowed. */
	kfree(mmarray);
	set_cpuset_being_rebound(NULL);
	retval = 0;
done:
	return retval;
}
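/*
 * Illustrative sketch (not part of the kernel source): the
 * allocate-then-recheck-under-lock retry loop in update_nodemask()
 * above, reduced to a self-contained userspace skeleton.  All names
 * here (item_count, lock_items, alloc_item_array) are hypothetical;
 * the point is the pattern: allocate while blocking allocations are
 * still permitted, then take the lock and verify the guessed size
 * still covers the population, retrying if it grew in the meantime.
 * Guarded out with #if 0 so it is never compiled.
 */
#if 0
#include <stdlib.h>

static int item_count;			/* may grow until locked */
static void lock_items(void)   { /* e.g. pthread_mutex_lock() */ }
static void unlock_items(void) { /* e.g. pthread_mutex_unlock() */ }

static void **alloc_item_array(int *nslots)
{
	void **arr;
	int n;

	for (;;) {
		n = item_count + 10;		/* guess, plus spare slots */
		arr = malloc(n * sizeof(*arr));	/* may block; lock not held */
		if (!arr)
			return NULL;
		lock_items();			/* freeze item_count */
		if (item_count <= n)
			break;			/* guess was big enough */
		unlock_items();			/* population grew: retry */
		free(arr);
	}
	*nslots = n;
	return arr;				/* caller fills, then unlocks */
}
#endif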
/*
 * Call with manage_mutex held.
 */

static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
{
	if (simple_strtoul(buf, NULL, 10) != 0)
		cpuset_memory_pressure_enabled = 1;
	else
		cpuset_memory_pressure_enabled = 0;
	return 0;
}

/*
 * update_flag - read a 0 or a 1 in a file and update associated flag
 * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
 *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
 *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
 * cs:	the cpuset to update
 * buf:	the buffer where we read the 0 or 1
 *
 * Call with manage_mutex held.
 */

static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
{
	int turning_on;
	struct cpuset trialcs;
	int err, cpu_exclusive_changed;

	turning_on = (simple_strtoul(buf, NULL, 10) != 0);

	trialcs = *cs;
	if (turning_on)
		set_bit(bit, &trialcs.flags);
	else
		clear_bit(bit, &trialcs.flags);

	err = validate_change(cs, &trialcs);
	if (err < 0)
		return err;
	cpu_exclusive_changed =
		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
	mutex_lock(&callback_mutex);
	cs->flags = trialcs.flags;
	mutex_unlock(&callback_mutex);

	if (cpu_exclusive_changed)
		update_cpu_domains(cs);

	return 0;
}
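/*
 * Illustrative sketch (not part of the kernel source): the trial-copy
 * update pattern shared by update_flag() and update_nodemask() above.
 * The change is staged in a stack copy, validated, and only then
 * committed under the lock, so a failed validation never perturbs the
 * live structure and readers never observe a half-applied change.
 * All names here (struct conf, validate_conf, conf_lock) are
 * hypothetical.  Guarded out with #if 0.
 */
#if 0
struct conf { unsigned long flags; };

static struct conf live_conf;

static int validate_conf(struct conf *c) { (void)c; return 0; /* stub */ }
static void conf_lock(void)   { /* e.g. mutex_lock() */ }
static void conf_unlock(void) { /* e.g. mutex_unlock() */ }

static int update_conf_flags(unsigned long new_flags)
{
	struct conf trial = live_conf;	/* stage the change in a copy */
	int err;

	trial.flags = new_flags;
	err = validate_conf(&trial);	/* reject invalid combinations */
	if (err < 0)
		return err;		/* live_conf untouched on failure */

	conf_lock();
	live_conf = trial;		/* commit, serialized w.r.t. readers */
	conf_unlock();
	return 0;
}
#endif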
/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from the fmeter_getrate()
 * will be cut in half each 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec, and one per second it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
#define FM_SCALE 1000		/* faux fixed point scale */

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
	fmp->cnt = 0;
	fmp->val = 0;
	fmp->time = 0;
	spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
	time_t now = get_seconds();
	time_t ticks = now - fmp->time;

	if (ticks == 0)
		return;

	ticks = min(FM_MAXTICKS, ticks);
	while (ticks-- > 0)
		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
	fmp->time = now;

	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
	fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
	spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
	int val;

	spin_lock(&fmp->lock);
	fmeter_update(fmp);
	val = fmp->val;
	spin_unlock(&fmp->lock);
	return val;
}
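/*
 * Illustrative sketch (not part of the kernel source): a userspace
 * simulation of the idle-tick decay step in fmeter_update() above.
 * With FM_COEF == 933 the level is multiplied by 933/1000 once per
 * second, and 0.933^10 ~= 0.50 -- the "half-life of 10 seconds"
 * described in the comment block.  Starting from the steady state
 * for one event per second (val == 1000), this prints roughly 495,
 * 244 and 117 at t = 10, 20 and 30 seconds.  Guarded out with #if 0.
 */
#if 0
#include <stdio.h>

int main(void)
{
	int val = 1000;			/* steady state: 1 event/sec */
	int t;

	for (t = 1; t <= 30; t++) {
		val = (933 * val) / 1000;	/* one idle 1-second tick */
		if (t % 10 == 0)		/* value halves each 10 ticks */
			printf("t=%2ds val=%d\n", t, val);
	}
	return 0;
}
#endif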
/*
 * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly
 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
 * notified on release.
 *
 * Call holding manage_mutex.  May take callback_mutex and task_lock of
 * the task 'pid' during call.
 */

static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
{
	pid_t pid;
	struct task_struct *tsk;
	struct cpuset *oldcs;
	cpumask_t cpus;
	nodemask_t from, to;
	struct mm_struct *mm;
	int retval;

	if (sscanf(pidbuf, "%d", &pid) != 1)
		return -EIO;
	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
		return -ENOSPC;

	if (pid) {
		read_lock(&tasklist_lock);

		tsk = find_task_by_pid(pid);
		if (!tsk || tsk->flags & PF_EXITING) {
			read_unlock(&tasklist_lock);
			return -ESRCH;
		}

		get_task_struct(tsk);
		read_unlock(&tasklist_lock);

		if ((current->euid) && (current->euid != tsk->uid)
		    && (current->euid != tsk->suid)) {
			put_task_struct(tsk);
			return -EACCES;
		}
	} else {
		tsk = current;
		get_task_struct(tsk);
	}

	retval = security_task_setscheduler(tsk, 0, NULL);
	if (retval) {
		put_task_struct(tsk);
		return retval;
	}

	mutex_lock(&callback_mutex);

	task_lock(tsk);
	oldcs = tsk->cpuset;
	/*
	 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
	 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
	 * then fail this attach_task(), to avoid breaking top_cpuset.count.
	 */
	if (tsk->flags & PF_EXITING) {
		task_unlock(tsk);
		mutex_unlock(&callback_mutex);
		put_task_struct(tsk);
		return -ESRCH;
	}
	atomic_inc(&cs->count);
	rcu_assign_pointer(tsk->cpuset, cs);
	task_unlock(tsk);

	guarantee_online_cpus(cs, &cpus);
	set_cpus_allowed(tsk, cpus);

	from = oldcs->mems_allowed;
	to = cs->mems_allowed;

	mutex_unlock(&callback_mutex);

	mm = get_task_mm(tsk);
	if (mm) {
		mpol_rebind_mm(mm, &to);
		if (is_memory_migrate(cs))
			cpuset_migrate_mm(mm, &from, &to);
		mmput(mm);
	}

	put_task_struct(tsk);
	synchronize_rcu();
	if (atomic_dec_and_test(&oldcs->count))
		check_for_release(oldcs, ppathbuf);
	return 0;
}

/* The various types of files and directories in a cpuset file system */

typedef enum {
	FILE_ROOT,
	FILE_DIR,
	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
	FILE_MEM_EXCLUSIVE,
	FILE_NOTIFY_ON_RELEASE,
	FILE_MEMORY_PRESSURE_ENABLED,
	FILE_MEMORY_PRESSURE,
	FILE_SPREAD_PAGE,
	FILE_SPREAD_SLAB,
	FILE_TASKLIST,
} cpuset_filetype_t;

static ssize_t cpuset_common_file_write(struct file *file,
					const char __user *userbuf,
					size_t nbytes, loff_t *unused_ppos)
{
	struct cpuset *cs = __d_cs(file->f_path.dentry->d_parent);
	struct cftype *cft = __d_cft(file->f_path.dentry);
	cpuset_filetype_t type = cft->private;
	char *buffer;
	char *pathbuf = NULL;
	int retval = 0;

	/* Crude upper limit on largest legitimate cpulist user might write. */
	if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
		return -E2BIG;

	/* +1 for nul-terminator */
	if ((buffer = kmalloc(nbytes + 1, GFP_KERNEL)) == 0)
		return -ENOMEM;

	if (copy_from_user(buffer, userbuf, nbytes)) {
		retval = -EFAULT;
		goto out1;
	}
	buffer[nbytes] = 0;	/* nul-terminate */

	mutex_lock(&manage_mutex);

	if (is_removed(cs)) {
		retval = -ENODEV;
		goto out2;
	}

	switch (type) {
	case FILE_CPULIST:
		retval = update_cpumask(cs, buffer);
		break;
	case FILE_MEMLIST:
		retval = update_nodemask(cs, buffer);