📄 cgroup.c
		goto again;
	}
	if (heap == &tmp_heap)
		heap_free(&tmp_heap);
	return 0;
}

/*
 * Stuff for reading the 'tasks' file.
 *
 * Reading this file can return large amounts of data if a cgroup has
 * *lots* of attached tasks. So it may need several calls to read(),
 * but we cannot guarantee that the information we produce is correct
 * unless we produce it entirely atomically.
 *
 * Upon tasks file open(), a struct ctr_struct is allocated, that
 * will have a pointer to an array (also allocated here). The struct
 * ctr_struct * is stored in file->private_data. Its resources will
 * be freed by release() when the file is closed. The array is used
 * to sprintf the PIDs and then used by read().
 */
struct ctr_struct {
	char *buf;
	int bufsz;
};

/*
 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
 * 'cgrp'. Return actual number of pids loaded. No need to
 * task_lock(p) when reading out p->cgroup, since we're in an RCU
 * read section, so the css_set can't go away, and is
 * immutable after creation.
 */
static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
{
	int n = 0;
	struct cgroup_iter it;
	struct task_struct *tsk;

	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		if (unlikely(n == npids))
			break;
		pidarray[n++] = task_pid_vnr(tsk);
	}
	cgroup_iter_end(cgrp, &it);
	return n;
}

/**
 * cgroupstats_build - build and fill cgroupstats
 * @stats: cgroupstats to fill information into
 * @dentry: A dentry entry belonging to the cgroup for which stats have
 * been requested.
 *
 * Build and fill cgroupstats so that taskstats can export it to user
 * space.
 */
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
	int ret = -EINVAL;
	struct cgroup *cgrp;
	struct cgroup_iter it;
	struct task_struct *tsk;

	/*
	 * Validate dentry by checking the superblock operations
	 */
	if (dentry->d_sb->s_op != &cgroup_ops)
		goto err;

	ret = 0;
	cgrp = dentry->d_fsdata;
	rcu_read_lock();

	cgroup_iter_start(cgrp, &it);
	while ((tsk = cgroup_iter_next(cgrp, &it))) {
		switch (tsk->state) {
		case TASK_RUNNING:
			stats->nr_running++;
			break;
		case TASK_INTERRUPTIBLE:
			stats->nr_sleeping++;
			break;
		case TASK_UNINTERRUPTIBLE:
			stats->nr_uninterruptible++;
			break;
		case TASK_STOPPED:
			stats->nr_stopped++;
			break;
		default:
			if (delayacct_is_task_waiting_on_io(tsk))
				stats->nr_io_wait++;
			break;
		}
	}
	cgroup_iter_end(cgrp, &it);
	rcu_read_unlock();

err:
	return ret;
}

static int cmppid(const void *a, const void *b)
{
	return *(pid_t *)a - *(pid_t *)b;
}

/*
 * Convert array 'a' of 'npids' pid_t's to a string of newline separated
 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return
 * count 'cnt' of how many chars would be written if buf were large enough.
 */
static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
{
	int cnt = 0;
	int i;

	for (i = 0; i < npids; i++)
		cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
	return cnt;
}

/*
 * Handle an open on 'tasks' file. Prepare a buffer listing the
 * process id's of tasks currently attached to the cgroup being opened.
 *
 * Does not require any specific cgroup mutexes, and does not take any.
 */
static int cgroup_tasks_open(struct inode *unused, struct file *file)
{
	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
	struct ctr_struct *ctr;
	pid_t *pidarray;
	int npids;
	char c;

	if (!(file->f_mode & FMODE_READ))
		return 0;

	ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
	if (!ctr)
		goto err0;

	/*
	 * If cgroup gets more users after we read count, we won't have
	 * enough space - tough. This race is indistinguishable to the
	 * caller from the case that the additional cgroup users didn't
	 * show up until sometime later on.
	 */
	npids = cgroup_task_count(cgrp);
	if (npids) {
		pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
		if (!pidarray)
			goto err1;

		npids = pid_array_load(pidarray, npids, cgrp);
		sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);

		/* Call pid_array_to_buf() twice, first just to get bufsz */
		ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
		ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
		if (!ctr->buf)
			goto err2;
		ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);

		kfree(pidarray);
	} else {
		ctr->buf = NULL;
		ctr->bufsz = 0;
	}
	file->private_data = ctr;
	return 0;

err2:
	kfree(pidarray);
err1:
	kfree(ctr);
err0:
	return -ENOMEM;
}

static ssize_t cgroup_tasks_read(struct cgroup *cgrp, struct cftype *cft,
				 struct file *file, char __user *buf,
				 size_t nbytes, loff_t *ppos)
{
	struct ctr_struct *ctr = file->private_data;

	return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
}

static int cgroup_tasks_release(struct inode *unused_inode, struct file *file)
{
	struct ctr_struct *ctr;

	if (file->f_mode & FMODE_READ) {
		ctr = file->private_data;
		kfree(ctr->buf);
		kfree(ctr);
	}
	return 0;
}

static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
					 struct cftype *cft)
{
	return notify_on_release(cgrp);
}

static int cgroup_write_notify_on_release(struct cgroup *cgrp,
					  struct cftype *cft, u64 val)
{
	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
	if (val)
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
	else
		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
	return 0;
}

/*
 * for the common functions, 'private' gives the type of file
 */
static struct cftype files[] = {
	{
		.name = "tasks",
		.open = cgroup_tasks_open,
		.read = cgroup_tasks_read,
		.write_u64 = cgroup_tasks_write,
		.release = cgroup_tasks_release,
		.private = FILE_TASKLIST,
	},
	{
		.name = "notify_on_release",
		.read_u64 = cgroup_read_notify_on_release,
		.write_u64 = cgroup_write_notify_on_release,
		.private = FILE_NOTIFY_ON_RELEASE,
	},
};

static struct cftype cft_release_agent = {
	.name = "release_agent",
	.read_seq_string = cgroup_release_agent_show,
	.write_string = cgroup_release_agent_write,
	.max_write_len = PATH_MAX,
	.private = FILE_RELEASE_AGENT,
};

static int cgroup_populate_dir(struct cgroup *cgrp)
{
	int err;
	struct cgroup_subsys *ss;

	/* First clear out any existing files */
	cgroup_clear_directory(cgrp->dentry);

	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
	if (err < 0)
		return err;

	if (cgrp == cgrp->top_cgroup) {
		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
			return err;
	}

	for_each_subsys(cgrp->root, ss) {
		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
			return err;
	}

	return 0;
}

static void init_cgroup_css(struct cgroup_subsys_state *css,
			    struct cgroup_subsys *ss,
			    struct cgroup *cgrp)
{
	css->cgroup = cgrp;
	atomic_set(&css->refcnt, 0);
	css->flags = 0;
	if (cgrp == dummytop)
		set_bit(CSS_ROOT, &css->flags);
	BUG_ON(cgrp->subsys[ss->subsys_id]);
	cgrp->subsys[ss->subsys_id] = css;
}

/*
 * cgroup_create - create a cgroup
 * @parent: cgroup that will be parent of the new cgroup
 * @dentry: dentry of the new cgroup
 * @mode: mode to set on new inode
 *
 * Must be called with the mutex on the parent inode held
 */
static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
			  int mode)
{
	struct cgroup *cgrp;
	struct cgroupfs_root *root = parent->root;
	int err = 0;
	struct cgroup_subsys *ss;
	struct super_block *sb = root->sb;

	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
	if (!cgrp)
		return -ENOMEM;

	/* Grab a reference on the superblock so the hierarchy doesn't
	 * get deleted on unmount if there are child cgroups. This
	 * can be done outside cgroup_mutex, since the sb can't
	 * disappear while someone has an open control file on the
	 * fs */
	atomic_inc(&sb->s_active);

	mutex_lock(&cgroup_mutex);

	INIT_LIST_HEAD(&cgrp->sibling);
	INIT_LIST_HEAD(&cgrp->children);
	INIT_LIST_HEAD(&cgrp->css_sets);
	INIT_LIST_HEAD(&cgrp->release_list);

	cgrp->parent = parent;
	cgrp->root = parent->root;
	cgrp->top_cgroup = parent->top_cgroup;

	if (notify_on_release(parent))
		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);

	for_each_subsys(root, ss) {
		struct cgroup_subsys_state *css = ss->create(ss, cgrp);
		if (IS_ERR(css)) {
			err = PTR_ERR(css);
			goto err_destroy;
		}
		init_cgroup_css(css, ss, cgrp);
	}

	list_add(&cgrp->sibling, &cgrp->parent->children);
	root->number_of_cgroups++;

	err = cgroup_create_dir(cgrp, dentry, mode);
	if (err < 0)
		goto err_remove;

	/* The cgroup directory was pre-locked for us */
	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));

	err = cgroup_populate_dir(cgrp);
	/* If err < 0, we have a half-filled directory - oh well ;) */

	mutex_unlock(&cgroup_mutex);
	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);

	return 0;

err_remove:
	list_del(&cgrp->sibling);
	root->number_of_cgroups--;

err_destroy:
	for_each_subsys(root, ss) {
		if (cgrp->subsys[ss->subsys_id])
			ss->destroy(ss, cgrp);
	}

	mutex_unlock(&cgroup_mutex);

	/* Release the reference count that we took on the superblock */
	deactivate_super(sb);

	kfree(cgrp);
	return err;
}

static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct cgroup *c_parent = dentry->d_parent->d_fsdata;

	/* the vfs holds inode->i_mutex already */
	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
}

static int cgroup_has_css_refs(struct cgroup *cgrp)
{
	/* Check the reference count on each subsystem. Since we
	 * already established that there are no tasks in the
	 * cgroup, if the css refcount is also 0, then there should
	 * be no outstanding references, so the subsystem is safe to
	 * destroy. We scan across all subsystems rather than using
	 * the per-hierarchy linked list of mounted subsystems since
	 * we can be called via check_for_release() with no
	 * synchronization other than RCU, and the subsystem linked
	 * list isn't RCU-safe */
	int i;

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		struct cgroup_subsys_state *css;

		/* Skip subsystems not in this hierarchy */
		if (ss->root != cgrp->root)
			continue;
		css = cgrp->subsys[ss->subsys_id];
		/* When called from check_for_release() it's possible
		 * that by this point the cgroup has been removed
		 * and the css deleted. But a false-positive doesn't
		 * matter, since it can only happen if the cgroup
		 * has been deleted and hence no longer needs the
		 * release agent to be called anyway. */
		if (css && atomic_read(&css->refcnt))
			return 1;
	}
	return 0;
}

static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
	struct cgroup *cgrp = dentry->d_fsdata;
	struct dentry *d;
	struct cgroup *parent;
	struct super_block *sb;
	struct cgroupfs_root *root;

	/* the vfs holds both inode->i_mutex already */

	mutex_lock(&cgroup_mutex);
	if (atomic_read(&cgrp->count) != 0) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}
	if (!list_empty(&cgrp->children)) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}

	parent = cgrp->parent;
	root = cgrp->root;
	sb = root->sb;

	/*
	 * Call pre_destroy handlers of subsys. Notify subsystems
	 * that rmdir() request comes.
	 */
	cgroup_call_pre_destroy(cgrp);

	if (cgroup_has_css_refs(cgrp)) {
		mutex_unlock(&cgroup_mutex);
		return -EBUSY;
	}

	spin_lock(&release_list_lock);
	set_bit(CGRP_REMOVED, &cgrp->flags);
	if (!list_empty(&cgrp->release_list))
		list_del(&cgrp->release_list);
	spin_unlock(&release_list_lock);
	/* delete my sibling from parent->children */
	list_del(&cgrp->sibling);

	spin_lock(&cgrp->dentry->d_lock);
	d = dget(cgrp->dentry);
	spin_unlock(&d->d_lock);

	cgroup_d_remove_dir(d);
	dput(d);

	set_bit(CGRP_RELEASABLE, &parent->flags);
	check_for_release(parent);

	mutex_unlock(&cgroup_mutex);
	return 0;
}

static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
{
	struct cgroup_subsys_state *css;

	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);

	/* Create the top cgroup state for this subsystem */
	ss->root = &rootnode;
	css = ss->create(ss, dummytop);
	/* We don't handle early failures gracefully */
	BUG_ON(IS_ERR(css));
	init_cgroup_css(css, ss, dummytop);

	/* Update the init_css_set to contain a subsys
	 * pointer to this state - since the subsystem is
	 * newly registered, all tasks and hence the
	 * init_css_set is in the subsystem's top cgroup. */
	init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];

	need_forkexit_callback |= ss->fork || ss->exit;
	need_mm_owne
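/*
 * Illustrative userspace sketch (not part of cgroup.c): one way the
 * 'tasks' file implemented by cgroup_tasks_open()/cgroup_tasks_read()
 * above might be consumed.  The buffer built at open() time is a
 * newline-separated list of decimal PIDs (see pid_array_to_buf()), so
 * a reader simply parses lines.  The mount point and group name below
 * are assumptions for the example; the file lives wherever the cgroup
 * filesystem happens to be mounted.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Hypothetical path; substitute your own cgroupfs mount and group. */
	const char *tasks_path = "/dev/cgroup/mygroup/tasks";
	FILE *f = fopen(tasks_path, "r");
	char line[32];

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}
	/* One decimal PID per line, as written by pid_array_to_buf(). */
	while (fgets(line, sizeof(line), f))
		printf("attached pid: %s", line);
	fclose(f);
	return EXIT_SUCCESS;
}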