📄 cgroup.c
/*
 * Generic process-grouping system.
 *
 * Based originally on the cpuset system, extracted by Paul Menage
 * Copyright (C) 2006 Google, Inc
 *
 * Copyright notices from the original cpuset code:
 * --------------------------------------------------
 * Copyright (C) 2003 BULL SA.
 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
 *
 * Portions derived from Patrick Mochel's sysfs code.
 * sysfs is Copyright (c) 2001-3 Patrick Mochel
 *
 * 2003-10-10 Written by Simon Derr.
 * 2003-10-22 Updates by Stephen Hemminger.
 * 2004 May-July Rework by Paul Jackson.
 * ---------------------------------------------------
 *
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file COPYING in the main directory of the Linux
 * distribution for more details.
 */

#include <linux/cgroup.h>
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mutex.h>
#include <linux/mount.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/backing-dev.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
#include <linux/delayacct.h>
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>

#include <asm/atomic.h>

static DEFINE_MUTEX(cgroup_mutex);

/* Generate an array of cgroup subsystem pointers */
#define SUBSYS(_x) &_x ## _subsys,

static struct cgroup_subsys *subsys[] = {
#include <linux/cgroup_subsys.h>
};

/*
 * A cgroupfs_root represents the root of a cgroup hierarchy,
 * and may be associated with a superblock to form an active
 * hierarchy
 */
struct cgroupfs_root {
        struct super_block *sb;

        /*
         * The bitmask of subsystems intended to be attached to this
         * hierarchy
         */
        unsigned long subsys_bits;

        /* The bitmask of subsystems currently attached to this hierarchy */
        unsigned long actual_subsys_bits;

        /* A list running through the attached subsystems */
        struct list_head subsys_list;

        /* The root cgroup for this hierarchy */
        struct cgroup top_cgroup;

        /* Tracks how many cgroups are currently defined in hierarchy.*/
        int number_of_cgroups;

        /* A list running through the mounted hierarchies */
        struct list_head root_list;

        /* Hierarchy-specific flags */
        unsigned long flags;

        /* The path to use for release notifications. */
        char release_agent_path[PATH_MAX];
};

/*
 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
 * subsystems that are otherwise unattached - it never has more than a
 * single cgroup, and all tasks are part of that cgroup.
 */
static struct cgroupfs_root rootnode;

/* The list of hierarchy roots */

static LIST_HEAD(roots);
static int root_count;

/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
#define dummytop (&rootnode.top_cgroup)
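/*
 * A minimal usage sketch (added for illustration, not part of the
 * original cgroup.c): testing whether a subsystem is intended for a
 * given hierarchy via the subsys_bits bitmask declared above. The
 * helper name "example_subsys_wanted" is hypothetical; the bit test
 * itself mirrors the one used later in find_existing_css_set().
 */
static inline int example_subsys_wanted(struct cgroupfs_root *root,
                                        int subsys_id)
{
        /* each compiled-in subsystem owns one bit, indexed by its id */
        return (root->subsys_bits & (1UL << subsys_id)) != 0;
}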
/* This flag indicates whether tasks in the fork and exit paths should
 * check for fork/exit handlers to call. This avoids us having to do
 * extra work in the fork/exit path if none of the subsystems need to
 * be called.
 */
static int need_forkexit_callback __read_mostly;
static int need_mm_owner_callback __read_mostly;

/* convenient tests for these bits */
inline int cgroup_is_removed(const struct cgroup *cgrp)
{
        return test_bit(CGRP_REMOVED, &cgrp->flags);
}

/* bits in struct cgroupfs_root flags field */
enum {
        ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
};

static int cgroup_is_releasable(const struct cgroup *cgrp)
{
        const int bits =
                (1 << CGRP_RELEASABLE) |
                (1 << CGRP_NOTIFY_ON_RELEASE);
        return (cgrp->flags & bits) == bits;
}

static int notify_on_release(const struct cgroup *cgrp)
{
        return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}

/*
 * for_each_subsys() allows you to iterate on each subsystem attached to
 * an active hierarchy
 */
#define for_each_subsys(_root, _ss) \
list_for_each_entry(_ss, &_root->subsys_list, sibling)

/* for_each_root() allows you to iterate across the active hierarchies */
#define for_each_root(_root) \
list_for_each_entry(_root, &roots, root_list)

/* the list of cgroups eligible for automatic release. Protected by
 * release_list_lock */
static LIST_HEAD(release_list);
static DEFINE_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);

/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
        /*
         * List running through cg_cgroup_links associated with a
         * cgroup, anchored on cgroup->css_sets
         */
        struct list_head cgrp_link_list;
        /*
         * List running through cg_cgroup_links pointing at a
         * single css_set object, anchored on css_set->cg_links
         */
        struct list_head cg_link_list;
        struct css_set *cg;
};

/* The default css_set - used by init and its children prior to any
 * hierarchies being mounted. It contains a pointer to the root state
 * for each subsystem. Also used to anchor the list of css_sets. Not
 * reference-counted, to improve performance when child cgroups
 * haven't been created.
 */

static struct css_set init_css_set;
static struct cg_cgroup_link init_css_set_link;

/* css_set_lock protects the list of css_set objects, and the
 * chain of tasks off each css_set. Nests outside task->alloc_lock
 * due to cgroup_iter_start() */
static DEFINE_RWLOCK(css_set_lock);
static int css_set_count;

/* hash table for cgroup groups. This improves the performance of
 * finding an existing css_set */
#define CSS_SET_HASH_BITS       7
#define CSS_SET_TABLE_SIZE      (1 << CSS_SET_HASH_BITS)
static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];

static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
{
        int i;
        int index;
        unsigned long tmp = 0UL;

        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
                tmp += (unsigned long)css[i];
        tmp = (tmp >> 16) ^ tmp;

        index = hash_long(tmp, CSS_SET_HASH_BITS);

        return &css_set_table[index];
}

/* We don't maintain the lists running through each css_set to its
 * task until after the first call to cgroup_iter_start(). This
 * reduces the fork()/exit() overhead for people who have cgroups
 * compiled into their kernel but not actually in use */
static int use_task_css_set_links __read_mostly;
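/*
 * A minimal usage sketch (added for illustration, not part of the
 * original cgroup.c): walking a hierarchy's attached subsystems with
 * the for_each_subsys() macro defined above. "example_count_subsys"
 * is a hypothetical helper; cgroup_mutex must be held so that
 * subsys_list cannot change underneath the walk.
 */
static int example_count_subsys(struct cgroupfs_root *root)
{
        struct cgroup_subsys *ss;
        int n = 0;

        for_each_subsys(root, ss)
                n++;
        return n;
}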
/* When we create or destroy a css_set, the operation simply
 * takes/releases a reference count on all the cgroups referenced
 * by subsystems in this css_set. This can end up multiple-counting
 * some cgroups, but that's OK - the ref-count is just a
 * busy/not-busy indicator; ensuring that we only count each cgroup
 * once would require taking a global lock to ensure that no
 * subsystems moved between hierarchies while we were doing so.
 *
 * Possible TODO: decide at boot time based on the number of
 * registered subsystems and the number of CPUs or NUMA nodes whether
 * it's better for performance to ref-count every subsystem, or to
 * take a global lock and only add one ref count to each hierarchy.
 */

/*
 * unlink a css_set from the list and free it
 */
static void unlink_css_set(struct css_set *cg)
{
        struct cg_cgroup_link *link;
        struct cg_cgroup_link *saved_link;

        write_lock(&css_set_lock);
        hlist_del(&cg->hlist);
        css_set_count--;

        list_for_each_entry_safe(link, saved_link, &cg->cg_links,
                                 cg_link_list) {
                list_del(&link->cg_link_list);
                list_del(&link->cgrp_link_list);
                kfree(link);
        }

        write_unlock(&css_set_lock);
}

static void __release_css_set(struct kref *k, int taskexit)
{
        int i;
        struct css_set *cg = container_of(k, struct css_set, ref);

        unlink_css_set(cg);

        rcu_read_lock();
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup *cgrp = cg->subsys[i]->cgroup;
                if (atomic_dec_and_test(&cgrp->count) &&
                    notify_on_release(cgrp)) {
                        if (taskexit)
                                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                        check_for_release(cgrp);
                }
        }
        rcu_read_unlock();
        kfree(cg);
}

static void release_css_set(struct kref *k)
{
        __release_css_set(k, 0);
}

static void release_css_set_taskexit(struct kref *k)
{
        __release_css_set(k, 1);
}

/*
 * refcounted get/put for css_set objects
 */
static inline void get_css_set(struct css_set *cg)
{
        kref_get(&cg->ref);
}

static inline void put_css_set(struct css_set *cg)
{
        kref_put(&cg->ref, release_css_set);
}

static inline void put_css_set_taskexit(struct css_set *cg)
{
        kref_put(&cg->ref, release_css_set_taskexit);
}

/*
 * find_existing_css_set() is a helper for
 * find_css_set(), and checks to see whether an existing
 * css_set is suitable.
 *
 * oldcg: the cgroup group that we're using before the cgroup
 * transition
 *
 * cgrp: the cgroup that we're moving into
 *
 * template: location in which to build the desired set of subsystem
 * state objects for the new cgroup group
 */
static struct css_set *find_existing_css_set(
        struct css_set *oldcg,
        struct cgroup *cgrp,
        struct cgroup_subsys_state *template[])
{
        int i;
        struct cgroupfs_root *root = cgrp->root;
        struct hlist_head *hhead;
        struct hlist_node *node;
        struct css_set *cg;

        /* Build the set of subsystem state objects that we want to
         * see in the new css_set */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                if (root->subsys_bits & (1UL << i)) {
                        /* Subsystem is in this hierarchy. So we want
                         * the subsystem state from the new
                         * cgroup */
                        template[i] = cgrp->subsys[i];
                } else {
                        /* Subsystem is not in this hierarchy, so we
                         * don't want to change the subsystem state */
                        template[i] = oldcg->subsys[i];
                }
        }

        hhead = css_set_hash(template);
        hlist_for_each_entry(cg, node, hhead, hlist) {
                if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) {
                        /* All subsystems matched */
                        return cg;
                }
        }

        /* No existing cgroup group matched */
        return NULL;
}

static void free_cg_links(struct list_head *tmp)
{
        struct cg_cgroup_link *link;
        struct cg_cgroup_link *saved_link;

        list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
                list_del(&link->cgrp_link_list);
                kfree(link);
        }
}
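/*
 * A minimal usage sketch (added for illustration, not part of the
 * original cgroup.c): the get/put pairing expected around a css_set.
 * "example_inspect_css_set" is a hypothetical helper. Dropping the
 * last reference invokes release_css_set(), which unlinks the set
 * from the hash table and releases each referenced cgroup.
 */
static void example_inspect_css_set(struct css_set *cg)
{
        get_css_set(cg);        /* pin the set while we look at it */
        /* ... cg->subsys[] may be dereferenced safely here ... */
        put_css_set(cg);        /* may free cg if this was the last ref */
}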
/*
 * allocate_cg_links() allocates "count" cg_cgroup_link structures
 * and chains them on tmp through their cgrp_link_list fields.
 * Returns 0 on success or a negative error
 */
static int allocate_cg_links(int count, struct list_head *tmp)
{
        struct cg_cgroup_link *link;
        int i;

        INIT_LIST_HEAD(tmp);
        for (i = 0; i < count; i++) {
                link = kmalloc(sizeof(*link), GFP_KERNEL);
                if (!link) {
                        free_cg_links(tmp);
                        return -ENOMEM;
                }
                list_add(&link->cgrp_link_list, tmp);
        }
        return 0;
}

/*
 * find_css_set() takes an existing cgroup group and a
 * cgroup object, and returns a css_set object that's
 * equivalent to the old group, but with the given cgroup
 * substituted into the appropriate hierarchy. Must be called with
 * cgroup_mutex held
 */
static struct css_set *find_css_set(
        struct css_set *oldcg, struct cgroup *cgrp)
{
        struct css_set *res;
        struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
        int i;

        struct list_head tmp_cg_links;
        struct cg_cgroup_link *link;

        struct hlist_head *hhead;

        /* First see if we already have a cgroup group that matches
         * the desired set */
        read_lock(&css_set_lock);
        res = find_existing_css_set(oldcg, cgrp, template);
        if (res)
                get_css_set(res);
        read_unlock(&css_set_lock);

        if (res)
                return res;

        res = kmalloc(sizeof(*res), GFP_KERNEL);
        if (!res)
                return NULL;

        /* Allocate all the cg_cgroup_link objects that we'll need */
        if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
                kfree(res);
                return NULL;
        }

        kref_init(&res->ref);
        INIT_LIST_HEAD(&res->cg_links);
        INIT_LIST_HEAD(&res->tasks);
        INIT_HLIST_NODE(&res->hlist);

        /* Copy the set of subsystem state objects generated in
         * find_existing_css_set() */
        memcpy(res->subsys, template, sizeof(res->subsys));

        write_lock(&css_set_lock);
        /* Add reference counts and links from the new css_set. */
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                struct cgroup *cgrp = res->subsys[i]->cgroup;
                struct cgroup_subsys *ss = subsys[i];
                atomic_inc(&cgrp->count);
                /*
                 * We want to add a link once per cgroup, so we
                 * only do it for the first subsystem in each
                 * hierarchy
                 */
                if (ss->root->subsys_list.next == &ss->sibling) {
                        BUG_ON(list_empty(&tmp_cg_links));
                        link = list_entry(tmp_cg_links.next,
                                          struct cg_cgroup_link,
                                          cgrp_link_list);
                        list_del(&link->cgrp_link_list);
                        list_add(&link->cgrp_link_list,
                                 &cgrp->css_sets);
                        link->cg = res;
                        list_add(&link->cg_link_list,
                                 &res->cg_links);
                }
        }
        if (list_empty(&rootnode.subsys_list)) {
                link = list_entry(tmp_cg_links.next,
                                  struct cg_cgroup_link,
                                  cgrp_link_list);
                list_del(&link->cgrp_link_list);
                list_add(&link->cgrp_link_list, &dummytop->css_sets);
                link->cg = res;
                list_add(&link->cg_link_list, &res->cg_links);
        }

        BUG_ON(!list_empty(&tmp_cg_links));

        css_set_count++;

        /* Add this cgroup group to the hash table */
        hhead = css_set_hash(res->subsys);
        hlist_add_head(&res->hlist, hhead);

        write_unlock(&css_set_lock);

        return res;
}

/*
 * There is one global cgroup mutex. We also require taking
 * task_lock() when dereferencing a task's cgroup subsys pointers.
 * See "The task_lock() exception", at the end of this comment.
 *
 * A task must hold cgroup_mutex to modify cgroups.
 *
 * Any task can increment and decrement the count field without lock.
 * So in general, code holding cgroup_mutex can't rely on the count
 * field not changing.  However, if the count goes to zero, then only
 * cgroup_attach_task() can increment it again.  Because a count of zero
 * means that no tasks are currently attached, therefore there is no
 * way a task attached to that cgroup can fork (the other way to
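/*
 * A minimal usage sketch (added for illustration, not part of the
 * original cgroup.c): the shape of a find_css_set() call site, as in
 * task attachment. "example_retarget_css_set" and its signature are
 * hypothetical; cgroup_mutex must be held, and find_css_set() returns
 * the new set with a reference already taken for the caller.
 */
static int example_retarget_css_set(struct css_set *oldcg,
                                    struct cgroup *cgrp,
                                    struct css_set **newcg)
{
        *newcg = find_css_set(oldcg, cgrp);
        if (!*newcg)
                return -ENOMEM;
        /* the caller re-points its task at *newcg, then drops the
         * reference it held on the old set */
        put_css_set(oldcg);
        return 0;
}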