memcontrol.c
/*
 * memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include "internal.h"
#include <asm/uaccess.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
#define MEM_CGROUP_RECLAIM_RETRIES	5

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
static int really_do_swap_account __initdata = 1; /* to remember the boot option */
#else
#define do_swap_account		(0)
#endif

static DEFINE_MUTEX(memcg_tasklist);	/* can be held under cgroup_mutex */

/*
 * Statistics for memory cgroup.
 */
enum mem_cgroup_stat_index {
	/*
	 * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss.
	 */
	MEM_CGROUP_STAT_CACHE,		/* # of pages charged as cache */
	MEM_CGROUP_STAT_RSS,		/* # of pages charged as rss */
	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */

	MEM_CGROUP_STAT_NSTATS,
};

struct mem_cgroup_stat_cpu {
	s64 count[MEM_CGROUP_STAT_NSTATS];
} ____cacheline_aligned_in_smp;

struct mem_cgroup_stat {
	struct mem_cgroup_stat_cpu cpustat[0];
};

/*
 * For accounting under irq disable, there is no need to increment the
 * preempt count.
 */
static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
		enum mem_cgroup_stat_index idx, int val)
{
	stat->count[idx] += val;
}

static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
		enum mem_cgroup_stat_index idx)
{
	int cpu;
	s64 ret = 0;
	for_each_possible_cpu(cpu)
		ret += stat->cpustat[cpu].count[idx];
	return ret;
}
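/*
 * Note on the statistics scheme above: each CPU owns its own
 * cacheline-aligned mem_cgroup_stat_cpu slot, so a writer only touches the
 * slot of the CPU it is currently running on (with preemption or irqs
 * disabled) and a plain, non-atomic "+=" is sufficient.  Readers such as
 * mem_cgroup_read_stat() simply sum over all possible CPUs and may observe
 * a slightly stale total, which is acceptable for statistics.
 */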
/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	/*
	 * spin_lock to protect the per cgroup LRU
	 */
	struct list_head	lists[NR_LRU_LISTS];
	unsigned long		count[NR_LRU_LISTS];

	struct zone_reclaim_stat reclaim_stat;
};
/* Macro for accessing counter */
#define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

struct mem_cgroup_lru_info {
	struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;
	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;
	/*
	 * Per cgroup active and inactive list, similar to the
	 * per zone LRU lists.
	 */
	struct mem_cgroup_lru_info info;

	/* protect against reclaim related member. */
	spinlock_t reclaim_param_lock;

	int	prev_priority;	/* for recording reclaim priority */

	/*
	 * While reclaiming in a hierarchy, we cache the last child we
	 * reclaimed from. Protected by hierarchy_mutex
	 */
	struct mem_cgroup *last_scanned_child;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long	last_oom_jiffies;
	atomic_t	refcnt;

	unsigned int	swappiness;

	/*
	 * statistics. This must be placed at the end of memcg.
	 */
	struct mem_cgroup_stat stat;
};

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_MAPPED,
	MEM_CGROUP_CHARGE_TYPE_SHMEM,	/* used by page migration of shmem */
	MEM_CGROUP_CHARGE_TYPE_FORCE,	/* used by force_empty */
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT,	/* for accounting swapcache */
	NR_CHARGE_TYPE,
};

/* only for here (for easy reading.) */
#define PCGF_CACHE	(1UL << PCG_CACHE)
#define PCGF_USED	(1UL << PCG_USED)
#define PCGF_LOCK	(1UL << PCG_LOCK)
static const unsigned long
pcg_default_flags[NR_CHARGE_TYPE] = {
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
	PCGF_USED | PCGF_LOCK, /* Anon */
	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
	0, /* FORCE */
};

/* for encoding cft->private value on file */
#define _MEM			(0)
#define _MEMSWAP		(1)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
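/*
 * Example of the encoding above: MEMFILE_PRIVATE(_MEMSWAP, attr) yields
 * (1 << 16) | attr, i.e. the counter type is kept in the upper 16 bits and
 * the res_counter attribute in the lower 16 bits, so the file handlers can
 * recover both halves with MEMFILE_TYPE() and MEMFILE_ATTR().
 */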
static void mem_cgroup_get(struct mem_cgroup *mem);
static void mem_cgroup_put(struct mem_cgroup *mem);
static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);

static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
					 struct page_cgroup *pc,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	struct mem_cgroup_stat *stat = &mem->stat;
	struct mem_cgroup_stat_cpu *cpustat;
	int cpu = get_cpu();

	cpustat = &stat->cpustat[cpu];
	if (PageCgroupCache(pc))
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
	else
		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);

	if (charge)
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
	else
		__mem_cgroup_stat_add_safe(cpustat,
				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
	put_cpu();
}

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
{
	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct page_cgroup *pc)
{
	struct mem_cgroup *mem = pc->mem_cgroup;
	int nid = page_cgroup_nid(pc);
	int zid = page_cgroup_zid(pc);

	if (!mem)
		return NULL;

	return mem_cgroup_zoneinfo(mem, nid, zid);
}

static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
					enum lru_list idx)
{
	int nid, zid;
	struct mem_cgroup_per_zone *mz;
	u64 total = 0;

	for_each_online_node(nid)
		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
			mz = mem_cgroup_zoneinfo(mem, nid, zid);
			total += MEM_CGROUP_ZSTAT(mz, idx);
		}
	return total;
}

static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
{
	return container_of(cgroup_subsys_state(cont,
				mem_cgroup_subsys_id), struct mem_cgroup,
				css);
}

struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
				struct mem_cgroup, css);
}

static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
{
	struct mem_cgroup *mem = NULL;
	/*
	 * Because we have no locks, mm->owner may be being moved to another
	 * cgroup. We use css_tryget() here even if this looks
	 * pessimistic (rather than adding locks here).
	 */
	rcu_read_lock();
	do {
		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
		if (unlikely(!mem))
			break;
	} while (!css_tryget(&mem->css));
	rcu_read_unlock();
	return mem;
}
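/*
 * Note on the loop above: while we sit under rcu_read_lock() the task that
 * owns the mm can still be moved to a different cgroup, and css_tryget()
 * fails once the old css is on its way out.  The loop therefore re-reads
 * mm->owner and retries until it either pins a live css or finds that the
 * owner has become NULL.
 */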
static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
{
	if (!mem)
		return true;
	return css_is_removed(&mem->css);
}

/*
 * Following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routine of global LRU independently from memcg.
 * What we have to take care of here is validness of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happens when
 * 1. charge
 * 2. moving account
 * In the typical case, "charge" is done before add-to-lru. Exception is
 * SwapCache. It is added to LRU before charge.
 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
 * When moving account, the page is not on LRU. It's isolated.
 */

void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup *mem;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/* can happen while we handle swapcache. */
	if (list_empty(&pc->lru) || !pc->mem_cgroup)
		return;
	/*
	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
	 * removed from global LRU.
	 */
	mz = page_cgroup_zoneinfo(pc);
	mem = pc->mem_cgroup;
	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
	list_del_init(&pc->lru);
	return;
}

void mem_cgroup_del_lru(struct page *page)
{
	mem_cgroup_del_lru_list(page, page_lru(page));
}

void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;
	struct page_cgroup *pc;

	if (mem_cgroup_disabled())
		return;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	/* unused page is not rotated. */
	if (!PageCgroupUsed(pc))
		return;
	mz = page_cgroup_zoneinfo(pc);
	list_move(&pc->lru, &mz->lists[lru]);
}

void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return;
	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return;

	mz = page_cgroup_zoneinfo(pc);
	MEM_CGROUP_ZSTAT(mz, lru) += 1;
	list_add(&pc->lru, &mz->lists[lru]);
}

/*
 * When handling SwapCache, pc->mem_cgroup may be changed while it's linked to
 * the lru because the page may be reused after it's fully uncharged (because
 * of SwapCache behavior). To handle that, unlink page_cgroup from LRU when we
 * charge it again. This function is only used to charge SwapCache. It's done
 * under lock_page and expected that zone->lru_lock is never held.
 */
static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/*
	 * Forget old LRU when this page_cgroup is *not* used. This Used bit
	 * is guarded by lock_page() because the page is SwapCache.
	 */
	if (!PageCgroupUsed(pc))
		mem_cgroup_del_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
{
	unsigned long flags;
	struct zone *zone = page_zone(page);
	struct page_cgroup *pc = lookup_page_cgroup(page);

	spin_lock_irqsave(&zone->lru_lock, flags);
	/* link when the page is linked to LRU but page_cgroup isn't */
	if (PageLRU(page) && list_empty(&pc->lru))
		mem_cgroup_add_lru_list(page, page_lru(page));
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}

void mem_cgroup_move_lists(struct page *page,
			   enum lru_list from, enum lru_list to)
{
	if (mem_cgroup_disabled())
		return;
	mem_cgroup_del_lru_list(page, from);
	mem_cgroup_add_lru_list(page, to);
}

int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
	int ret;

	task_lock(task);
	ret = task->mm && mm_match_cgroup(task->mm, mem);
	task_unlock(task);
	return ret;
}

/*
 * Calculate mapped_ratio under memory controller. This will be used in
 * vmscan.c for determining whether we have to reclaim mapped pages.
 */
int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
{
	long total, rss;

	/*
	 * usage is recorded in bytes. But, here, we assume the number of
	 * physical pages can be represented by "long" on any arch.
	 */
	total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
	rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
	return (int)((rss * 100L) / total);
}
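/*
 * Worked example for the ratio above (numbers are illustrative only):
 * res.usage is in bytes, so shifting right by PAGE_SHIFT converts it to
 * pages, and the "+ 1" avoids a division by zero for an empty cgroup.
 * With 4KB pages, 800KB of charged usage (200 pages) and 50 pages of RSS,
 * the function returns (50 * 100) / 201 = 24, i.e. roughly a quarter of
 * the charged memory is mapped (rss) memory.
 */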
/*
 * prev_priority control...this will be used in memory reclaim path.
 */
int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
{
	int prev_priority;

	spin_lock(&mem->reclaim_param_lock);
	prev_priority = mem->prev_priority;
	spin_unlock(&mem->reclaim_param_lock);

	return prev_priority;
}

void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	if (priority < mem->prev_priority)
		mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
{
	spin_lock(&mem->reclaim_param_lock);
	mem->prev_priority = priority;
	spin_unlock(&mem->reclaim_param_lock);
}

static int calc_inactive_ratio(struct mem_cgroup *memcg,
			       unsigned long *present_pages)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long gb;
	unsigned long inactive_ratio;

	inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
	active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);

	gb = (inactive + active) >> (30 - PAGE_SHIFT);
	if (gb)
		inactive_ratio = int_sqrt(10 * gb);
	else
		inactive_ratio = 1;

	if (present_pages) {
		present_pages[0] = inactive;
		present_pages[1] = active;
	}

	return inactive_ratio;
}

int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
{
	unsigned long active;
	unsigned long inactive;
	unsigned long present_pages[2];
	unsigned long inactive_ratio;

	inactive_ratio = calc_inactive_ratio(memcg, present_pages);

	inactive = present_pages[0];
	active = present_pages[1];

	if (inactive * inactive_ratio < active)
		return 1;

	return 0;
}

unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
				       struct zone *zone,
				       enum lru_list lru)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return MEM_CGROUP_ZSTAT(mz, lru);
}

struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
						      struct zone *zone)
{
	int nid = zone->zone_pgdat->node_id;
	int zid = zone_idx(zone);
	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	return &mz->reclaim_stat;
}

struct zone_reclaim_stat *
mem_cgroup_get_reclaim_stat_from_page(struct page *page)
{
	struct page_cgroup *pc;
	struct mem_cgroup_per_zone *mz;

	if (mem_cgroup_disabled())
		return NULL;

	pc = lookup_page_cgroup(page);
	/*
	 * Used bit is set without atomic ops but after smp_wmb().
	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
	 */
	smp_rmb();
	if (!PageCgroupUsed(pc))
		return NULL;

	mz = page_cgroup_zoneinfo(pc);
	if (!mz)
		return NULL;

	return &mz->reclaim_stat;
}

unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
					struct list_head *dst,