linux/mm/vmscan.c
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
					buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/pagevec.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>

#include "internal.h"

struct scan_control {
	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	int may_writepage;

	/* Can pages be swapped as part of reclaim? */
	int may_swap;

	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
	 * In this context, it doesn't matter that we scan the
	 * whole list at once. */
	int swap_cluster_max;

	int swappiness;

	int all_unreclaimable;

	int order;

	/* Which cgroup do we reclaim from */
	struct mem_cgroup *mem_cgroup;

	/* Pluggable isolate pages callback */
	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
			unsigned long *scanned, int order, int mode,
			struct zone *z, struct mem_cgroup *mem_cont,
			int active, int file);
};

#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

#ifdef ARCH_HAS_PREFETCH
#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetch(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {								\
		if ((_page)->lru.prev != _base) {			\
			struct page *prev;				\
									\
			prev = lru_to_page(&(_page->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif
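lru_to_page() picks the entry at the tail of a list; on an LRU that is the oldest page, since new pages are linked in at the head. The sketch below is an illustration and is not part of vmscan.c (example_drain_oldest_first() is a made-up name); it only shows the tail-first pattern that reclaim loops built on this macro use.

/* Illustration only: consume a private list of pages oldest-first. */
static void example_drain_oldest_first(struct list_head *page_list)
{
	while (!list_empty(page_list)) {
		struct page *page = lru_to_page(page_list);	/* tail entry */

		list_del(&page->lru);
		/* ... inspect or free 'page' here ... */
	}
}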
/*
 * From 0 .. 100.  Higher means more swappy.
 */
int vm_swappiness = 60;
long vm_total_pages;	/* The total number of pages which the VM controls */

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR
#define scanning_global_lru(sc)	(!(sc)->mem_cgroup)
#else
#define scanning_global_lru(sc)	(1)
#endif

static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
						  struct scan_control *sc)
{
	if (!scanning_global_lru(sc))
		return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);

	return &zone->reclaim_stat;
}

static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
				   enum lru_list lru)
{
	if (!scanning_global_lru(sc))
		return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);

	return zone_page_state(zone, NR_LRU_BASE + lru);
}

/*
 * Add a shrinker callback to be called from the vm
 */
void register_shrinker(struct shrinker *shrinker)
{
	shrinker->nr = 0;
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(unregister_shrinker);

#define SHRINK_BATCH 128
/*
 * Call the shrink functions to age shrinkable caches
 *
 * Here we assume it costs one seek to replace a lru page and that it also
 * takes a seek to recreate a cache object.  With this in mind we age equal
 * percentages of the lru and ageable caches.  This should balance the seeks
 * generated by these structures.
 *
 * If the vm encountered mapped pages on the LRU it increases the pressure on
 * slab to avoid swapping.
 *
 * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
 *
 * `lru_pages' represents the number of on-LRU pages in all the zones which
 * are eligible for the caller's allocation attempt.  It is used for balancing
 * slab reclaim versus page reclaim.
 *
 * Returns the number of slab objects which we shrunk.
 */
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
			unsigned long lru_pages)
{
	struct shrinker *shrinker;
	unsigned long ret = 0;

	if (scanned == 0)
		scanned = SWAP_CLUSTER_MAX;

	if (!down_read_trylock(&shrinker_rwsem))
		return 1;	/* Assume we'll be able to shrink next time */

	list_for_each_entry(shrinker, &shrinker_list, list) {
		unsigned long long delta;
		unsigned long total_scan;
		unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);

		delta = (4 * scanned) / shrinker->seeks;
		delta *= max_pass;
		do_div(delta, lru_pages + 1);
		shrinker->nr += delta;
		if (shrinker->nr < 0) {
			printk(KERN_ERR "%s: nr=%ld\n",
					__func__, shrinker->nr);
			shrinker->nr = max_pass;
		}

		/*
		 * Avoid risking looping forever due to too large nr value:
		 * never try to free more than twice the estimated number of
		 * freeable entries.
		 */
		if (shrinker->nr > max_pass * 2)
			shrinker->nr = max_pass * 2;

		total_scan = shrinker->nr;
		shrinker->nr = 0;

		while (total_scan >= SHRINK_BATCH) {
			long this_scan = SHRINK_BATCH;
			int shrink_ret;
			int nr_before;

			nr_before = (*shrinker->shrink)(0, gfp_mask);
			shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
			if (shrink_ret == -1)
				break;
			if (shrink_ret < nr_before)
				ret += nr_before - shrink_ret;
			count_vm_events(SLABS_SCANNED, this_scan);
			total_scan -= this_scan;

			cond_resched();
		}
		shrinker->nr += total_scan;
	}
	up_read(&shrinker_rwsem);
	return ret;
}
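shrink_slab() above sizes each cache's scan target as delta = (4 * scanned / seeks) * max_pass / (lru_pages + 1), so a cache is aged roughly in proportion to how hard the LRU lists were scanned. For example, with scanned = 1000 pages, seeks = 2, max_pass = 500 objects and lru_pages = 100000, delta comes to about ten objects; that is below SHRINK_BATCH (128), so nothing is scanned on this pass and the credit is carried forward in shrinker->nr. The sketch below is a minimal, hypothetical client of the register_shrinker() API using the two-argument ->shrink callback invoked above; the my_cache_* names are invented and the bare counter stands in for a real cache's locked bookkeeping. Returning -1 when __GFP_FS is missing matches how shrink_slab() reacts to -1: it stops scanning that cache for this pass.

#include <linux/mm.h>	/* struct shrinker, register_shrinker(), DEFAULT_SEEKS */

static int my_cache_objects;		/* pretend object count */

static int my_cache_shrink(int nr_to_scan, gfp_t gfp_mask)
{
	if (nr_to_scan) {
		/* Refuse work we cannot do in this allocation context. */
		if (!(gfp_mask & __GFP_FS))
			return -1;
		/* Evict up to nr_to_scan objects (illustration only). */
		my_cache_objects -= min(nr_to_scan, my_cache_objects);
	}
	/* With nr_to_scan == 0, shrink_slab() only asks for the count. */
	return my_cache_objects;
}

static struct shrinker my_cache_shrinker = {
	.shrink	= my_cache_shrink,
	.seeks	= DEFAULT_SEEKS,	/* relative cost of recreating one object */
};

/* register_shrinker(&my_cache_shrinker) at init time,
 * unregister_shrinker(&my_cache_shrinker) before the cache goes away. */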
/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
	struct address_space *mapping;

	/* Page is in somebody's page tables. */
	if (page_mapped(page))
		return 1;

	/* Be more reluctant to reclaim swapcache than pagecache */
	if (PageSwapCache(page))
		return 1;

	mapping = page_mapping(page);
	if (!mapping)
		return 0;

	/* File is mmap'd by somebody? */
	return mapping_mapped(mapping);
}

static inline int is_page_cache_freeable(struct page *page)
{
	return page_count(page) - !!PagePrivate(page) == 2;
}

static int may_write_to_queue(struct backing_dev_info *bdi)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!bdi_write_congested(bdi))
		return 1;
	if (bdi == current->backing_dev_info)
		return 1;
	return 0;
}

/*
 * We detected a synchronous write error writing a page out.  Probably
 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up.  But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller
 * has __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}

/* Request for sync pageout. */
enum pageout_io {
	PAGEOUT_IO_ASYNC,
	PAGEOUT_IO_SYNC,
};

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;
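The pageout_t values above also encode who holds the page lock afterwards. The helper below is purely illustrative (example_dispose() is not a real kernel function); it restates that contract as the dispatch a caller such as shrink_page_list(), which is outside this excerpt, has to perform.

/* Illustration only: a caller's view of the pageout_t contract above. */
static void example_dispose(struct page *page, pageout_t result)
{
	switch (result) {
	case PAGE_KEEP:
		/* write failed, page still locked: unlock and keep it */
		break;
	case PAGE_ACTIVATE:
		/* page still locked: mark it active so it goes back on
		 * the active list instead of being scanned again soon */
		SetPageActive(page);
		break;
	case PAGE_SUCCESS:
		/* I/O was submitted and the page came back unlocked */
		break;
	case PAGE_CLEAN:
		/* nothing to write: page is locked, clean, maybe freeable */
		break;
	}
}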
/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping,
						enum pageout_io sync_writeback)
{
	/*
	 * If the page is dirty, only perform writeback if that write
	 * will be non-blocking, to prevent this allocation from being
	 * stalled by pagecache activity.  But note that there may be
	 * stalls if we need to run get_block().  We could test
	 * PagePrivate for that.
	 *
	 * If this process is currently in generic_file_write() against
	 * this page's queue, we can perform writeback even if that
	 * will block.
	 *
	 * If the page is swapcache, write it back even if that would
	 * block, for some throttling.  This happens by accident, because
	 * swap_backing_dev_info is bust: it doesn't reflect the
	 * congestion state of the swapdevs.  Easy to fix, if needed.
	 * See swapfile.c:page_queue_congested().
	 */
	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;
	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can have
		 * page->mapping == NULL while being dirty with clean buffers.
		 */
		if (PagePrivate(page)) {
			if (try_to_free_buffers(page)) {
				ClearPageDirty(page);
				printk("%s: orphaned page\n", __func__);
				return PAGE_CLEAN;
			}
		}
		return PAGE_KEEP;
	}
	if (mapping->a_ops->writepage == NULL)
		return PAGE_ACTIVATE;
	if (!may_write_to_queue(mapping->backing_dev_info))
		return PAGE_KEEP;

	if (clear_page_dirty_for_io(page)) {
		int res;
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
			.range_start = 0,
			.range_end = LLONG_MAX,
			.nonblocking = 1,
			.for_reclaim = 1,
		};

		SetPageReclaim(page);
		res = mapping->a_ops->writepage(page, &wbc);
		if (res < 0)
			handle_write_error(mapping, page, res);
		if (res == AOP_WRITEPAGE_ACTIVATE) {
			ClearPageReclaim(page);
			return PAGE_ACTIVATE;
		}

		/*
		 * Wait on writeback if requested to.  This happens when
		 * direct reclaiming a large contiguous area and the
		 * first attempt to free a range of pages fails.
		 */
		if (PageWriteback(page) &&
		    sync_writeback == PAGEOUT_IO_SYNC)
			wait_on_page_writeback(page);

		if (!PageWriteback(page)) {
			/* synchronous write or broken a_ops? */
			ClearPageReclaim(page);
		}
		inc_zone_page_state(page, NR_VMSCAN_WRITE);
		return PAGE_SUCCESS;
	}

	return PAGE_CLEAN;
}

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
	BUG_ON(!PageLocked(page));
	BUG_ON(mapping != page_mapping(page));

	spin_lock_irq(&mapping->tree_lock);
	/*
	 * The non-racy check for a busy page.
	 *
	 * Must be careful with the order of the tests.  When someone has
	 * a ref to the page, it may be possible that they dirty it then
	 * drop the reference.  So if PageDirty is tested before page_count
	 * here, then the following race may occur:
	 *
	 * get_user_pages(&page);
	 * [user mapping goes away]
	 * write_to(page);
	 *				!PageDirty(page)    [good]
	 * SetPageDirty(page);
	 * put_page(page);
	 *				!page_count(page)   [good, discard it]
	 *
	 * [oops, our write_to data is lost]
	 *
	 * Reversing the order of the tests ensures such a situation cannot
	 * escape unnoticed.  The smp_rmb is needed to ensure the page->flags
	 * load is not satisfied before that of page->_count.
	 *
	 * Note that if SetPageDirty is always performed via set_page_dirty,
	 * and thus under tree_lock, then this ordering is not required.
	 */
	if (!page_freeze_refs(page, 2))
		goto cannot_free;
	/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
	if (unlikely(PageDirty(page))) {
		page_unfreeze_refs(page, 2);
		goto cannot_free;
	}

	if (PageSwapCache(page)) {
		swp_entry_t swap = { .val = page_private(page) };
		__delete_from_swap_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
		swap_free(swap);
	} else {
		__remove_from_page_cache(page);
		spin_unlock_irq(&mapping->tree_lock);
	}

	return 1;

cannot_free:
	spin_unlock_irq(&mapping->tree_lock);
	return 0;
}
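The comment on __remove_mapping() refers to remove_mapping(), which lies outside this excerpt. Below is a hedged sketch of what such a wrapper looks like, reconstructed only from the stated contract (on success the page comes back with its refcount frozen at 0, so the caller's single reference must be restored); treat it as an illustration, not the file's actual code.

/* Illustrative sketch of a remove_mapping()-style wrapper. */
static int example_remove_mapping(struct address_space *mapping,
				  struct page *page)
{
	if (__remove_mapping(mapping, page)) {
		/* The pagecache/swapcache reference is gone; unfreeze
		 * straight to 1 so only the caller's reference remains. */
		page_unfreeze_refs(page, 1);
		return 1;
	}
	return 0;
}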