/* 📄 swapfile.c — page header left over from a web code viewer ("字号" = "font size") */
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shm.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/page_cgroup.h>

/* Protects swap_info[], swap_list and the per-device allocation state. */
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
long nr_swap_pages;		/* free swap pages across all devices */
long total_swap_pages;
static int swap_overflow;
static int least_priority;

/* Messages for the bad-entry diagnostics printed by swap_info_get(). */
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

static struct swap_list_t swap_list = {-1, -1};

static struct swap_info_struct swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

/*
 * We need this because the bdev->unplug_fn can sleep and we cannot
 * hold swap_lock while calling the unplug_fn. And swap_lock
 * cannot be turned into a mutex.
 */
static DECLARE_RWSEM(swap_unplug_sem);

/*
 * Unplug the block queue backing @page's swap device, so queued swap I/O
 * for it gets issued.  Taken under swap_unplug_sem (read side) so swapoff
 * can exclude us while it tears the device down.
 */
void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
{
	swp_entry_t entry;

	down_read(&swap_unplug_sem);
	entry.val = page_private(page);
	if (PageSwapCache(page)) {
		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
		struct backing_dev_info *bdi;

		/*
		 * If the page is removed from swapcache from under us (with a
		 * racy try_to_unuse/swapoff) we need an additional reference
		 * count to avoid reading garbage from page_private(page) above.
		 * If the WARN_ON triggers during a swapoff it maybe the race
		 * condition and it's harmless. However if it triggers without
		 * swapoff it signals a problem.
		 */
		WARN_ON(page_count(page) <= 1);

		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
		blk_run_backing_dev(bdi, page);
	}
	up_read(&swap_unplug_sem);
}

/*
 * swapon tell device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 *
 * Walks every extent of @si and issues a block discard for it, skipping
 * the first page of the first extent (the swap header must survive).
 * Returns 0 or the first error from blkdev_issue_discard() — often
 * -EOPNOTSUPP when the device has no discard support.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	int err = 0;

	list_for_each_entry(se, &si->extent_list, list) {
		/* PAGE_SHIFT - 9: convert page counts to 512-byte sectors */
		sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
		sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		if (se->start_page == 0) {
			/* Do not discard the swap header page! */
			start_block += 1 << (PAGE_SHIFT - 9);
			nr_blocks -= 1 << (PAGE_SHIFT - 9);
			if (!nr_blocks)
				continue;
		}

		err = blkdev_issue_discard(si->bdev, start_block,
						nr_blocks, GFP_KERNEL);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

/*
 * swap allocation tell device that a cluster of swap can now be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = si->curr_swap_extent;
	int found_extent = 0;

	while (nr_pages) {
		struct list_head *lh;

		if (se->start_page <= start_page &&
		    start_page < se->start_page + se->nr_pages) {
			pgoff_t offset = start_page - se->start_page;
			sector_t start_block = se->start_block + offset;
			sector_t nr_blocks = se->nr_pages - offset;

			/* The range may span several extents: clamp to this one */
			if (nr_blocks > nr_pages)
				nr_blocks = nr_pages;
			start_page += nr_blocks;
			nr_pages -= nr_blocks;

			/* Cache the first matching extent for the next lookup */
			if (!found_extent++)
				si->curr_swap_extent = se;

			/* Pages -> 512-byte sectors */
			start_block <<= PAGE_SHIFT - 9;
			nr_blocks <<= PAGE_SHIFT - 9;
			if (blkdev_issue_discard(si->bdev, start_block,
							nr_blocks, GFP_NOIO))
				break;
		}

		/* Advance circularly, skipping the list head */
		lh = se->list.next;
		if (lh == &si->extent_list)
			lh = lh->next;
		se = list_entry(lh, struct swap_extent, list);
	}
}

/* wait_on_bit() action: just reschedule until SWP_DISCARDING clears. */
static int wait_for_discard(void *word)
{
	schedule();
	return 0;
}

#define SWAPFILE_CLUSTER	256
#define LATENCY_LIMIT		256

/*
 * Find and claim one free swap slot on @si, returning its page offset,
 * or 0 on failure.  Called with swap_lock held; may drop and retake it
 * while scanning for a free cluster or issuing discards (SWP_SCANNING
 * is set in si->flags for the duration so swapoff can tell we're here).
 */
static inline unsigned long scan_swap_map(struct swap_info_struct *si)
{
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int found_free_cluster = 0;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster.  This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages.  -- sct
	 * But we do now try to find an empty cluster.  -Andrea
	 * And we let swap pages go all over an SSD partition.  Hugh
	 */

	si->flags += SWP_SCANNING;
	scan_base = offset = si->cluster_next;

	/* Cluster exhausted: look for a fresh empty cluster */
	if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}
		if (si->flags & SWP_DISCARDABLE) {
			/*
			 * Start range check on racing allocations, in case
			 * they overlap the cluster we eventually decide on
			 * (we scan without swap_lock to allow preemption).
			 * It's hardly conceivable that cluster_nr could be
			 * wrapped during our scan, but don't depend on it.
			 */
			if (si->lowest_alloc)
				goto checks;
			si->lowest_alloc = si->max;
			si->highest_alloc = 0;
		}
		spin_unlock(&swap_lock);

		/*
		 * If seek is expensive, start searching for new cluster from
		 * start of partition, to minimize the span of allocated swap.
		 * But if seek is cheap, search from our current position, so
		 * that swap is allocated from all over the partition: if the
		 * Flash Translation Layer only remaps within limited zones,
		 * we don't want to wear out the first zone too quickly.
		 */
		if (!(si->flags & SWP_SOLIDSTATE))
			scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		/* Wrap around: retry from lowest_bit up to where we started */
		offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster < scan_base; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&swap_lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				found_free_cluster = 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		/* No empty cluster found: fall back to single-slot scan */
		offset = scan_base;
		spin_lock(&swap_lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
		si->lowest_alloc = 0;
	}

checks:
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;
	if (si->swap_map[offset])
		goto scan;

	/* Claim the slot and maintain the free-range bookkeeping */
	if (offset == si->lowest_bit)
		si->lowest_bit++;
	if (offset == si->highest_bit)
		si->highest_bit--;
	si->inuse_pages++;
	if (si->inuse_pages == si->pages) {
		si->lowest_bit = si->max;
		si->highest_bit = 0;
	}
	si->swap_map[offset] = 1;
	si->cluster_next = offset + 1;
	si->flags -= SWP_SCANNING;

	if (si->lowest_alloc) {
		/*
		 * Only set when SWP_DISCARDABLE, and there's a scan
		 * for a free cluster in progress or just completed.
		 */
		if (found_free_cluster) {
			/*
			 * To optimize wear-levelling, discard the
			 * old data of the cluster, taking care not to
			 * discard any of its pages that have already
			 * been allocated by racing tasks (offset has
			 * already stepped over any at the beginning).
			 */
			if (offset < si->highest_alloc &&
			    si->lowest_alloc <= last_in_cluster)
				last_in_cluster = si->lowest_alloc - 1;
			si->flags |= SWP_DISCARDING;
			spin_unlock(&swap_lock);

			if (offset < last_in_cluster)
				discard_swap_cluster(si, offset,
					last_in_cluster - offset + 1);

			spin_lock(&swap_lock);
			si->lowest_alloc = 0;
			si->flags &= ~SWP_DISCARDING;

			smp_mb();	/* wake_up_bit advises this */
			wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));

		} else if (si->flags & SWP_DISCARDING) {
			/*
			 * Delay using pages allocated by racing tasks
			 * until the whole discard has been issued.  We
			 * could defer that delay until swap_writepage,
			 * but it's easier to keep this self-contained.
			 */
			spin_unlock(&swap_lock);
			wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
				wait_for_discard, TASK_UNINTERRUPTIBLE);
			spin_lock(&swap_lock);
		} else {
			/*
			 * Note pages allocated by racing tasks while
			 * scan for a free cluster is in progress, so
			 * that its final discard can exclude them.
			 */
			if (offset < si->lowest_alloc)
				si->lowest_alloc = offset;
			if (offset > si->highest_alloc)
				si->highest_alloc = offset;
		}
	}
	return offset;

scan:
	/* Linear search for any free slot, lock dropped for latency */
	spin_unlock(&swap_lock);
	while (++offset <= si->highest_bit) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	offset = si->lowest_bit;
	while (++offset < scan_base) {
		if (!si->swap_map[offset]) {
			spin_lock(&swap_lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
		}
	}
	spin_lock(&swap_lock);

no_page:
	si->flags -= SWP_SCANNING;
	return 0;
}

/*
 * Allocate a swap entry from the highest-priority device with free
 * space, rotating among devices of equal priority.  Returns the entry,
 * or (swp_entry_t){0} when no swap is available.
 */
swp_entry_t get_swap_page(void)
{
	struct swap_info_struct *si;
	pgoff_t offset;
	int type, next;
	int wrapped = 0;

	spin_lock(&swap_lock);
	if (nr_swap_pages <= 0)
		goto noswap;
	nr_swap_pages--;

	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
		si = swap_info + type;
		next = si->next;
		if (next < 0 ||
		    (!wrapped && si->prio != swap_info[next].prio)) {
			next = swap_list.head;
			wrapped++;
		}

		if (!si->highest_bit)
			continue;
		if (!(si->flags & SWP_WRITEOK))
			continue;

		swap_list.next = next;
		offset = scan_swap_map(si);
		if (offset) {
			spin_unlock(&swap_lock);
			return swp_entry(type, offset);
		}
		/* scan_swap_map may have dropped the lock: re-read next */
		next = swap_list.next;
	}

	/* Nothing allocated: give back the page we reserved above */
	nr_swap_pages++;
noswap:
	spin_unlock(&swap_lock);
	return (swp_entry_t) {0};
}

/*
 * Allocate a swap entry from the specific device @type (used e.g. by
 * callers that must target one device).  Returns (swp_entry_t){0} on
 * failure.
 */
swp_entry_t get_swap_page_of_type(int type)
{
	struct swap_info_struct *si;
	pgoff_t offset;

	spin_lock(&swap_lock);
	si = swap_info + type;
	if (si->flags & SWP_WRITEOK) {
		nr_swap_pages--;
		offset = scan_swap_map(si);
		if (offset) {
			spin_unlock(&swap_lock);
			return swp_entry(type, offset);
		}
		nr_swap_pages++;
	}
	spin_unlock(&swap_lock);
	return (swp_entry_t) {0};
}

static struct
swap_info_struct * swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct * p;
	unsigned long offset, type;

	if (!entry.val)
		goto out;
	type = swp_type(entry);
	if (type >= nr_swapfiles)
		goto bad_nofile;
	p = & swap_info[type];
	if (!(p->flags & SWP_USED))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	if (!p->swap_map[offset])
		goto bad_free;
	/* NOTE: returns with swap_lock held on success; caller unlocks */
	spin_lock(&swap_lock);
	return p;

bad_free:
	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
	goto out;
bad_offset:
	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val);
	goto out;
bad_device:
	printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val);
	goto out;
bad_nofile:
	printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
out:
	return NULL;
}

/*
 * Drop one reference on the swap entry @ent of device @p, freeing the
 * slot when the count reaches zero.  Called with swap_lock held.
 * Returns the new map count.  Counts pinned at SWAP_MAP_MAX are never
 * decremented here.
 */
static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
{
	unsigned long offset = swp_offset(ent);
	int count = p->swap_map[offset];

	if (count < SWAP_MAP_MAX) {
		count--;
		p->swap_map[offset] = count;
		if (!count) {
			/* Widen the free range and make this device
			 * eligible again if its priority warrants it. */
			if (offset < p->lowest_bit)
				p->lowest_bit = offset;
			if (offset > p->highest_bit)
				p->highest_bit = offset;
			if (p->prio > swap_info[swap_list.next].prio)
				swap_list.next = p - swap_info;
			nr_swap_pages++;
			p->inuse_pages--;
			mem_cgroup_uncharge_swap(ent);
		}
	}
	return count;
}

/*
 * Caller has made sure that the swapdevice corresponding to entry
 * is still around or has not been recycled.
 */
void swap_free(swp_entry_t entry)
{
	struct swap_info_struct * p;

	p = swap_info_get(entry);
	if (p) {
		/* swap_info_get() took swap_lock; release it here */
		swap_entry_free(p, entry);
		spin_unlock(&swap_lock);
	}
}
/*
 * Trailing code-viewer UI text (translated from Chinese), not part of the
 * kernel source:
 *   Keyboard shortcuts — Copy code: Ctrl+C; Search code: Ctrl+F;
 *   Fullscreen: F11; Toggle theme: Ctrl+Shift+D; Show shortcuts: ?;
 *   Increase font size: Ctrl+=; Decrease font size: Ctrl+-.
 */