📄 swapfile.c
/*
 * How many references to page are currently swapped out?
 */
static inline int page_swapcount(struct page *page)
{
	int count = 0;
	struct swap_info_struct *p;
	swp_entry_t entry;

	entry.val = page_private(page);
	p = swap_info_get(entry);
	if (p) {
		/* Subtract the 1 for the swap cache itself */
		count = p->swap_map[swp_offset(entry)] - 1;
		spin_unlock(&swap_lock);
	}
	return count;
}

/*
 * We can write to an anon page without COW if there are no other references
 * to it.  And as a side-effect, free up its swap: because the old content
 * on disk will never be read, and seeking back there to write new content
 * later would only waste time away from clustering.
 */
int reuse_swap_page(struct page *page)
{
	int count;

	VM_BUG_ON(!PageLocked(page));
	count = page_mapcount(page);
	if (count <= 1 && PageSwapCache(page)) {
		count += page_swapcount(page);
		if (count == 1 && !PageWriteback(page)) {
			delete_from_swap_cache(page);
			SetPageDirty(page);
		}
	}
	return count == 1;
}

/*
 * If swap is getting full, or if there are no more mappings of this page,
 * then try_to_free_swap is called to free its swap space.
 */
int try_to_free_swap(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));

	if (!PageSwapCache(page))
		return 0;
	if (PageWriteback(page))
		return 0;
	if (page_swapcount(page))
		return 0;

	delete_from_swap_cache(page);
	SetPageDirty(page);
	return 1;
}

/*
 * Free the swap entry like above, but also try to
 * free the page cache entry if it is the last user.
 */
int free_swap_and_cache(swp_entry_t entry)
{
	struct swap_info_struct *p;
	struct page *page = NULL;

	if (is_migration_entry(entry))
		return 1;

	p = swap_info_get(entry);
	if (p) {
		if (swap_entry_free(p, entry) == 1) {
			page = find_get_page(&swapper_space, entry.val);
			if (page && !trylock_page(page)) {
				page_cache_release(page);
				page = NULL;
			}
		}
		spin_unlock(&swap_lock);
	}
	if (page) {
		/*
		 * Not mapped elsewhere, or swap space full? Free it!
		 * Also recheck PageSwapCache now page is locked (above).
		 */
		if (PageSwapCache(page) && !PageWriteback(page) &&
				(!page_mapped(page) || vm_swap_full())) {
			delete_from_swap_cache(page);
			SetPageDirty(page);
		}
		unlock_page(page);
		page_cache_release(page);
	}
	return p != NULL;
}
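/*
 * Illustrative sketch, not part of swapfile.c: how a copy-on-write
 * fault path might consult reuse_swap_page() above to decide between
 * writing an anonymous page in place and copying it.  The function
 * name example_can_reuse_anon_page and the simplified locking are
 * assumptions for illustration only; in the kernel the real decision
 * is made in do_wp_page() in mm/memory.c.
 */
static int example_can_reuse_anon_page(struct page *page)
{
	int reuse = 0;

	if (PageAnon(page) && trylock_page(page)) {
		/*
		 * reuse_swap_page() requires the page lock.  It returns
		 * true only when this mapping holds the sole reference,
		 * and in that case it may also drop the page from the
		 * swap cache so the stale swap slot can be reused.
		 */
		reuse = reuse_swap_page(page);
		unlock_page(page);
	}
	return reuse;	/* nonzero: write in place; zero: fall back to COW */
}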
#ifdef CONFIG_HIBERNATION
/*
 * Find the swap type that corresponds to given device (if any).
 *
 * @offset - number of the PAGE_SIZE-sized block of the device, starting
 * from 0, in which the swap header is expected to be located.
 *
 * This is needed for the suspend to disk (aka swsusp).
 */
int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
{
	struct block_device *bdev = NULL;
	int i;

	if (device)
		bdev = bdget(device);

	spin_lock(&swap_lock);
	for (i = 0; i < nr_swapfiles; i++) {
		struct swap_info_struct *sis = swap_info + i;

		if (!(sis->flags & SWP_WRITEOK))
			continue;

		if (!bdev) {
			if (bdev_p)
				*bdev_p = bdget(sis->bdev->bd_dev);
			spin_unlock(&swap_lock);
			return i;
		}
		if (bdev == sis->bdev) {
			struct swap_extent *se;

			se = list_entry(sis->extent_list.next,
					struct swap_extent, list);
			if (se->start_block == offset) {
				if (bdev_p)
					*bdev_p = bdget(sis->bdev->bd_dev);
				spin_unlock(&swap_lock);
				bdput(bdev);
				return i;
			}
		}
	}
	spin_unlock(&swap_lock);
	if (bdev)
		bdput(bdev);

	return -ENODEV;
}

/*
 * Return either the total number of swap pages of given type, or the number
 * of free pages of that type (depending on @free)
 *
 * This is needed for software suspend
 */
unsigned int count_swap_pages(int type, int free)
{
	unsigned int n = 0;

	if (type < nr_swapfiles) {
		spin_lock(&swap_lock);
		if (swap_info[type].flags & SWP_WRITEOK) {
			n = swap_info[type].pages;
			if (free)
				n -= swap_info[type].inuse_pages;
		}
		spin_unlock(&swap_lock);
	}
	return n;
}
#endif
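#ifdef CONFIG_HIBERNATION
/*
 * Illustrative sketch, not part of swapfile.c: one way the hibernation
 * code might use the two helpers above.  swap_type_of() identifies the
 * swap area backing the resume device, and count_swap_pages() checks
 * that it still has enough free slots for the image.  The names
 * example_hibernation_swap_check, resume_device, resume_block and
 * nr_image_pages are assumptions for illustration; the real callers
 * live in kernel/power/swap.c.
 */
static int example_hibernation_swap_check(dev_t resume_device,
					  sector_t resume_block,
					  unsigned int nr_image_pages)
{
	struct block_device *bdev;
	int type;

	type = swap_type_of(resume_device, resume_block, &bdev);
	if (type < 0)
		return type;		/* no matching writable swap area */

	if (count_swap_pages(type, 1) < nr_image_pages) {
		bdput(bdev);		/* drop the reference taken above */
		return -ENOSPC;		/* not enough free swap for the image */
	}
	/* keep the bdev reference for writing the image later */
	return type;
}
#endif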
/*
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
 * force COW, vm_page_prot omits write permission from any private vma.
 */
static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
		unsigned long addr, swp_entry_t entry, struct page *page)
{
	struct mem_cgroup *ptr = NULL;
	spinlock_t *ptl;
	pte_t *pte;
	int ret = 1;

	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
		ret = -ENOMEM;
		goto out_nolock;
	}

	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
		if (ret > 0)
			mem_cgroup_cancel_charge_swapin(ptr);
		ret = 0;
		goto out;
	}

	inc_mm_counter(vma->vm_mm, anon_rss);
	get_page(page);
	set_pte_at(vma->vm_mm, addr, pte,
		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
	page_add_anon_rmap(page, vma, addr);
	mem_cgroup_commit_charge_swapin(page, ptr);
	swap_free(entry);
	/*
	 * Move the page to the active list so it is not
	 * immediately swapped out again after swapon.
	 */
	activate_page(page);
out:
	pte_unmap_unlock(pte, ptl);
out_nolock:
	return ret;
}

static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pte_t swp_pte = swp_entry_to_pte(entry);
	pte_t *pte;
	int ret = 0;

	/*
	 * We don't actually need pte lock while scanning for swp_pte: since
	 * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
	 * page table while we're scanning; though it could get zapped, and on
	 * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
	 * of unmatched parts which look like swp_pte, so unuse_pte must
	 * recheck under pte lock.  Scanning without pte lock lets it be
	 * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
	 */
	pte = pte_offset_map(pmd, addr);
	do {
		/*
		 * swapoff spends a _lot_ of time in this loop!
		 * Test inline before going to call unuse_pte.
		 */
		if (unlikely(pte_same(*pte, swp_pte))) {
			pte_unmap(pte);
			ret = unuse_pte(vma, pmd, addr, entry, page);
			if (ret)
				goto out;
			pte = pte_offset_map(pmd, addr);
		}
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap(pte - 1);
out:
	return ret;
}

static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pmd_t *pmd;
	unsigned long next;
	int ret;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(pmd))
			continue;
		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				swp_entry_t entry, struct page *page)
{
	pud_t *pud;
	unsigned long next;
	int ret;

	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int unuse_vma(struct vm_area_struct *vma,
				swp_entry_t entry, struct page *page)
{
	pgd_t *pgd;
	unsigned long addr, end, next;
	int ret;

	if (page->mapping) {
		addr = page_address_in_vma(page, vma);
		if (addr == -EFAULT)
			return 0;
		else
			end = addr + PAGE_SIZE;
	} else {
		addr = vma->vm_start;
		end = vma->vm_end;
	}

	pgd = pgd_offset(vma->vm_mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
		if (ret)
			return ret;
	} while (pgd++, addr = next, addr != end);
	return 0;
}

static int unuse_mm(struct mm_struct *mm,
				swp_entry_t entry, struct page *page)
{
	struct vm_area_struct *vma;
	int ret = 0;

	if (!down_read_trylock(&mm->mmap_sem)) {
		/*
		 * Activate page so shrink_inactive_list is unlikely to unmap
		 * its ptes while lock is dropped, so swapoff can make progress.
		 */
		activate_page(page);
		unlock_page(page);
		down_read(&mm->mmap_sem);
		lock_page(page);
	}
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
			break;
	}
	up_read(&mm->mmap_sem);
	return (ret < 0)? ret: 0;
}

/*
 * Scan swap_map from current position to next entry still in use.
 * Recycle to start on reaching the end, returning 0 when empty.
 */
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
					unsigned int prev)
{
	unsigned int max = si->max;
	unsigned int i = prev;
	int count;

	/*
	 * No need for swap_lock here: we're just looking
	 * for whether an entry is in use, not modifying it; false
	 * hits are okay, and sys_swapoff() has already prevented new
	 * allocations from this area (while holding swap_lock).
	 */
	for (;;) {
		if (++i >= max) {
			if (!prev) {
				i = 0;
				break;
			}
			/*
			 * No entries in use at top of swap_map,
			 * loop back to start and recheck there.
			 */
			max = prev + 1;
			prev = 0;
			i = 1;
		}
		count = si->swap_map[i];
		if (count && count != SWAP_MAP_BAD)
			break;
	}
	return i;
}
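/*
 * Illustrative sketch, not part of swapfile.c: the same pte-level scan
 * pattern that unuse_pte_range() above uses, reduced to a helper that
 * counts non-present (swapped-out or file-backed) ptes under one pmd.
 * The name example_count_nonpresent_ptes is an assumption for
 * illustration only.
 */
static int example_count_nonpresent_ptes(struct vm_area_struct *vma,
					 pmd_t *pmd, unsigned long addr,
					 unsigned long end)
{
	spinlock_t *ptl;
	pte_t *pte;
	int count = 0;

	/*
	 * Unlike unuse_pte_range(), take the pte lock up front: this
	 * sketch only reads each pte once and never sleeps in the loop.
	 */
	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
	do {
		if (!pte_none(*pte) && !pte_present(*pte))
			count++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	pte_unmap_unlock(pte - 1, ptl);

	return count;
}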
/*
 * We completely avoid races by reading each swap page in advance,
 * and then search for the process using it.  All the necessary
 * page table adjustments can then be made atomically.
 */
static int try_to_unuse(unsigned int type)
{
	struct swap_info_struct * si = &swap_info[type];
	struct mm_struct *start_mm;
	unsigned short *swap_map;
	unsigned short swcount;
	struct page *page;
	swp_entry_t entry;
	unsigned int i = 0;
	int retval = 0;
	int reset_overflow = 0;
	int shmem;

	/*
	 * When searching mms for an entry, a good strategy is to
	 * start at the first mm we freed the previous entry from
	 * (though actually we don't notice whether we or coincidence
	 * freed the entry).  Initialize this start_mm with a hold.
	 *
	 * A simpler strategy would be to start at the last mm we
	 * freed the previous entry from; but that would take less
	 * advantage of mmlist ordering, which clusters forked mms
	 * together, child after parent.  If we race with dup_mmap(), we
	 * prefer to resolve parent before child, lest we miss entries
	 * duplicated after we scanned child: using last mm would invert
	 * that.  Though it's only a serious concern when an overflowed
	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
	 */
	start_mm = &init_mm;
	atomic_inc(&init_mm.mm_users);

	/*
	 * Keep on scanning until all entries have gone.  Usually,
	 * one pass through swap_map is enough, but not necessarily:
	 * there are races when an instance of an entry might be missed.
	 */
	while ((i = find_next_to_unuse(si, i)) != 0) {
		if (signal_pending(current)) {
			retval = -EINTR;
			break;
		}

		/*
		 * Get a page for the entry, using the existing swap
		 * cache page if there is one.  Otherwise, get a clean
		 * page and read the swap into it.
		 */
		swap_map = &si->swap_map[i];
		entry = swp_entry(type, i);
		page = read_swap_cache_async(entry,
					GFP_HIGHUSER_MOVABLE, NULL, 0);
		if (!page) {
			/*
			 * Either swap_duplicate() failed because entry
			 * has been freed independently, and will not be
			 * reused since sys_swapoff() already disabled
			 * allocation from here, or alloc_page() failed.
			 */
			if (!*swap_map)
				continue;
			retval = -ENOMEM;
			break;
		}

		/*
		 * Don't hold on to start_mm if it looks like exiting.
		 */
		if (atomic_read(&start_mm->mm_users) == 1) {
			mmput(start_mm);
			start_mm = &init_mm;
			atomic_inc(&init_mm.mm_users);
		}

		/*
		 * Wait for and lock page.  When do_swap_page races with
		 * try_to_unuse, do_swap_page can handle the fault much
		 * faster than try_to_unuse can locate the entry.  This
		 * apparently redundant "wait_on_page_locked" lets try_to_unuse
		 * defer to do_swap_page in such a case - in some tests,
		 * do_swap_page and try_to_unuse repeatedly compete.
		 */
		wait_on_page_locked(page);
		wait_on_page_writeback(page);
		lock_page(page);
		wait_on_page_writeback(page);

		/*
		 * Remove all references to entry.
		 * Whenever we reach init_mm, there's no address space
		 * to search, but use it as a reminder to search shmem.
		 */
		shmem = 0;
		swcount = *swap_map;
		if (swcount > 1) {
			if (start_mm == &init_mm)
				shmem = shmem_unuse(entry, page);
			else
				retval = unuse_mm(start_mm, entry, page);
		}
		if (*swap_map > 1) {
			int set_start_mm = (*swap_map >= swcount);
			struct list_head *p = &start_mm->mmlist;
			struct mm_struct *new_start_mm = start_mm;
			struct mm_struct *prev_mm = start_mm;
			struct mm_struct *mm;

			atomic_inc(&new_start_mm->mm_users);
			atomic_inc(&prev_mm->mm_users);
			spin_lock(&mmlist_lock);
			while (*swap_map > 1 && !retval && !shmem &&
					(p = p->next) != &start_mm->mmlist) {
				mm = list_entry(p, struct mm_struct, mmlist);
				if (!atomic_inc_not_zero(&mm->mm_users))
					continue;
				spin_unlock(&mmlist_lock);
				mmput(prev_mm);
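/*
 * Illustrative sketch, not part of swapfile.c (the listing above breaks
 * off inside try_to_unuse()): the overall shape of the swapoff path
 * that drives try_to_unuse().  In the kernel this logic lives in
 * sys_swapoff(); the name example_disable_swap_area and the reduced
 * error handling are simplified assumptions for illustration.
 */
static int example_disable_swap_area(unsigned int type)
{
	struct swap_info_struct *p = &swap_info[type];
	int err;

	/* Stop new allocations from this area before draining it. */
	spin_lock(&swap_lock);
	p->flags &= ~SWP_WRITEOK;
	spin_unlock(&swap_lock);

	/* Bring every swapped-out page back in and rewrite the ptes. */
	err = try_to_unuse(type);
	if (err) {
		/* Failed (e.g. -ENOMEM or a signal): re-enable the area. */
		spin_lock(&swap_lock);
		p->flags |= SWP_WRITEOK;
		spin_unlock(&swap_lock);
	}
	return err;
}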