📄 memory.c
字号:
flush_dcache_page(dst); } else copy_user_highpage(dst, src, va, vma);}/* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). * Thus we can safely just mark it writable once we've done any necessary * COW. * * We also mark the page dirty at this point even though the page will * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte){ struct page *old_page, *new_page; pte_t entry; int reuse = 0, ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { /* * VM_MIXEDMAP !pfn_valid() case * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable as we can't do any dirty * accounting on raw pfn maps. */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) goto reuse; goto gotten; } /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ if (PageAnon(old_page)) { if (!trylock_page(old_page)) { page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); lock_page(old_page); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); page_cache_release(old_page); goto unlock; } page_cache_release(old_page); } reuse = reuse_swap_page(old_page); unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { /* * Only catch write-faults on shared writable pages, * read-only shared pages can get COWed by * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { /* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait * for the page to get into an appropriate state. * * We do this without the lock held, so that it can * sleep if it needs to. */ page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) goto unwritable_page; /* * Since we dropped the lock we need to revalidate * the PTE as someone else may have changed it. If * they did, we just return, as we can count on the * MMU to tell us if they didn't also make it writable. */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); page_cache_release(old_page); if (!pte_same(*page_table, orig_pte)) goto unlock; page_mkwrite = 1; } dirty_page = old_page; get_page(dirty_page); reuse = 1; } if (reuse) {reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, entry); ret |= VM_FAULT_WRITE; goto unlock; } /* * Ok, we need to copy. Oh, well.. */ page_cache_get(old_page);gotten: pte_unmap_unlock(page_table, ptl); if (unlikely(anon_vma_prepare(vma))) goto oom; VM_BUG_ON(old_page == ZERO_PAGE(0)); new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) goto oom; /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */ if ((vma->vm_flags & VM_LOCKED) && old_page) { lock_page(old_page); /* for LRU manipulation */ clear_page_mlock(old_page); unlock_page(old_page); } cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; /* * Re-check the pte - we dropped the lock */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); } } else inc_mm_counter(mm, anon_rss); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* * Clear the pte entry and flush it first, before updating the * pte with the new entry. This will avoid a race condition * seen in the presence of one thread doing SMC and another * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * page_remove_rmap with the ptp_clear_flush above. * Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in page_remove_rmap. * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ page_remove_rmap(old_page); } /* Free the old page.. */ new_page = old_page; ret |= VM_FAULT_WRITE; } else mem_cgroup_uncharge_page(new_page); if (new_page) page_cache_release(new_page); if (old_page) page_cache_release(old_page);unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { if (vma->vm_file) file_update_time(vma->vm_file); /* * Yes, Virginia, this is actually required to prevent a race * with clear_page_dirty_for_io() from clearing the page dirty * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * * do_no_page is protected similarly. */ wait_on_page_locked(dirty_page); set_page_dirty_balance(dirty_page, page_mkwrite); put_page(dirty_page); } return ret;oom_free_new: page_cache_release(new_page);oom: if (old_page) page_cache_release(old_page); return VM_FAULT_OOM;unwritable_page: page_cache_release(old_page); return VM_FAULT_SIGBUS;}/* * Helper functions for unmap_mapping_range(). * * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __ * * We have to restart searching the prio_tree whenever we drop the lock, * since the iterator is only valid while the lock is held, and anyway * a later vma might be split and reinserted earlier while lock dropped. * * The list of nonlinear vmas could be handled more efficiently, using * a placeholder, but handle it in the same way until a need is shown. * It is important to search the prio_tree before nonlinear list: a vma * may become nonlinear and be shifted from prio_tree to nonlinear list * while the lock is dropped; but never shifted from list to prio_tree. * * In order to make forward progress despite restarting the search, * vm_truncate_count is used to mark a vma as now dealt with, so we can * quickly skip it next time around. Since the prio_tree search only * shows us those vmas affected by unmapping the range in question, we * can't efficiently keep all vmas in step with mapping->truncate_count: * so instead reset them all whenever it wraps back to 0 (then go to 1). * mapping->truncate_count and vma->vm_truncate_count are protected by * i_mmap_lock. * * In order to make forward progress despite repeatedly restarting some * large vma, note the restart_addr from unmap_vmas when it breaks out: * and restart from that address when we reach that vma again. It might * have been split or merged, shrunk or extended, but never shifted: so * restart_addr remains valid so long as it remains in the vma's range. * unmap_mapping_range forces truncate_count to leap over page-aligned * values so we can save vma's restart_addr in its truncate_count field. */#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))static void reset_vma_truncate_counts(struct address_space *mapping){ struct vm_area_struct *vma; struct prio_tree_iter iter; vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) vma->vm_truncate_count = 0; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) vma->vm_truncate_count = 0;}static int unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details){ unsigned long restart_addr; int need_break; /* * files that support invalidating or truncating portions of the * file from under mmaped areas must have their ->fault function * return a locked page (and set VM_FAULT_LOCKED in the return). * This provides synchronisation against concurrent unmapping here. */again: restart_addr = vma->vm_truncate_count; if (is_restart_addr(restart_addr) && start_addr < restart_addr) { start_addr = restart_addr; if (start_addr >= end_addr) { /* Top of vma has been split off since last time */ vma->vm_truncate_count = details->truncate_count; return 0; } } restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr, details); need_break = need_resched() || spin_needbreak(details->i_mmap_lock); if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ vma->vm_truncate_count = details->truncate_count; if (!need_break) return 0; } else { /* Note restart_addr in vma's truncate_count field */ vma->vm_truncate_count = restart_addr; if (!need_break) goto again; } spin_unlock(details->i_mmap_lock); cond_resched(); spin_lock(details->i_mmap_lock); return -EINTR;}static inline void unmap_mapping_range_tree(struct prio_tree_root *root, struct zap_details *details){ struct vm_area_struct *vma; struct prio_tree_iter iter; pgoff_t vba, vea, zba, zea;restart: vma_prio_tree_foreach(vma, &iter, root, details->first_index, details->last_index) { /* Skip quickly over those we have already dealt with */ if (vma->vm_truncate_count == details->truncate_count) continue; vba = vma->vm_pgoff; vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ zba = details->first_index; if (zba < vba) zba = vba; zea = details->last_index; if (zea > vea) zea = vea; if (unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, details) < 0) goto restart; }}static inline void unmap_mapping_range_list(struct list_head *head, struct zap_details *details){ struct vm_area_struct *vma; /* * In nonlinear VMAs there is no correspondence between virtual address * offset and file offset. So we must perform an exhaustive search * across *all* the pages in each nonlinear VMA, not just the pages * whose virtual address lies outside the file truncation point. */restart: list_for_each_entry(vma, head, shared.vm_set.list) { /* Skip quickly over those we have already dealt with */ if (vma->vm_truncate_count == details->truncate_count) continue; details->nonlinear_vma = vma; if (unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details) < 0) goto restart; }}/** * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file. * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE * boundary. Note that this is different from vmtruncate(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded * up to a PAGE_SIZE boundary. A holelen of zero truncates to the * end of the file. * @even_cows: 1 when truncating a file, unmap even private COWed pages; * but 0 when invalidating pagecache, don't throw away private data. */void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows){ struct zap_details details; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { long long holeend = (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; } details.check_mapping = even_cows? NULL: mapping; details.nonlinear_vma = NULL; details.first_index = hba; details.last_index = hba + hlen - 1; if (details.last_index < details.first_index) details.last_index = ULONG_MAX; details.i_mmap_lock = &mapping->i_mmap_lock; spin_lock(&mapping->i_mmap_lock); /* Protect against endless unmapping loops */ mapping->truncate_count++; if (unlikely(is_restart_addr(mapping->truncate_count))) { if (mapping->truncate_count == 0) reset_vma_truncate_counts(mapping); mapping->truncate_count++; } details.truncate_count = mapping->truncate_count; if (unlikely(!prio_tree_empty(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); spin_unlock(&mapping->i_mmap_lock);}EXPORT_SYMBOL(unmap_mapping_range);/** * vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used * @offset: file offset to start truncating * * NOTE! We have to be ready to update the memory sharing * between the file and the memory map for a potential last * incomplete page. Ugly, but necessary. */int vmtruncate(struct inode * inode, loff_t offset){ if (inode->i_size < offset) { unsigned long limit; limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) goto out_big; i_size_write(in
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -