📄 memory.c
字号:
* unmap_vmas - unmap a range of memory covered by a list of vma's * @tlbp: address of the caller's struct mmu_gather * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here * @details: details of nonlinear truncation or shared cache invalidation * * Returns the end address of the unmapping (restart addr if interrupted). * * Unmap all pages in the vma list. * * We aim to not hold locks for too long (for scheduling latency reasons). * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to * return the ending mmu_gather to the caller. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. * * unmap_vmas() assumes that the caller will flush the whole unmapped address * range after unmap_vmas() returns. So the only responsibility here is to * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */unsigned long unmap_vmas(struct mmu_gather **tlbp, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details){ long zap_work = ZAP_BLOCK_SIZE; unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; int fullmm = (*tlbp)->fullmm; struct mm_struct *mm = vma->vm_mm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; start = max(vma->vm_start, start_addr); if (start >= vma->vm_end) continue; end = min(vma->vm_end, end_addr); if (end <= vma->vm_start) continue; if (vma->vm_flags & VM_ACCOUNT) *nr_accounted += (end - start) >> PAGE_SHIFT; if (unlikely(is_pfn_mapping(vma))) untrack_pfn_vma(vma, 0, 0); while (start != end) { if (!tlb_start_valid) { tlb_start = start; tlb_start_valid = 1; } if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error * cleanup path of do_mmap_pgoff. When * hugetlbfs ->mmap method fails, * do_mmap_pgoff() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. */ if (vma->vm_file) { unmap_hugepage_range(vma, start, end, NULL); zap_work -= (end - start) / pages_per_huge_page(hstate_vma(vma)); } start = end; } else start = unmap_page_range(*tlbp, vma, start, end, &zap_work, details); if (zap_work > 0) { BUG_ON(start != end); break; } tlb_finish_mmu(*tlbp, tlb_start, start); if (need_resched() || (i_mmap_lock && spin_needbreak(i_mmap_lock))) { if (i_mmap_lock) { *tlbp = NULL; goto out; } cond_resched(); } *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); tlb_start_valid = 0; zap_work = ZAP_BLOCK_SIZE; } }out: mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); return start; /* which is now the end (or restart) address */}/** * zap_page_range - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap * @details: details of nonlinear truncation or shared cache invalidation */unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details){ struct mm_struct *mm = vma->vm_mm; struct mmu_gather *tlb; unsigned long end = address + size; unsigned long nr_accounted = 0; lru_add_drain(); tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); if (tlb) tlb_finish_mmu(tlb, address, end); return end;}/** * zap_vma_ptes - remove ptes mapping the vma * @vma: vm_area_struct holding ptes to be zapped * @address: starting address of pages to zap * @size: number of bytes to zap * * This function only unmaps ptes assigned to VM_PFNMAP vmas. * * The entire address range must be fully contained within the vma. * * Returns 0 if successful. */int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size){ if (address < vma->vm_start || address + size > vma->vm_end || !(vma->vm_flags & VM_PFNMAP)) return -1; zap_page_range(vma, address, size, NULL); return 0;}EXPORT_SYMBOL_GPL(zap_vma_ptes);/* * Do a quick page-table lookup for a single page. */struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags){ pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { BUG_ON(flags & FOLL_GET); goto out; } page = NULL; pgd = pgd_offset(mm, address); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) goto no_page_table; pud = pud_offset(pgd, address); if (pud_none(*pud)) goto no_page_table; if (pud_huge(*pud)) { BUG_ON(flags & FOLL_GET); page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; } if (unlikely(pud_bad(*pud))) goto no_page_table; pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; if (pmd_huge(*pmd)) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } if (unlikely(pmd_bad(*pmd))) goto no_page_table; ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; if (!pte_present(pte)) goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; page = vm_normal_page(vma, address, pte); if (unlikely(!page)) goto bad_page; if (flags & FOLL_GET) get_page(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) set_page_dirty(page); mark_page_accessed(page); }unlock: pte_unmap_unlock(ptep, ptl);out: return page;bad_page: pte_unmap_unlock(ptep, ptl); return ERR_PTR(-EFAULT);no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return page; /* Fall through to ZERO_PAGE handling */no_page_table: /* * When core dumping an enormous anonymous area that nobody * has touched so far, we don't want to allocate page tables. */ if (flags & FOLL_ANON) { page = ZERO_PAGE(0); if (flags & FOLL_GET) get_page(page); BUG_ON(flags & FOLL_WRITE); } return page;}/* Can we do the FOLL_ANON optimization? */static inline int use_zero_page(struct vm_area_struct *vma){ /* * We don't want to optimize FOLL_ANON for make_pages_present() * when it tries to page in a VM_LOCKED region. As to VM_SHARED, * we want to get the page from the page tables to make sure * that we serialize and update with any other user of that * mapping. */ if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) return 0; /* * And if we have a fault routine, it's not an anonymous region. */ return !vma->vm_ops || !vma->vm_ops->fault;}int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas){ int i; unsigned int vm_flags = 0; int write = !!(flags & GUP_FLAGS_WRITE); int force = !!(flags & GUP_FLAGS_FORCE); int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); if (len <= 0) return 0; /* * Require read or write permissions. * If 'force' is set, we only require the "MAY" flags. */ vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { struct vm_area_struct *vma; unsigned int foll_flags; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { unsigned long pg = start & PAGE_MASK; struct vm_area_struct *gate_vma = get_gate_vma(tsk); pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; /* user gate pages are read-only */ if (!ignore && write) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); else pgd = pgd_offset_gate(mm, pg); BUG_ON(pgd_none(*pgd)); pud = pud_offset(pgd, pg); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, pg); if (pmd_none(*pmd)) return i ? : -EFAULT; pte = pte_offset_map(pmd, pg); if (pte_none(*pte)) { pte_unmap(pte); return i ? : -EFAULT; } if (pages) { struct page *page = vm_normal_page(gate_vma, start, *pte); pages[i] = page; if (page) get_page(page); } pte_unmap(pte); if (vmas) vmas[i] = gate_vma; i++; start += PAGE_SIZE; len--; continue; } if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || (!ignore && !(vm_flags & vma->vm_flags))) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &len, i, write); continue; } foll_flags = FOLL_TOUCH; if (pages) foll_flags |= FOLL_GET; if (!write && use_zero_page(vma)) foll_flags |= FOLL_ANON; do { struct page *page; /* * If we have a pending SIGKILL, don't keep faulting * pages and potentially allocating memory, unless * current is handling munlock--e.g., on exit. In * that case, we are not allocating memory. Rather, * we're only unlocking already resident/mapped pages. */ if (unlikely(!ignore_sigkill && fatal_signal_pending(current))) return i ? i : -ERESTARTSYS; if (write) foll_flags |= FOLL_WRITE; cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; else if (ret & VM_FAULT_SIGBUS) return i ? i : -EFAULT; BUG(); } if (ret & VM_FAULT_MAJOR) tsk->maj_flt++; else tsk->min_flt++; /* * The VM_FAULT_WRITE bit tells us that * do_wp_page has broken COW when necessary, * even if maybe_mkwrite decided not to set * pte_write. We can thus safely do subsequent * page lookups as if they were reads. But only * do so when looping for pte_write is futile: * in some cases userspace may also be wanting * to write to the gotten user page, which a * read fault here might prevent (a readonly * page might get reCOWed by userspace write). */ if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) foll_flags &= ~FOLL_WRITE; cond_resched(); } if (IS_ERR(page)) return i ? i : PTR_ERR(page); if (pages) { pages[i] = page; flush_anon_page(vma, page, start); flush_dcache_page(page); } if (vmas) vmas[i] = vma; i++; start += PAGE_SIZE; len--; } while (len && start < vma->vm_end); } while (len); return i;}int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas){ int flags = 0; if (write) flags |= GUP_FLAGS_WRITE; if (force) flags |= GUP_FLAGS_FORCE; return __get_user_pages(tsk, mm, start, len, flags, pages, vmas);}EXPORT_SYMBOL(get_user_pages);pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl){ pgd_t * pgd = pgd_offset(mm, addr); pud_t * pud = pud_alloc(mm, pgd, addr); if (pud) { pmd_t * pmd = pmd_alloc(mm, pud, addr); if (pmd) return pte_alloc_map_lock(mm, pmd, addr, ptl); } return NULL;}/* * This is the old fallback for page remapping. * * For historical reasons, it only allows reserved pages. Only * old drivers should use this, and they needed to mark their * pages reserved for the old functions anyway. */static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot){
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -