📄 memory.c

📁 最新最稳定的Linux内存管理模块源代码
💻 C
📖 第 1 页 / 共 5 页
字号:
 * unmap_vmas - unmap a range of memory covered by a list of vma's
 * @tlbp: address of the caller's struct mmu_gather
 * @vma: the starting vma
 * @start_addr: virtual address at which to start unmapping
 * @end_addr: virtual address at which to end unmapping
 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Returns the end address of the unmapping (restart addr if interrupted).
 *
 * Unmap all pages in the vma list.
 *
 * We aim to not hold locks for too long (for scheduling latency reasons).
 * So zap pages in ZAP_BLOCK_SIZE bytecounts.  This means we need to
 * return the ending mmu_gather to the caller.
 *
 * Only addresses between `start' and `end' will be unmapped.
 *
 * The VMA list must be sorted in ascending virtual address order.
 *
 * unmap_vmas() assumes that the caller will flush the whole unmapped address
 * range after unmap_vmas() returns.  So the only responsibility here is to
 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
 * drops the lock and schedules.
 */
unsigned long unmap_vmas(struct mmu_gather **tlbp,
		struct vm_area_struct *vma, unsigned long start_addr,
		unsigned long end_addr, unsigned long *nr_accounted,
		struct zap_details *details)
{
	/* Remaining work budget; refilled to ZAP_BLOCK_SIZE after each flush. */
	long zap_work = ZAP_BLOCK_SIZE;
	unsigned long tlb_start = 0;	/* For tlb_finish_mmu */
	/* Non-zero once tlb_start has been recorded for the current gather. */
	int tlb_start_valid = 0;
	unsigned long start = start_addr;
	spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
	int fullmm = (*tlbp)->fullmm;
	/*
	 * NOTE(review): vma is dereferenced here, before the loop below
	 * tests it for NULL -- presumably callers never pass a NULL vma;
	 * confirm against the call sites.
	 */
	struct mm_struct *mm = vma->vm_mm;

	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
		unsigned long end;

		/*
		 * Clamp the requested range to this vma and skip vmas that
		 * do not intersect it.
		 */
		start = max(vma->vm_start, start_addr);
		if (start >= vma->vm_end)
			continue;
		end = min(vma->vm_end, end_addr);
		if (end <= vma->vm_start)
			continue;

		if (vma->vm_flags & VM_ACCOUNT)
			*nr_accounted += (end - start) >> PAGE_SHIFT;

		if (unlikely(is_pfn_mapping(vma)))
			untrack_pfn_vma(vma, 0, 0);

		while (start != end) {
			/* Remember where the current gather began. */
			if (!tlb_start_valid) {
				tlb_start = start;
				tlb_start_valid = 1;
			}

			if (unlikely(is_vm_hugetlb_page(vma))) {
				/*
				 * It is undesirable to test vma->vm_file as it
				 * should be non-null for valid hugetlb area.
				 * However, vm_file will be NULL in the error
				 * cleanup path of do_mmap_pgoff. When
				 * hugetlbfs ->mmap method fails,
				 * do_mmap_pgoff() nullifies vma->vm_file
				 * before calling this function to clean up.
				 * Since no pte has actually been setup, it is
				 * safe to do nothing in this case.
				 */
				if (vma->vm_file) {
					unmap_hugepage_range(vma, start, end, NULL);
					zap_work -= (end - start) /
					pages_per_huge_page(hstate_vma(vma));
				}

				/* The hugetlb range is handled in one go. */
				start = end;
			} else
				start = unmap_page_range(*tlbp, vma,
						start, end, &zap_work, details);

			/*
			 * Budget left over: the BUG_ON asserts that this
			 * vma's range was fully processed, so move on to the
			 * next vma without flushing.
			 */
			if (zap_work > 0) {
				BUG_ON(start != end);
				break;
			}

			/* Budget exhausted: finish the gather for [tlb_start, start). */
			tlb_finish_mmu(*tlbp, tlb_start, start);

			if (need_resched() ||
				(i_mmap_lock && spin_needbreak(i_mmap_lock))) {
				/*
				 * With an i_mmap_lock supplied, bail out:
				 * *tlbp is cleared so the caller can see there
				 * is no gather left to finish, and `start' is
				 * returned as the restart address.
				 */
				if (i_mmap_lock) {
					*tlbp = NULL;
					goto out;
				}
				cond_resched();
			}

			/* Start a fresh gather and refill the work budget. */
			*tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
			tlb_start_valid = 0;
			zap_work = ZAP_BLOCK_SIZE;
		}
	}
out:
	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
	return start;	/* which is now the end (or restart) address */
}

/**
 * zap_page_range - remove user pages in a given range
 * @vma: vm_area_struct holding the applicable pages
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 * @details: details of nonlinear truncation or shared cache invalidation
 *
 * Returns the address returned by unmap_vmas() for the range
 * [@address, @address + @size).
 */
unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
		unsigned long size, struct zap_details *details)
{
	struct mm_struct *mm = vma->vm_mm;
	struct mmu_gather *tlb;
	unsigned long end = address + size;
	/* Filled in by unmap_vmas(); not otherwise used in this function. */
	unsigned long nr_accounted = 0;

	lru_add_drain();
	tlb = tlb_gather_mmu(mm, 0);
	update_hiwater_rss(mm);
	end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
	/* unmap_vmas() sets tlb to NULL when it bails out early. */
	if (tlb)
		tlb_finish_mmu(tlb, address, end);
	return end;
}

/**
 * zap_vma_ptes - remove ptes mapping the vma
 * @vma: vm_area_struct holding ptes to be zapped
 * @address: starting address of pages to zap
 * @size: number of bytes to zap
 *
 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
 *
 * The entire address range must be fully contained within the vma.
 *
 * Returns 0 if successful, or -1 if the range is not contained in @vma
 * or @vma is not a VM_PFNMAP mapping.
 */
int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
		unsigned long size)
{
	/*
	 * NOTE(review): `address + size' is not checked for wrap-around
	 * here -- confirm callers cannot pass a size that overflows.
	 */
	if (address < vma->vm_start || address + size > vma->vm_end ||
	    		!(vma->vm_flags & VM_PFNMAP))
		return -1;
	zap_page_range(vma, address, size, NULL);
	return 0;
}
EXPORT_SYMBOL_GPL(zap_vma_ptes);

/*
 * Do a quick page-table lookup for a single page.
 *
 * Walks pgd -> pud -> pmd -> pte for @address in @vma's mm and returns
 * the page found there.  @flags is a mask of FOLL_* bits:
 *   FOLL_WRITE - return NULL unless the pte is writable
 *   FOLL_GET   - take a reference on the returned page via get_page()
 *   FOLL_TOUCH - mark the page accessed (and dirty it for FOLL_WRITE if
 *                neither the pte nor the page is already dirty)
 *   FOLL_ANON  - hand back ZERO_PAGE(0) when nothing is mapped
 *
 * Returns the page, NULL when no suitable page is mapped, or
 * ERR_PTR(-EFAULT) when the pte is present but vm_normal_page() yields
 * no page for it.
 */
struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			unsigned int flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	/* Any non-error result from follow_huge_addr() is returned as is. */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		BUG_ON(flags & FOLL_GET);
		goto out;
	}

	/* From here on, NULL means "nothing found yet". */
	page = NULL;
	pgd = pgd_offset(mm, address);
	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		goto no_page_table;

	pud = pud_offset(pgd, address);
	if (pud_none(*pud))
		goto no_page_table;
	if (pud_huge(*pud)) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
		goto out;
	}
	if (unlikely(pud_bad(*pud)))
		goto no_page_table;

	pmd = pmd_offset(pud, address);
	if (pmd_none(*pmd))
		goto no_page_table;
	if (pmd_huge(*pmd)) {
		BUG_ON(flags & FOLL_GET);
		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
		goto out;
	}
	if (unlikely(pmd_bad(*pmd)))
		goto no_page_table;

	/* Every exit below this point must go through pte_unmap_unlock(). */
	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);

	pte = *ptep;
	if (!pte_present(pte))
		goto no_page;
	/* Write requested but pte not writable: return NULL. */
	if ((flags & FOLL_WRITE) && !pte_write(pte))
		goto unlock;
	page = vm_normal_page(vma, address, pte);
	if (unlikely(!page))
		goto bad_page;

	if (flags & FOLL_GET)
		get_page(page);
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		mark_page_accessed(page);
	}
unlock:
	pte_unmap_unlock(ptep, ptl);
out:
	return page;

bad_page:
	pte_unmap_unlock(ptep, ptl);
	return ERR_PTR(-EFAULT);

no_page:
	pte_unmap_unlock(ptep, ptl);
	/* A non-present but non-empty pte: return NULL (page is still NULL). */
	if (!pte_none(pte))
		return page;
	/* Fall through to ZERO_PAGE handling */
no_page_table:
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate page tables.
	 */
	if (flags & FOLL_ANON) {
		page = ZERO_PAGE(0);
		if (flags & FOLL_GET)
			get_page(page);
		BUG_ON(flags & FOLL_WRITE);
	}
	return page;
}

/*
 * Can we do the FOLL_ANON optimization?
 *
 * Returns non-zero when @vma is neither VM_LOCKED nor VM_SHARED and has
 * no ->fault handler; zero otherwise.
 */
static inline int use_zero_page(struct vm_area_struct *vma)
{
	/*
	 * We don't want to optimize FOLL_ANON for make_pages_present()
	 * when it tries to page in a VM_LOCKED region. As to VM_SHARED,
	 * we want to get the page from the page tables to make sure
	 * that we serialize and update with any other user of that
	 * mapping.
	 */
	if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
		return 0;
	/*
	 * And if we have a fault routine, it's not an anonymous region.
	 */
	return !vma->vm_ops || !vma->vm_ops->fault;
}

/*
 * __get_user_pages - look up (faulting in as needed) a run of user pages
 * @tsk: task whose maj_flt/min_flt counters are bumped and whose gate
 *       area is consulted
 * @mm: address space to look the pages up in
 * @start: first user virtual address
 * @len: number of pages to process
 * @flags: mask of GUP_FLAGS_* bits (WRITE, FORCE, IGNORE_VMA_PERMISSIONS,
 *         IGNORE_SIGKILL)
 * @pages: if non-NULL, filled with one page pointer per processed page;
 *         a reference is taken on pages stored here
 * @vmas: if non-NULL, filled with the vma covering each processed page
 *
 * Returns the number of pages processed.  If an error occurs after at
 * least one page was processed, that count is returned instead of the
 * error; otherwise a negative errno (-EFAULT, -ENOMEM, -ERESTARTSYS, or
 * the error carried by follow_page()) is returned.  Returns 0 when
 * @len <= 0.
 */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		     unsigned long start, int len, int flags,
		struct page **pages, struct vm_area_struct **vmas)
{
	/* Index of the next slot in pages[]/vmas[]; also the success count. */
	int i;
	unsigned int vm_flags = 0;
	int write = !!(flags & GUP_FLAGS_WRITE);
	int force = !!(flags & GUP_FLAGS_FORCE);
	int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
	int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);

	if (len <= 0)
		return 0;
	/*
	 * Require read or write permissions.
	 * If 'force' is set, we only require the "MAY" flags.
	 */
	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
	i = 0;

	/* Outer loop: one iteration per vma (or per gate-area page). */
	do {
		struct vm_area_struct *vma;
		unsigned int foll_flags;

		vma = find_extend_vma(mm, start);
		/* No vma, but the address lies in the gate area: walk it by hand. */
		if (!vma && in_gate_area(tsk, start)) {
			unsigned long pg = start & PAGE_MASK;
			struct vm_area_struct *gate_vma = get_gate_vma(tsk);
			pgd_t *pgd;
			pud_t *pud;
			pmd_t *pmd;
			pte_t *pte;

			/* user gate pages are read-only */
			if (!ignore && write)
				return i ? : -EFAULT;
			if (pg > TASK_SIZE)
				pgd = pgd_offset_k(pg);
			else
				pgd = pgd_offset_gate(mm, pg);
			BUG_ON(pgd_none(*pgd));
			pud = pud_offset(pgd, pg);
			BUG_ON(pud_none(*pud));
			pmd = pmd_offset(pud, pg);
			if (pmd_none(*pmd))
				return i ? : -EFAULT;
			pte = pte_offset_map(pmd, pg);
			if (pte_none(*pte)) {
				pte_unmap(pte);
				return i ? : -EFAULT;
			}
			if (pages) {
				struct page *page = vm_normal_page(gate_vma, start, *pte);
				/*
				 * NOTE(review): a NULL result is stored in
				 * pages[i] and still counted as processed --
				 * confirm callers tolerate NULL entries.
				 */
				pages[i] = page;
				if (page)
					get_page(page);
			}
			pte_unmap(pte);
			if (vmas)
				vmas[i] = gate_vma;
			i++;
			start += PAGE_SIZE;
			len--;
			continue;
		}

		/*
		 * Reject addresses with no vma, VM_IO/VM_PFNMAP mappings, and
		 * (unless permissions are being ignored) vmas lacking the
		 * required access bits computed above.
		 */
		if (!vma ||
		    (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
		    (!ignore && !(vm_flags & vma->vm_flags)))
			return i ? : -EFAULT;

		/*
		 * Hugetlb vmas are delegated; start and len are passed by
		 * address, so follow_hugetlb_page() presumably advances them
		 * itself -- its result becomes the new count.
		 */
		if (is_vm_hugetlb_page(vma)) {
			i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &len, i, write);
			continue;
		}

		foll_flags = FOLL_TOUCH;
		if (pages)
			foll_flags |= FOLL_GET;
		if (!write && use_zero_page(vma))
			foll_flags |= FOLL_ANON;

		/* Inner loop: one iteration per page within this vma. */
		do {
			struct page *page;

			/*
			 * If we have a pending SIGKILL, don't keep faulting
			 * pages and potentially allocating memory, unless
			 * current is handling munlock--e.g., on exit. In
			 * that case, we are not allocating memory.  Rather,
			 * we're only unlocking already resident/mapped pages.
			 */
			if (unlikely(!ignore_sigkill &&
					fatal_signal_pending(current)))
				return i ? i : -ERESTARTSYS;

			/*
			 * Re-armed for every page: the fault loop below may
			 * have cleared FOLL_WRITE for the previous page.
			 */
			if (write)
				foll_flags |= FOLL_WRITE;

			cond_resched();
			/* Fault the page in until follow_page() finds it or errors. */
			while (!(page = follow_page(vma, start, foll_flags))) {
				int ret;
				ret = handle_mm_fault(mm, vma, start,
						foll_flags & FOLL_WRITE);
				if (ret & VM_FAULT_ERROR) {
					if (ret & VM_FAULT_OOM)
						return i ? i : -ENOMEM;
					else if (ret & VM_FAULT_SIGBUS)
						return i ? i : -EFAULT;
					/* Any other error bit is unexpected here. */
					BUG();
				}
				if (ret & VM_FAULT_MAJOR)
					tsk->maj_flt++;
				else
					tsk->min_flt++;

				/*
				 * The VM_FAULT_WRITE bit tells us that
				 * do_wp_page has broken COW when necessary,
				 * even if maybe_mkwrite decided not to set
				 * pte_write. We can thus safely do subsequent
				 * page lookups as if they were reads. But only
				 * do so when looping for pte_write is futile:
				 * in some cases userspace may also be wanting
				 * to write to the gotten user page, which a
				 * read fault here might prevent (a readonly
				 * page might get reCOWed by userspace write).
				 */
				if ((ret & VM_FAULT_WRITE) &&
				    !(vma->vm_flags & VM_WRITE))
					foll_flags &= ~FOLL_WRITE;

				cond_resched();
			}
			if (IS_ERR(page))
				return i ? i : PTR_ERR(page);
			if (pages) {
				pages[i] = page;

				flush_anon_page(vma, page, start);
				flush_dcache_page(page);
			}
			if (vmas)
				vmas[i] = vma;
			i++;
			start += PAGE_SIZE;
			len--;
		} while (len && start < vma->vm_end);
	} while (len);
	return i;
}

/*
 * get_user_pages - wrapper around __get_user_pages()
 *
 * Translates the boolean @write and @force arguments into
 * GUP_FLAGS_WRITE and GUP_FLAGS_FORCE and passes every other argument
 * through unchanged.  Returns whatever __get_user_pages() returns.
 */
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
		unsigned long start, int len, int write, int force,
		struct page **pages, struct vm_area_struct **vmas)
{
	int flags = 0;

	if (write)
		flags |= GUP_FLAGS_WRITE;
	if (force)
		flags |= GUP_FLAGS_FORCE;

	return __get_user_pages(tsk, mm,
				start, len, flags,
				pages, vmas);
}

EXPORT_SYMBOL(get_user_pages);

/*
 * get_locked_pte - allocate the page-table levels for @addr as needed
 *
 * Calls pud_alloc() and pmd_alloc() for @addr in @mm and, if both
 * succeed, returns the result of pte_alloc_map_lock(), which is also
 * handed @ptl.  Returns NULL if pud_alloc() or pmd_alloc() returns NULL.
 */
pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
			spinlock_t **ptl)
{
	pgd_t * pgd = pgd_offset(mm, addr);
	pud_t * pud = pud_alloc(mm, pgd, addr);
	if (pud) {
		pmd_t * pmd = pmd_alloc(mm, pud, addr);
		if (pmd)
			return pte_alloc_map_lock(mm, pmd, addr, ptl);
	}
	return NULL;
}

/*
 * This is the old fallback for page remapping.
 *
 * For historical reasons, it only allows reserved pages. Only
 * old drivers should use this, and they needed to mark their
 * pages reserved for the old functions anyway.
 */
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
			struct page *page, pgprot_t prot)
{
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -