📄 hugetlb.c

📁 Latest stable Linux memory-management module source code
💻 C
📖 Page 1 of 5
        h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
        h->nr_huge_pages = 0;
        h->free_huge_pages = 0;
        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&h->hugepage_freelists[i]);
        h->hugetlb_next_nid = first_node(node_online_map);
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                        huge_page_size(h)/1024);
        parsed_hstate = h;
}

static int __init hugetlb_nrpages_setup(char *s)
{
        unsigned long *mhp;
        static unsigned long *last_mhp;

        /*
         * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
         * so this hugepages= parameter goes to the "default hstate".
         */
        if (!max_hstate)
                mhp = &default_hstate_max_huge_pages;
        else
                mhp = &parsed_hstate->max_huge_pages;

        if (mhp == last_mhp) {
                printk(KERN_WARNING "hugepages= specified twice without "
                        "interleaving hugepagesz=, ignoring\n");
                return 1;
        }

        if (sscanf(s, "%lu", mhp) <= 0)
                *mhp = 0;

        /*
         * Global state is always initialized later in hugetlb_init.
         * But we need to allocate >= MAX_ORDER hstates here early to still
         * use the bootmem allocator.
         */
        if (max_hstate && parsed_hstate->order >= MAX_ORDER)
                hugetlb_hstate_alloc_pages(parsed_hstate);

        last_mhp = mhp;

        return 1;
}
__setup("hugepages=", hugetlb_nrpages_setup);

static int __init hugetlb_default_setup(char *s)
{
        default_hstate_size = memparse(s, &s);
        return 1;
}
__setup("default_hugepagesz=", hugetlb_default_setup);

static unsigned int cpuset_mems_nr(unsigned int *array)
{
        int node;
        unsigned int nr = 0;

        for_each_node_mask(node, cpuset_current_mems_allowed)
                nr += array[node];

        return nr;
}

#ifdef CONFIG_SYSCTL
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
                           struct file *file, void __user *buffer,
                           size_t *length, loff_t *ppos)
{
        struct hstate *h = &default_hstate;
        unsigned long tmp;

        if (!write)
                tmp = h->max_huge_pages;

        table->data = &tmp;
        table->maxlen = sizeof(unsigned long);
        proc_doulongvec_minmax(table, write, file, buffer, length, ppos);

        if (write)
                h->max_huge_pages = set_max_huge_pages(h, tmp);

        return 0;
}

int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
                        struct file *file, void __user *buffer,
                        size_t *length, loff_t *ppos)
{
        proc_dointvec(table, write, file, buffer, length, ppos);
        if (hugepages_treat_as_movable)
                htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
        else
                htlb_alloc_mask = GFP_HIGHUSER;
        return 0;
}

int hugetlb_overcommit_handler(struct ctl_table *table, int write,
                        struct file *file, void __user *buffer,
                        size_t *length, loff_t *ppos)
{
        struct hstate *h = &default_hstate;
        unsigned long tmp;

        if (!write)
                tmp = h->nr_overcommit_huge_pages;

        table->data = &tmp;
        table->maxlen = sizeof(unsigned long);
        proc_doulongvec_minmax(table, write, file, buffer, length, ppos);

        if (write) {
                spin_lock(&hugetlb_lock);
                h->nr_overcommit_huge_pages = tmp;
                spin_unlock(&hugetlb_lock);
        }

        return 0;
}

#endif /* CONFIG_SYSCTL */

void hugetlb_report_meminfo(struct seq_file *m)
{
        struct hstate *h = &default_hstate;
        seq_printf(m,
                        "HugePages_Total:   %5lu\n"
                        "HugePages_Free:    %5lu\n"
                        "HugePages_Rsvd:    %5lu\n"
                        "HugePages_Surp:    %5lu\n"
                        "Hugepagesize:   %8lu kB\n",
                        h->nr_huge_pages,
                        h->free_huge_pages,
                        h->resv_huge_pages,
                        h->surplus_huge_pages,
                        1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
}

int hugetlb_report_node_meminfo(int nid, char *buf)
{
        struct hstate *h = &default_hstate;
        return sprintf(buf,
                "Node %d HugePages_Total: %5u\n"
                "Node %d HugePages_Free:  %5u\n"
                "Node %d HugePages_Surp:  %5u\n",
                nid, h->nr_huge_pages_node[nid],
                nid, h->free_huge_pages_node[nid],
                nid, h->surplus_huge_pages_node[nid]);
}
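
The seq_printf() format above is what produces the HugePages_* lines in /proc/meminfo. As a quick illustration, a standalone userspace sketch (not part of hugetlb.c; names and parsing are the editor's assumptions) can read those counters back like this:

/* Illustrative userspace sketch: parse the hugetlb counters that
 * hugetlb_report_meminfo() emits into /proc/meminfo. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/meminfo", "r");
        char line[128];
        unsigned long total = 0, free_pages = 0, rsvd = 0, surp = 0, size_kb = 0;

        if (!f) {
                perror("/proc/meminfo");
                return 1;
        }
        while (fgets(line, sizeof(line), f)) {
                /* non-matching lines leave the variables untouched */
                sscanf(line, "HugePages_Total: %lu", &total);
                sscanf(line, "HugePages_Free: %lu", &free_pages);
                sscanf(line, "HugePages_Rsvd: %lu", &rsvd);
                sscanf(line, "HugePages_Surp: %lu", &surp);
                sscanf(line, "Hugepagesize: %lu kB", &size_kb);
        }
        fclose(f);
        printf("total=%lu free=%lu rsvd=%lu surp=%lu hugepagesize=%lu kB\n",
               total, free_pages, rsvd, surp, size_kb);
        return 0;
}
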
/*
 * Return the number of pages of memory we physically have, in PAGE_SIZE
 * units.
 */
unsigned long hugetlb_total_pages(void)
{
        struct hstate *h = &default_hstate;
        return h->nr_huge_pages * pages_per_huge_page(h);
}

static int hugetlb_acct_memory(struct hstate *h, long delta)
{
        int ret = -ENOMEM;

        spin_lock(&hugetlb_lock);
        /*
         * When cpusets are configured, strict hugetlb page reservation is
         * broken because the accounting is done on a global variable. Such
         * a reservation is essentially meaningless with cpusets, since it
         * is never checked against page availability for the current
         * cpuset: an application can still be OOM-killed by the kernel if
         * the cpuset its task runs in has no free huge pages. Enforcing
         * strict accounting with cpusets is nearly impossible (or too
         * ugly), because cpusets are too fluid; tasks and memory nodes can
         * be moved between cpusets dynamically.
         *
         * Changing the semantics of shared hugetlb mappings under cpusets
         * is undesirable. However, to preserve some of those semantics, we
         * fall back to checking against the current free page availability,
         * as a best effort that hopefully minimizes the impact of the
         * semantics cpusets do change.
         */
        if (delta > 0) {
                if (gather_surplus_pages(h, delta) < 0)
                        goto out;

                if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
                        return_unused_surplus_pages(h, delta);
                        goto out;
                }
        }

        ret = 0;
        if (delta < 0)
                return_unused_surplus_pages(h, (unsigned long) -delta);

out:
        spin_unlock(&hugetlb_lock);
        return ret;
}

static void hugetlb_vm_op_open(struct vm_area_struct *vma)
{
        struct resv_map *reservations = vma_resv_map(vma);

        /*
         * This new VMA should share its sibling's reservation map if
         * present. The VMA will only ever have a valid reservation map
         * pointer where it has been copied from another, still existing
         * VMA. As that VMA holds a reference to the reservation map, the
         * map cannot disappear until after this open call completes. It is
         * therefore safe to take a new reference here without additional
         * locking.
         */
        if (reservations)
                kref_get(&reservations->refs);
}

static void hugetlb_vm_op_close(struct vm_area_struct *vma)
{
        struct hstate *h = hstate_vma(vma);
        struct resv_map *reservations = vma_resv_map(vma);
        unsigned long reserve;
        unsigned long start;
        unsigned long end;

        if (reservations) {
                start = vma_hugecache_offset(h, vma, vma->vm_start);
                end = vma_hugecache_offset(h, vma, vma->vm_end);

                reserve = (end - start) -
                        region_count(&reservations->regions, start, end);

                kref_put(&reservations->refs, resv_map_release);

                if (reserve) {
                        hugetlb_acct_memory(h, -reserve);
                        hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
                }
        }
}
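
hugetlb_vm_op_open() and hugetlb_vm_op_close() manage the lifetime of the shared reservation map purely through kref_get()/kref_put(): every VMA copied from an existing private mapping takes one reference, and the map (plus any unused reserve) is released when the last reference drops. A minimal userspace analogue of that pattern (hypothetical names, a plain counter standing in for struct kref) looks like this:

/* Userspace analogue of the reservation-map refcounting above
 * (illustrative only; names and types are made up). */
#include <stdio.h>
#include <stdlib.h>

struct resv_map_demo {
        int refs;                       /* stands in for struct kref */
        /* the reserved-region list would live here */
};

static void resv_get(struct resv_map_demo *map)
{
        map->refs++;                    /* vm_op_open: share the sibling's map */
}

static void resv_put(struct resv_map_demo *map)
{
        if (--map->refs == 0) {         /* vm_op_close: last user releases it */
                printf("releasing reservation map\n");
                free(map);
        }
}

int main(void)
{
        struct resv_map_demo *map = calloc(1, sizeof(*map));

        if (!map)
                return 1;
        map->refs = 1;                  /* the owner VMA holds the first reference */
        resv_get(map);                  /* a VMA copied at fork() takes its own */
        resv_put(map);                  /* one VMA closes: the map survives */
        resv_put(map);                  /* last VMA closes: the map is freed */
        return 0;
}
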
/*
 * We cannot handle pagefaults against hugetlb pages at all.  They cause
 * handle_mm_fault() to try to instantiate regular-sized pages in the
 * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
 * this far.
 */
static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        BUG();
        return 0;
}

struct vm_operations_struct hugetlb_vm_ops = {
        .fault = hugetlb_vm_op_fault,
        .open = hugetlb_vm_op_open,
        .close = hugetlb_vm_op_close,
};

static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
                                int writable)
{
        pte_t entry;

        if (writable) {
                entry =
                    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        } else {
                entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
        }
        entry = pte_mkyoung(entry);
        entry = pte_mkhuge(entry);

        return entry;
}

static void set_huge_ptep_writable(struct vm_area_struct *vma,
                                   unsigned long address, pte_t *ptep)
{
        pte_t entry;

        entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
        if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
                update_mmu_cache(vma, address, entry);
        }
}

int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
{
        pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr;
        int cow;
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);

        cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

        for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
                src_pte = huge_pte_offset(src, addr);
                if (!src_pte)
                        continue;
                dst_pte = huge_pte_alloc(dst, addr, sz);
                if (!dst_pte)
                        goto nomem;

                /* If the pagetables are shared, don't copy or take references */
                if (dst_pte == src_pte)
                        continue;

                spin_lock(&dst->page_table_lock);
                spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
                if (!huge_pte_none(huge_ptep_get(src_pte))) {
                        if (cow)
                                huge_ptep_set_wrprotect(src, addr, src_pte);
                        entry = huge_ptep_get(src_pte);
                        ptepage = pte_page(entry);
                        get_page(ptepage);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
                }
                spin_unlock(&src->page_table_lock);
                spin_unlock(&dst->page_table_lock);
        }
        return 0;

nomem:
        return -ENOMEM;
}
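
copy_hugetlb_page_range() is what runs at fork() for a hugetlb VMA: for a private, writable mapping it write-protects the PTEs so that the first subsequent write goes through the copy-on-write path (hugetlb_cow(), which begins later on this page). The userspace sketch below exercises exactly that case. It is an editor's illustration, not part of hugetlb.c: it assumes MAP_HUGETLB support, at least one free huge page, and a 2 MB huge page size; the fallback #define value is the x86 one.

/* Standalone sketch: a MAP_PRIVATE huge page copied across fork().
 * The child's write is served by hugetlb COW, so the parent's data
 * stays intact. */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000             /* x86 value; assumption for old headers */
#endif

#define LEN (2UL * 1024 * 1024)         /* one 2 MB huge page (x86 default size) */

int main(void)
{
        char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        pid_t pid;

        if (p == MAP_FAILED) {
                perror("mmap");         /* no huge pages reserved, or no support */
                return 1;
        }
        strcpy(p, "parent data");

        pid = fork();
        if (pid == 0) {
                strcpy(p, "child data");        /* first write: hugetlb COW */
                printf("child sees:  %s\n", p);
                return 0;
        }
        wait(NULL);
        printf("parent sees: %s\n", p); /* still "parent data" */
        munmap(p, LEN);
        return 0;
}
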
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                            unsigned long end, struct page *ref_page)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
        pte_t *ptep;
        pte_t pte;
        struct page *page;
        struct page *tmp;
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);

        /*
         * A page gathering list, protected by the per-file i_mmap_lock. The
         * lock is used to avoid list corruption from multiple unmapping
         * of the same page, since we are using page->lru.
         */
        LIST_HEAD(page_list);

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON(start & ~huge_page_mask(h));
        BUG_ON(end & ~huge_page_mask(h));

        mmu_notifier_invalidate_range_start(mm, start, end);
        spin_lock(&mm->page_table_lock);
        for (address = start; address < end; address += sz) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;

                if (huge_pmd_unshare(mm, &address, ptep))
                        continue;

                /*
                 * If a reference page is supplied, it is because a specific
                 * page is being unmapped, not a range. Ensure the page we
                 * are about to unmap is the actual page of interest.
                 */
                if (ref_page) {
                        pte = huge_ptep_get(ptep);
                        if (huge_pte_none(pte))
                                continue;
                        page = pte_page(pte);
                        if (page != ref_page)
                                continue;

                        /*
                         * Mark the VMA as having unmapped its page so that
                         * future faults in this VMA will fail rather than
                         * looking like data was lost.
                         */
                        set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
                }

                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (huge_pte_none(pte))
                        continue;

                page = pte_page(pte);
                if (pte_dirty(pte))
                        set_page_dirty(page);
                list_add(&page->lru, &page_list);
        }
        spin_unlock(&mm->page_table_lock);
        flush_tlb_range(vma, start, end);
        mmu_notifier_invalidate_range_end(mm, start, end);
        list_for_each_entry_safe(page, tmp, &page_list, lru) {
                list_del(&page->lru);
                put_page(page);
        }
}

void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                          unsigned long end, struct page *ref_page)
{
        spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
        __unmap_hugepage_range(vma, start, end, ref_page);
        spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
}

/*
 * This is called when the original mapper fails to COW a MAP_PRIVATE
 * mapping that it owns the reserve page for. The intention is to unmap the
 * page from other VMAs and let the children be SIGKILLed if they are
 * faulting the same region.
 */
static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                                struct page *page, unsigned long address)
{
        struct hstate *h = hstate_vma(vma);
        struct vm_area_struct *iter_vma;
        struct address_space *mapping;
        struct prio_tree_iter iter;
        pgoff_t pgoff;

        /*
         * vm_pgoff is in PAGE_SIZE units, hence the different calculation
         * from the page cache lookup, which is in HPAGE_SIZE units.
         */
        address = address & huge_page_mask(h);
        pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
                + (vma->vm_pgoff >> PAGE_SHIFT);
        mapping = (struct address_space *)page_private(page);

        vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                /* Do not unmap the current VMA */
                if (iter_vma == vma)
                        continue;

                /*
                 * Unmap the page from other VMAs without their own reserves.
                 * They get marked to be SIGKILLed if they fault in these
                 * areas. This is because a future no-page fault on this VMA
                 * could insert a zeroed page instead of the data existing
                 * from the time of fork. This would look like data
                 * corruption.
                 */
                if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
                        unmap_hugepage_range(iter_vma,
                                address, address + huge_page_size(h),
                                page);
        }

        return 1;
}

static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, pte_t pte,
                        struct page *pagecache_page)
{
        struct hstate *h = hstate_vma(vma);
        struct page *old_page, *new_page;
        int avoidcopy;
        int outside_reserve = 0;

        old_page = pte_page(pte);

retry_avoidcopy:
        /* If no-one else is actually using this page, avoid the copy
         * and just make the page writable */
