📄 hugetlb.c
字号:
/* Tail of a function whose start precedes this chunk. */
		}
	}
	return page;
}

/*
 * Dequeue a free huge page for the fault at @address in @vma, walking the
 * zonelist chosen by the task's mempolicy and honouring cpuset constraints
 * and the hstate's reserve accounting.
 *
 * Returns the page removed from the free list, or NULL if no suitable page
 * is available (or if taking one would steal another mapping's reserve).
 */
static struct page *dequeue_huge_page_vma(struct hstate *h,
				struct vm_area_struct *vma,
				unsigned long address, int avoid_reserve)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol, &nodemask);
	struct zone *zone;
	struct zoneref *z;

	/*
	 * A child process with MAP_PRIVATE mappings created by their parent
	 * have no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed
	 */
	if (!vma_has_reserves(vma) &&
			h->free_huge_pages - h->resv_huge_pages == 0)
		return NULL;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
		return NULL;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					MAX_NR_ZONES - 1, nodemask) {
		nid = zone_to_nid(zone);
		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
		    !list_empty(&h->hugepage_freelists[nid])) {
			page = list_entry(h->hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			h->free_huge_pages--;
			h->free_huge_pages_node[nid]--;
			/* Consume this vma's reservation unless told not to */
			if (!avoid_reserve)
				decrement_hugepage_resv_vma(h, vma);
			break;
		}
	}
	mpol_cond_put(mpol);
	return page;
}

/*
 * Return a huge page to the buddy allocator: drop it from the hstate
 * accounting, scrub the per-subpage flags left over from its life as a
 * huge page, restore a normal refcount/destructor, then free the order-N
 * block.  Gigantic pages (order >= MAX_ORDER) cannot take this path.
 */
static void update_and_free_page(struct hstate *h, struct page *page)
{
	int i;

	VM_BUG_ON(h->order >= MAX_ORDER);

	h->nr_huge_pages--;
	h->nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < pages_per_huge_page(h); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	arch_release_hugepage(page);
	__free_pages(page, huge_page_order(h));
}

/* Find the registered hstate whose huge page size is @size, or NULL. */
struct hstate *size_to_hstate(unsigned long size)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (huge_page_size(h) == size)
			return h;
	}
	return NULL;
}

/*
 * Compound page destructor for huge pages: returns the page either to the
 * hstate free lists or (for unneeded surplus pages) to the buddy allocator,
 * and releases the quota charged against the file's mapping.
 */
static void free_huge_page(struct page *page)
{
	/*
	 * Can't pass hstate in here because it
	 * is called from the compound page destructor, so the hstate is
	 * recovered from the page itself.
	 */
	struct hstate *h = page_hstate(page);
	int nid = page_to_nid(page);
	struct address_space *mapping;

	/* The owning address_space is carried in page_private; clear it. */
	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
		/* Surplus page no longer needed: give it back to buddy */
		update_and_free_page(h, page);
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(h, page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(struct hstate *h, int delta)
{
	static int prev_nid;	/* round-robin cursor across calls */
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !h->surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && h->surplus_huge_pages_node[nid] >=
						h->nr_huge_pages_node[nid])
			continue;

		h->surplus_huge_pages += delta;
		h->surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

/*
 * Turn a freshly allocated compound page into a huge page: install the
 * huge page destructor, account it to @h, then release our reference so
 * the destructor enqueues it on the hstate free lists.
 */
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
	set_compound_page_dtor(page, free_huge_page);
	spin_lock(&hugetlb_lock);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
	spin_unlock(&hugetlb_lock);
	put_page(page); /* free it into the hugepage allocator */
}

/*
 * Allocate a fresh huge page from the buddy allocator on node @nid.
 * Returns NULL on allocation failure, for gigantic sizes buddy cannot
 * serve, or if arch preparation of the page fails.
 */
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
	struct page *page;

	if (h->order >= MAX_ORDER)
		return NULL;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
						__GFP_REPEAT|__GFP_NOWARN,
		huge_page_order(h));
	if (page) {
		if
		    (arch_prepare_hugepage(page)) {
			__free_pages(page, huge_page_order(h));
			return NULL;
		}
		prep_new_huge_page(h, page, nid);
	}
	return page;
}

/*
 * Use a helper variable to find the next node and then
 * copy it back to hugetlb_next_nid afterwards:
 * otherwise there's a window in which a racer might
 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
 * But we don't need to use a spin_lock here: it really
 * doesn't matter if occasionally a racer chooses the
 * same nid as we do.  Move nid forward in the mask even
 * if we just successfully allocated a hugepage so that
 * the next caller gets hugepages on the next node.
 */
static int hstate_next_node(struct hstate *h)
{
	int next_nid;

	next_nid = next_node(h->hugetlb_next_nid, node_online_map);
	if (next_nid == MAX_NUMNODES)
		next_nid = first_node(node_online_map);
	h->hugetlb_next_nid = next_nid;
	return next_nid;
}

/*
 * Try each online node in turn, starting at hugetlb_next_nid, until a
 * fresh huge page is allocated or every node has been attempted once.
 * Returns 1 on success, 0 on failure; counts the outcome as a vm event.
 */
static int alloc_fresh_huge_page(struct hstate *h)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = h->hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
		if (page)
			ret = 1;
		/* Advance even on success so allocations interleave nodes */
		next_nid = hstate_next_node(h);
	} while (!page && h->hugetlb_next_nid != start_nid);

	if (ret)
		count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	return ret;
}

/*
 * Allocate a surplus huge page straight from the buddy allocator, subject
 * to the nr_overcommit_huge_pages limit.  Accounting is done optimistically
 * before the allocation, as explained below.
 */
static struct page *alloc_buddy_huge_page(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	struct page *page;
	unsigned int nid;

	if (h->order >= MAX_ORDER)
		return NULL;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A.  B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus().
A * won't be able to increment the per-node counter, until the * lock is dropped by B, but B doesn't drop hugetlb_lock until * no more huge pages can be converted from surplus to normal * state (and doesn't try to convert again). Thus, we have a * case where a surplus huge page exists, the pool is grown, and * the surplus huge page still exists after, even though it * should just have been converted to a normal huge page. This * does not leak memory, though, as the hugepage will be freed * once it is out of use. It also does not allow the counters to * go out of whack in adjust_pool_surplus() as we don't modify * the node values until we've gotten the hugepage and only the * per-node value is checked there. */ spin_lock(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { spin_unlock(&hugetlb_lock); return NULL; } else { h->nr_huge_pages++; h->surplus_huge_pages++; } spin_unlock(&hugetlb_lock); page = alloc_pages(htlb_alloc_mask|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); return NULL; } spin_lock(&hugetlb_lock); if (page) { /* * This page is now managed by the hugetlb allocator and has * no users -- drop the buddy allocator's reference. */ put_page_testzero(page); VM_BUG_ON(page_count(page)); nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* * We incremented the global counters already */ h->nr_huge_pages_node[nid]++; h->surplus_huge_pages_node[nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { h->nr_huge_pages--; h->surplus_huge_pages--; __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); return page;}/* * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. 
 */
static int gather_surplus_pages(struct hstate *h, int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	/*
	 * Entered with hugetlb_lock held (it is unlocked/relocked below);
	 * returns 0 with the reservation committed, or -ENOMEM.
	 */
	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
	if (needed <= 0) {
		/* Enough free pages already: just commit the reservation */
		h->resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(h, NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (h->resv_huge_pages + delta) -
			(h->free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	h->resv_huge_pages += delta;
	ret = 0;
free:
	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		list_del(&page->lru);
		enqueue_huge_page(h, page);
	}

	/* Free unnecessary surplus pages to the buddy allocator */
	if (!list_empty(&surplus_list)) {
		spin_unlock(&hugetlb_lock);
		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
			list_del(&page->lru);
			/*
			 * The page has a reference count of zero already, so
			 * call free_huge_page directly instead of using
			 * put_page.
			 * This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			free_huge_page(page);
		}
		spin_lock(&hugetlb_lock);
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they
 * were never used.
 */
static void return_unused_surplus_pages(struct hstate *h,
					unsigned long unused_resv_pages)
{
	static int nid = -1;	/* round-robin cursor, persists across calls */
	struct page *page;
	unsigned long nr_pages;

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes. Iterate across all nodes until we
	 * can no longer free unreserved surplus pages. This occurs when
	 * the nodes with surplus pages have no free pages.
	 */
	unsigned long remaining_iterations = num_online_nodes();

	/* Uncommit the reservation */
	h->resv_huge_pages -= unused_resv_pages;

	/* Cannot return gigantic pages currently */
	if (h->order >= MAX_ORDER)
		return;

	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);

	while (remaining_iterations-- && nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!h->surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&h->hugepage_freelists[nid])) {
			page = list_entry(h->hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(h, page);
			h->free_huge_pages--;
			h->free_huge_pages_node[nid]--;
			h->surplus_huge_pages--;
			h->surplus_huge_pages_node[nid]--;
			nr_pages--;
			/* Progress was made: allow a full pass again */
			remaining_iterations = num_online_nodes();
		}
	}
}

/*
 * Determine if the huge page at addr within the vma has an associated
 * reservation.  Where it does not we will need to logically increase
 * reservation and actually increase quota before an allocation can occur.
 * Where any new reservation would be required the reservation change is
 * prepared, but not committed.
 * Once the page has been quota'd, allocated
 * and instantiated, the change should be committed via
 * vma_commit_reservation.  No action is required on failure.
 */
static int vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma,
							unsigned long addr)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;

	if (vma->vm_flags & VM_SHARED) {
		/* Shared mappings track reservations on the inode */
		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
		return region_chg(&inode->i_mapping->private_list,
							idx, idx + 1);

	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		/* Private mapping without a reserve map: always needs one */
		return 1;

	} else {
		/* Private mapping owning its reserves: consult its map */
		int err;
		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
		struct resv_map *reservations = vma_resv_map(vma);

		err = region_chg(&reservations->regions, idx, idx + 1);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -