📄 hugetlb.c
字号:
/* Tail of a function whose start precedes this chunk. */
		}
	}
	return page;
}

/*
 * Dequeue a free huge page for the fault at @address in @vma, walking the
 * zonelist chosen by the task's mempolicy and honouring cpuset constraints
 * and the hstate's reserve accounting.
 *
 * Returns the page removed from the free list, or NULL if no suitable page
 * is available (or if taking one would steal another mapping's reserve).
 */
static struct page *dequeue_huge_page_vma(struct hstate *h,
				struct vm_area_struct *vma,
				unsigned long address, int avoid_reserve)
{
	int nid;
	struct page *page = NULL;
	struct mempolicy *mpol;
	nodemask_t *nodemask;
	struct zonelist *zonelist = huge_zonelist(vma, address,
					htlb_alloc_mask, &mpol, &nodemask);
	struct zone *zone;
	struct zoneref *z;

	/*
	 * A child process with MAP_PRIVATE mappings created by their parent
	 * have no page reserves. This check ensures that reservations are
	 * not "stolen". The child may still get SIGKILLed
	 */
	if (!vma_has_reserves(vma) &&
			h->free_huge_pages - h->resv_huge_pages == 0)
		return NULL;

	/* If reserves cannot be used, ensure enough pages are in the pool */
	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
		return NULL;

	for_each_zone_zonelist_nodemask(zone, z, zonelist,
					MAX_NR_ZONES - 1, nodemask) {
		nid = zone_to_nid(zone);
		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
		    !list_empty(&h->hugepage_freelists[nid])) {
			page = list_entry(h->hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			h->free_huge_pages--;
			h->free_huge_pages_node[nid]--;
			/* Consume this vma's reservation unless told not to */
			if (!avoid_reserve)
				decrement_hugepage_resv_vma(h, vma);
			break;
		}
	}
	mpol_cond_put(mpol);
	return page;
}

/*
 * Return a huge page to the buddy allocator: drop it from the hstate
 * accounting, scrub the per-subpage flags left over from its life as a
 * huge page, restore a normal refcount/destructor, then free the order-N
 * block.  Gigantic pages (order >= MAX_ORDER) cannot take this path.
 */
static void update_and_free_page(struct hstate *h, struct page *page)
{
	int i;

	VM_BUG_ON(h->order >= MAX_ORDER);

	h->nr_huge_pages--;
	h->nr_huge_pages_node[page_to_nid(page)]--;
	for (i = 0; i < pages_per_huge_page(h); i++) {
		page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
				1 << PG_referenced | 1 << PG_dirty |
				1 << PG_active | 1 << PG_reserved |
				1 << PG_private | 1 << PG_writeback);
	}
	set_compound_page_dtor(page, NULL);
	set_page_refcounted(page);
	arch_release_hugepage(page);
	__free_pages(page, huge_page_order(h));
}

/* Find the registered hstate whose huge page size is @size, or NULL. */
struct hstate *size_to_hstate(unsigned long size)
{
	struct hstate *h;

	for_each_hstate(h) {
		if (huge_page_size(h) == size)
			return h;
	}
	return NULL;
}

/*
 * Compound page destructor for huge pages: returns the page either to the
 * hstate free lists or (for unneeded surplus pages) to the buddy allocator,
 * and releases the quota charged against the file's mapping.
 */
static void free_huge_page(struct page *page)
{
	/*
	 * Can't pass hstate in here because it
	 * is called from the compound page destructor, so the hstate is
	 * recovered from the page itself.
	 */
	struct hstate *h = page_hstate(page);
	int nid = page_to_nid(page);
	struct address_space *mapping;

	/* The owning address_space is carried in page_private; clear it. */
	mapping = (struct address_space *) page_private(page);
	set_page_private(page, 0);
	BUG_ON(page_count(page));
	INIT_LIST_HEAD(&page->lru);

	spin_lock(&hugetlb_lock);
	if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
		/* Surplus page no longer needed: give it back to buddy */
		update_and_free_page(h, page);
		h->surplus_huge_pages--;
		h->surplus_huge_pages_node[nid]--;
	} else {
		enqueue_huge_page(h, page);
	}
	spin_unlock(&hugetlb_lock);
	if (mapping)
		hugetlb_put_quota(mapping, 1);
}

/*
 * Increment or decrement surplus_huge_pages.  Keep node-specific counters
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
static int adjust_pool_surplus(struct hstate *h, int delta)
{
	static int prev_nid;	/* round-robin cursor across calls */
	int nid = prev_nid;
	int ret = 0;

	VM_BUG_ON(delta != -1 && delta != 1);
	do {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		/* To shrink on this node, there must be a surplus page */
		if (delta < 0 && !h->surplus_huge_pages_node[nid])
			continue;
		/* Surplus cannot exceed the total number of pages */
		if (delta > 0 && h->surplus_huge_pages_node[nid] >=
						h->nr_huge_pages_node[nid])
			continue;

		h->surplus_huge_pages += delta;
		h->surplus_huge_pages_node[nid] += delta;
		ret = 1;
		break;
	} while (nid != prev_nid);

	prev_nid = nid;
	return ret;
}

/*
 * Turn a freshly allocated compound page into a huge page: install the
 * huge page destructor, account it to @h, then release our reference so
 * the destructor enqueues it on the hstate free lists.
 */
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
	set_compound_page_dtor(page, free_huge_page);
	spin_lock(&hugetlb_lock);
	h->nr_huge_pages++;
	h->nr_huge_pages_node[nid]++;
	spin_unlock(&hugetlb_lock);
	put_page(page); /* free it into the hugepage allocator */
}

/*
 * Allocate a fresh huge page from the buddy allocator on node @nid.
 * Returns NULL on allocation failure, for gigantic sizes buddy cannot
 * serve, or if arch preparation of the page fails.
 */
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
	struct page *page;

	if (h->order >= MAX_ORDER)
		return NULL;

	page = alloc_pages_node(nid,
		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
						__GFP_REPEAT|__GFP_NOWARN,
		huge_page_order(h));
	if (page) {
		if
		    (arch_prepare_hugepage(page)) {
			__free_pages(page, huge_page_order(h));
			return NULL;
		}
		prep_new_huge_page(h, page, nid);
	}
	return page;
}

/*
 * Use a helper variable to find the next node and then
 * copy it back to hugetlb_next_nid afterwards:
 * otherwise there's a window in which a racer might
 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
 * But we don't need to use a spin_lock here: it really
 * doesn't matter if occasionally a racer chooses the
 * same nid as we do.  Move nid forward in the mask even
 * if we just successfully allocated a hugepage so that
 * the next caller gets hugepages on the next node.
 */
static int hstate_next_node(struct hstate *h)
{
	int next_nid;

	next_nid = next_node(h->hugetlb_next_nid, node_online_map);
	if (next_nid == MAX_NUMNODES)
		next_nid = first_node(node_online_map);
	h->hugetlb_next_nid = next_nid;
	return next_nid;
}

/*
 * Try each online node in turn, starting at hugetlb_next_nid, until a
 * fresh huge page is allocated or every node has been attempted once.
 * Returns 1 on success, 0 on failure; counts the outcome as a vm event.
 */
static int alloc_fresh_huge_page(struct hstate *h)
{
	struct page *page;
	int start_nid;
	int next_nid;
	int ret = 0;

	start_nid = h->hugetlb_next_nid;

	do {
		page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
		if (page)
			ret = 1;
		/* Advance even on success so allocations interleave nodes */
		next_nid = hstate_next_node(h);
	} while (!page && h->hugetlb_next_nid != start_nid);

	if (ret)
		count_vm_event(HTLB_BUDDY_PGALLOC);
	else
		count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);

	return ret;
}

/*
 * Allocate a surplus huge page straight from the buddy allocator, subject
 * to the nr_overcommit_huge_pages limit.  Accounting is done optimistically
 * before the allocation, as explained below.
 */
static struct page *alloc_buddy_huge_page(struct hstate *h,
			struct vm_area_struct *vma, unsigned long address)
{
	struct page *page;
	unsigned int nid;

	if (h->order >= MAX_ORDER)
		return NULL;

	/*
	 * Assume we will successfully allocate the surplus page to
	 * prevent racing processes from causing the surplus to exceed
	 * overcommit
	 *
	 * This however introduces a different race, where a process B
	 * tries to grow the static hugepage pool while alloc_pages() is
	 * called by process A.  B will only examine the per-node
	 * counters in determining if surplus huge pages can be
	 * converted to normal huge pages in adjust_pool_surplus().
A * won't be able to increment the per-node counter, until the * lock is dropped by B, but B doesn't drop hugetlb_lock until * no more huge pages can be converted from surplus to normal * state (and doesn't try to convert again). Thus, we have a * case where a surplus huge page exists, the pool is grown, and * the surplus huge page still exists after, even though it * should just have been converted to a normal huge page. This * does not leak memory, though, as the hugepage will be freed * once it is out of use. It also does not allow the counters to * go out of whack in adjust_pool_surplus() as we don't modify * the node values until we've gotten the hugepage and only the * per-node value is checked there. */ spin_lock(&hugetlb_lock); if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { spin_unlock(&hugetlb_lock); return NULL; } else { h->nr_huge_pages++; h->surplus_huge_pages++; } spin_unlock(&hugetlb_lock); page = alloc_pages(htlb_alloc_mask|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); return NULL; } spin_lock(&hugetlb_lock); if (page) { /* * This page is now managed by the hugetlb allocator and has * no users -- drop the buddy allocator's reference. */ put_page_testzero(page); VM_BUG_ON(page_count(page)); nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* * We incremented the global counters already */ h->nr_huge_pages_node[nid]++; h->surplus_huge_pages_node[nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { h->nr_huge_pages--; h->surplus_huge_pages--; __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); return page;}/* * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. 
 */
static int gather_surplus_pages(struct hstate *h, int delta)
{
	struct list_head surplus_list;
	struct page *page, *tmp;
	int ret, i;
	int needed, allocated;

	/*
	 * Entered with hugetlb_lock held (it is unlocked/relocked below);
	 * returns 0 with the reservation committed, or -ENOMEM.
	 */
	needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
	if (needed <= 0) {
		/* Enough free pages already: just commit the reservation */
		h->resv_huge_pages += delta;
		return 0;
	}

	allocated = 0;
	INIT_LIST_HEAD(&surplus_list);

	ret = -ENOMEM;
retry:
	spin_unlock(&hugetlb_lock);
	for (i = 0; i < needed; i++) {
		page = alloc_buddy_huge_page(h, NULL, 0);
		if (!page) {
			/*
			 * We were not able to allocate enough pages to
			 * satisfy the entire reservation so we free what
			 * we've allocated so far.
			 */
			spin_lock(&hugetlb_lock);
			needed = 0;
			goto free;
		}

		list_add(&page->lru, &surplus_list);
	}
	allocated += needed;

	/*
	 * After retaking hugetlb_lock, we need to recalculate 'needed'
	 * because either resv_huge_pages or free_huge_pages may have changed.
	 */
	spin_lock(&hugetlb_lock);
	needed = (h->resv_huge_pages + delta) -
			(h->free_huge_pages + allocated);
	if (needed > 0)
		goto retry;

	/*
	 * The surplus_list now contains _at_least_ the number of extra pages
	 * needed to accommodate the reservation.  Add the appropriate number
	 * of pages to the hugetlb pool and free the extras back to the buddy
	 * allocator.  Commit the entire reservation here to prevent another
	 * process from stealing the pages as they are added to the pool but
	 * before they are reserved.
	 */
	needed += allocated;
	h->resv_huge_pages += delta;
	ret = 0;
free:
	/* Free the needed pages to the hugetlb pool */
	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
		if ((--needed) < 0)
			break;
		list_del(&page->lru);
		enqueue_huge_page(h, page);
	}

	/* Free unnecessary surplus pages to the buddy allocator */
	if (!list_empty(&surplus_list)) {
		spin_unlock(&hugetlb_lock);
		list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
			list_del(&page->lru);
			/*
			 * The page has a reference count of zero already, so
			 * call free_huge_page directly instead of using
			 * put_page.
			 * This must be done with hugetlb_lock
			 * unlocked which is safe because free_huge_page takes
			 * hugetlb_lock before deciding how to free the page.
			 */
			free_huge_page(page);
		}
		spin_lock(&hugetlb_lock);
	}

	return ret;
}

/*
 * When releasing a hugetlb pool reservation, any surplus pages that were
 * allocated to satisfy the reservation must be explicitly freed if they
 * were never used.
 */
static void return_unused_surplus_pages(struct hstate *h,
					unsigned long unused_resv_pages)
{
	static int nid = -1;	/* round-robin cursor, persists across calls */
	struct page *page;
	unsigned long nr_pages;

	/*
	 * We want to release as many surplus pages as possible, spread
	 * evenly across all nodes. Iterate across all nodes until we
	 * can no longer free unreserved surplus pages. This occurs when
	 * the nodes with surplus pages have no free pages.
	 */
	unsigned long remaining_iterations = num_online_nodes();

	/* Uncommit the reservation */
	h->resv_huge_pages -= unused_resv_pages;

	/* Cannot return gigantic pages currently */
	if (h->order >= MAX_ORDER)
		return;

	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);

	while (remaining_iterations-- && nr_pages) {
		nid = next_node(nid, node_online_map);
		if (nid == MAX_NUMNODES)
			nid = first_node(node_online_map);

		if (!h->surplus_huge_pages_node[nid])
			continue;

		if (!list_empty(&h->hugepage_freelists[nid])) {
			page = list_entry(h->hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			update_and_free_page(h, page);
			h->free_huge_pages--;
			h->free_huge_pages_node[nid]--;
			h->surplus_huge_pages--;
			h->surplus_huge_pages_node[nid]--;
			nr_pages--;
			/* Progress was made: allow a full pass again */
			remaining_iterations = num_online_nodes();
		}
	}
}

/*
 * Determine if the huge page at addr within the vma has an associated
 * reservation.  Where it does not we will need to logically increase
 * reservation and actually increase quota before an allocation can occur.
 * Where any new reservation would be required the reservation change is
 * prepared, but not committed.
 * Once the page has been quota'd, allocated
 * and instantiated, the change should be committed via
 * vma_commit_reservation.  No action is required on failure.
 */
static int vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma,
							unsigned long addr)
{
	struct address_space *mapping = vma->vm_file->f_mapping;
	struct inode *inode = mapping->host;

	if (vma->vm_flags & VM_SHARED) {
		/* Shared mappings track reservations on the inode */
		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
		return region_chg(&inode->i_mapping->private_list,
							idx, idx + 1);

	} else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
		/* Private mapping without a reserve map: always needs one */
		return 1;

	} else {
		/* Private mapping owning its reserves: consult its map */
		int err;
		pgoff_t idx = vma_hugecache_offset(h, vma, addr);
		struct resv_map *reservations = vma_resv_map(vma);

		err = region_chg(&reservations->regions, idx, idx + 1);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -