📄 hugetlb.c
        avoidcopy = (page_count(old_page) == 1);
        if (avoidcopy) {
                set_huge_ptep_writable(vma, address, ptep);
                return 0;
        }

        /*
         * If the process that created a MAP_PRIVATE mapping is about to
         * perform a COW due to a shared page count, attempt to satisfy
         * the allocation without using the existing reserves. The pagecache
         * page is used to determine if the reserve at this address was
         * consumed or not. If reserves were used, a partial faulted mapping
         * at the time of fork() could consume its reserves on COW instead
         * of the full address range.
         */
        if (!(vma->vm_flags & VM_SHARED) &&
                        is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
                        old_page != pagecache_page)
                outside_reserve = 1;

        page_cache_get(old_page);
        new_page = alloc_huge_page(vma, address, outside_reserve);

        if (IS_ERR(new_page)) {
                page_cache_release(old_page);

                /*
                 * If a process owning a MAP_PRIVATE mapping fails to COW,
                 * it is due to references held by a child and an insufficient
                 * huge page pool. To guarantee the original mapper's
                 * reliability, unmap the page from child processes. The child
                 * may get SIGKILLed if it later faults.
                 */
                if (outside_reserve) {
                        BUG_ON(huge_pte_none(pte));
                        if (unmap_ref_private(mm, vma, old_page, address)) {
                                BUG_ON(page_count(old_page) != 1);
                                BUG_ON(huge_pte_none(pte));
                                goto retry_avoidcopy;
                        }
                        WARN_ON_ONCE(1);
                }

                return -PTR_ERR(new_page);
        }

        spin_unlock(&mm->page_table_lock);
        copy_huge_page(new_page, old_page, address, vma);
        __SetPageUptodate(new_page);
        spin_lock(&mm->page_table_lock);

        ptep = huge_pte_offset(mm, address & huge_page_mask(h));
        if (likely(pte_same(huge_ptep_get(ptep), pte))) {
                /* Break COW */
                huge_ptep_clear_flush(vma, address, ptep);
                set_huge_pte_at(mm, address, ptep,
                                make_huge_pte(vma, new_page, 1));
                /* Make the old page be freed below */
                new_page = old_page;
        }
        page_cache_release(new_page);
        page_cache_release(old_page);
        return 0;
}

/* Return the pagecache page at a given address within a VMA */
static struct page *hugetlbfs_pagecache_page(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long address)
{
        struct address_space *mapping;
        pgoff_t idx;

        mapping = vma->vm_file->f_mapping;
        idx = vma_hugecache_offset(h, vma, address);

        return find_lock_page(mapping, idx);
}

static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, int write_access)
{
        struct hstate *h = hstate_vma(vma);
        int ret = VM_FAULT_SIGBUS;
        pgoff_t idx;
        unsigned long size;
        struct page *page;
        struct address_space *mapping;
        pte_t new_pte;

        /*
         * Currently, we are forced to kill the process in the event the
         * original mapper has unmapped pages from the child due to a failed
         * COW. Warn that such a situation has occurred as it may not be
         * obvious.
         */
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                printk(KERN_WARNING
                        "PID %d killed due to inadequate hugepage pool\n",
                        current->pid);
                return ret;
        }

        mapping = vma->vm_file->f_mapping;
        idx = vma_hugecache_offset(h, vma, address);

        /*
         * Use page lock to guard against racing truncation
         * before we get page_table_lock.
         */
retry:
        page = find_lock_page(mapping, idx);
        if (!page) {
                size = i_size_read(mapping->host) >> huge_page_shift(h);
                if (idx >= size)
                        goto out;
                page = alloc_huge_page(vma, address, 0);
                if (IS_ERR(page)) {
                        ret = -PTR_ERR(page);
                        goto out;
                }
                clear_huge_page(page, address, huge_page_size(h));
                __SetPageUptodate(page);

                if (vma->vm_flags & VM_SHARED) {
                        int err;
                        struct inode *inode = mapping->host;

                        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
                        if (err) {
                                put_page(page);
                                if (err == -EEXIST)
                                        goto retry;
                                goto out;
                        }

                        spin_lock(&inode->i_lock);
                        inode->i_blocks += blocks_per_huge_page(h);
                        spin_unlock(&inode->i_lock);
                } else
                        lock_page(page);
        }

        /*
         * If we are going to COW a private mapping later, we examine the
         * pending reservations for this page now. This will ensure that
         * any allocations necessary to record that reservation occur outside
         * the spinlock.
         */
        if (write_access && !(vma->vm_flags & VM_SHARED))
                if (vma_needs_reservation(h, vma, address) < 0) {
                        ret = VM_FAULT_OOM;
                        goto backout_unlocked;
                }

        spin_lock(&mm->page_table_lock);
        size = i_size_read(mapping->host) >> huge_page_shift(h);
        if (idx >= size)
                goto backout;

        ret = 0;
        if (!huge_pte_none(huge_ptep_get(ptep)))
                goto backout;

        new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
                                && (vma->vm_flags & VM_SHARED)));
        set_huge_pte_at(mm, address, ptep, new_pte);

        if (write_access && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
                ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
        }

        spin_unlock(&mm->page_table_lock);
        unlock_page(page);
out:
        return ret;

backout:
        spin_unlock(&mm->page_table_lock);
backout_unlocked:
        unlock_page(page);
        put_page(page);
        goto out;
}

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, int write_access)
{
        pte_t *ptep;
        pte_t entry;
        int ret;
        struct page *pagecache_page = NULL;
        static DEFINE_MUTEX(hugetlb_instantiation_mutex);
        struct hstate *h = hstate_vma(vma);

        ptep = huge_pte_alloc(mm, address, huge_page_size(h));
        if (!ptep)
                return VM_FAULT_OOM;

        /*
         * Serialize hugepage allocation and instantiation, so that we don't
         * get spurious allocation failures if two CPUs race to instantiate
         * the same page in the page cache.
         */
        mutex_lock(&hugetlb_instantiation_mutex);
        entry = huge_ptep_get(ptep);
        if (huge_pte_none(entry)) {
                ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
                goto out_mutex;
        }

        ret = 0;

        /*
         * If we are going to COW the mapping later, we examine the pending
         * reservations for this page now. This will ensure that any
         * allocations necessary to record that reservation occur outside the
         * spinlock. For private mappings, we also lookup the pagecache
         * page now as it is used to determine if a reservation has been
         * consumed.
         */
        if (write_access && !pte_write(entry)) {
                if (vma_needs_reservation(h, vma, address) < 0) {
                        ret = VM_FAULT_OOM;
                        goto out_mutex;
                }

                if (!(vma->vm_flags & VM_SHARED))
                        pagecache_page = hugetlbfs_pagecache_page(h,
                                                                vma, address);
        }

        spin_lock(&mm->page_table_lock);
        /* Check for a racing update before calling hugetlb_cow */
        if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
                goto out_page_table_lock;

        if (write_access) {
                if (!pte_write(entry)) {
                        ret = hugetlb_cow(mm, vma, address, ptep, entry,
                                                        pagecache_page);
                        goto out_page_table_lock;
                }
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
        if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access))
                update_mmu_cache(vma, address, entry);

out_page_table_lock:
        spin_unlock(&mm->page_table_lock);

        if (pagecache_page) {
                unlock_page(pagecache_page);
                put_page(pagecache_page);
        }

out_mutex:
        mutex_unlock(&hugetlb_instantiation_mutex);

        return ret;
}

/* Can be overridden by architectures */
__attribute__((weak)) struct page *
follow_huge_pud(struct mm_struct *mm, unsigned long address,
                pud_t *pud, int write)
{
        BUG();
        return NULL;
}

static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
{
        if (!ptep || write || shared)
                return 0;
        else
                return huge_pte_none(huge_ptep_get(ptep));
}

int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct page **pages, struct vm_area_struct **vmas,
                        unsigned long *position, int *length, int i,
                        int write)
{
        unsigned long pfn_offset;
        unsigned long vaddr = *position;
        int remainder = *length;
        struct hstate *h = hstate_vma(vma);
        int zeropage_ok = 0;
        int shared = vma->vm_flags & VM_SHARED;

        spin_lock(&mm->page_table_lock);
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
                struct page *page;

                /*
                 * Some archs (sparc64, sh*) have multiple pte_ts to
                 * each hugepage. We have to make sure we get the
                 * first, for the page indexing below to work.
                 */
                pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
                if (huge_zeropage_ok(pte, write, shared))
                        zeropage_ok = 1;

                if (!pte ||
                    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
                    (write && !pte_write(huge_ptep_get(pte)))) {
                        int ret;

                        spin_unlock(&mm->page_table_lock);
                        ret = hugetlb_fault(mm, vma, vaddr, write);
                        spin_lock(&mm->page_table_lock);
                        if (!(ret & VM_FAULT_ERROR))
                                continue;

                        remainder = 0;
                        if (!i)
                                i = -EFAULT;
                        break;
                }

                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
                page = pte_page(huge_ptep_get(pte));
same_page:
                if (pages) {
                        if (zeropage_ok)
                                pages[i] = ZERO_PAGE(0);
                        else
                                pages[i] = mem_map_offset(page, pfn_offset);
                        get_page(pages[i]);
                }

                if (vmas)
                        vmas[i] = vma;

                vaddr += PAGE_SIZE;
                ++pfn_offset;
                --remainder;
                ++i;
                if (vaddr < vma->vm_end && remainder &&
                                pfn_offset < pages_per_huge_page(h)) {
                        /*
                         * We use pfn_offset to avoid touching the pageframes
                         * of this compound page.
                         */
                        goto same_page;
                }
        }
        spin_unlock(&mm->page_table_lock);
        *length = remainder;
        *position = vaddr;

        return i;
}

void hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long start = address;
        pte_t *ptep;
        pte_t pte;
        struct hstate *h = hstate_vma(vma);

        BUG_ON(address >= end);
        flush_cache_range(vma, address, end);

        spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
        spin_lock(&mm->page_table_lock);
        for (; address < end; address += huge_page_size(h)) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;
                if (huge_pmd_unshare(mm, &address, ptep))
                        continue;
                if (!huge_pte_none(huge_ptep_get(ptep))) {
                        pte = huge_ptep_get_and_clear(mm, address, ptep);
                        pte = pte_mkhuge(pte_modify(pte, newprot));
                        set_huge_pte_at(mm, address, ptep, pte);
                }
        }
        spin_unlock(&mm->page_table_lock);
        spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);

        flush_tlb_range(vma, start, end);
}

int hugetlb_reserve_pages(struct inode *inode,
                                        long from, long to,
                                        struct vm_area_struct *vma,
                                        int acctflag)
{
        long ret, chg;
        struct hstate *h = hstate_inode(inode);

        /*
         * Only apply hugepage reservation if asked. At fault time, an
         * attempt will be made for VM_NORESERVE to allocate a page
         * and filesystem quota without using reserves.
         */
        if (acctflag & VM_NORESERVE)
                return 0;

        /*
         * Shared mappings base their reservation on the number of pages that
         * are already allocated on behalf of the file. Private mappings need
         * to reserve the full area even if read-only as mprotect() may be
         * called to make the mapping read-write. Assume !vma is a shm mapping.
         */
        if (!vma || vma->vm_flags & VM_SHARED)
                chg = region_chg(&inode->i_mapping->private_list, from, to);
        else {
                struct resv_map *resv_map = resv_map_alloc();
                if (!resv_map)
                        return -ENOMEM;

                chg = to - from;

                set_vma_resv_map(vma, resv_map);
                set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
        }

        if (chg < 0)
                return chg;

        /* There must be enough filesystem quota for the mapping */
        if (hugetlb_get_quota(inode->i_mapping, chg))
                return -ENOSPC;

        /*
         * Check enough hugepages are available for the reservation.
         * Hand back the quota if there are not.
         */
        ret = hugetlb_acct_memory(h, chg);
        if (ret < 0) {
                hugetlb_put_quota(inode->i_mapping, chg);
                return ret;
        }

        /*
         * Account for the reservations made. Shared mappings record regions
         * that have reservations as they are shared by multiple VMAs.
         * When the last VMA disappears, the region map says how much
         * the reservation was and the page cache tells how much of
         * the reservation was consumed. Private mappings are per-VMA and
         * only the consumed reservations are tracked. When the VMA
         * disappears, the original reservation is the VMA size and the
         * consumed reservations are stored in the map. Hence, nothing
         * else has to be done for private mappings here.
         */
        if (!vma || vma->vm_flags & VM_SHARED)
                region_add(&inode->i_mapping->private_list, from, to);
        return 0;
}

void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
{
        struct hstate *h = hstate_inode(inode);
        long chg = region_truncate(&inode->i_mapping->private_list, offset);

        spin_lock(&inode->i_lock);
        inode->i_blocks -= blocks_per_huge_page(h);
        spin_unlock(&inode->i_lock);

        hugetlb_put_quota(inode->i_mapping, (chg - freed));
        hugetlb_acct_memory(h, -(chg - freed));
}
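To see how these kernel paths are exercised from userspace, here is a minimal sketch. It assumes hugetlbfs is mounted at /dev/hugepages, the huge page size is 2 MB, and the pool (/proc/sys/vm/nr_hugepages) has at least two free pages; the file name "demo" and the mount point are illustrative only, not part of the source above.

/*
 * Userspace sketch (not part of hugetlb.c): drives the fault and COW paths
 * shown above through a MAP_PRIVATE hugetlbfs mapping. Paths, sizes and the
 * file name are assumptions for illustration.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define HUGEFILE "/dev/hugepages/demo"  /* hypothetical hugetlbfs file */
#define LENGTH   (2UL * 1024 * 1024)    /* one huge page (arch-dependent) */

int main(void)
{
        int fd = open(HUGEFILE, O_CREAT | O_RDWR, 0600);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        /*
         * A MAP_PRIVATE mapping of a hugetlbfs file: at mmap() time,
         * hugetlb_reserve_pages() marks this VMA HPAGE_RESV_OWNER and
         * reserves "to - from" huge pages up front.
         */
        char *p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /*
         * First touch: the huge PTE is empty, so hugetlb_fault() takes the
         * hugetlb_no_page() path, allocates a huge page and, because this is
         * a write fault on a private mapping, finishes the COW in the same
         * fault ("do the COW without a second fault").
         */
        p[0] = 1;

        if (fork() == 0) {
                /*
                 * fork() write-protects the huge PTE in both parent and
                 * child for this private mapping, so the child's write
                 * reaches hugetlb_cow(). The page_count() is now greater
                 * than one, the avoidcopy shortcut is skipped, and the data
                 * is copied into a freshly allocated huge page.
                 */
                p[0] = 2;
                _exit(0);
        }
        wait(NULL);

        munmap(p, LENGTH);
        close(fd);
        unlink(HUGEFILE);
        return 0;
}

If the child's allocation cannot be satisfied, the listing's outside_reserve / HPAGE_RESV_UNMAPPED handling applies: the original mapper keeps its page and the child may be killed on a later fault, as the warning in hugetlb_no_page() describes.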