hugetlbpage.c
From the Linux 2.6.17.4 source tree · C source code · 1,067 lines total · page 1 of 2
C
1,067 lines
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#include <asm/tlb.h>		/* NOTE(review): duplicate include (already above) */
#include <linux/sysctl.h>	/* NOTE(review): duplicate include (already above) */

/* Number of low (below 4GB) and high huge-page areas in an address space. */
#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)

/* A "hugepte table" replaces a PMD (64K base pages) or a PUD (4K base
 * pages) in the normal page-table tree; these derive its geometry. */
#ifdef CONFIG_PPC_64K_PAGES
#define HUGEPTE_INDEX_SIZE	(PMD_SHIFT-HPAGE_SHIFT)
#else
#define HUGEPTE_INDEX_SIZE	(PUD_SHIFT-HPAGE_SHIFT)
#endif
#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << HUGEPTE_INDEX_SIZE)

/* Span of address space covered by one hugepte table. */
#define HUGEPD_SHIFT	(HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
#define HUGEPD_SIZE	(1UL << HUGEPD_SHIFT)
#define HUGEPD_MASK	(~(HUGEPD_SIZE-1))

#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
#define HUGEPD_OK	0x1

/* A "huge page directory" entry: a tagged pointer to a hugepte table. */
typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)	((hpd).pd == 0)

/* Strip the HUGEPD_OK tag and return the hugepte table this entry
 * points to.  BUGs if the entry was not marked as a huge PD. */
static inline pte_t *hugepd_page(hugepd_t hpd)
{
	BUG_ON(!(hpd.pd & HUGEPD_OK));
	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

/* Return the pte slot for @addr within the hugepte table that *hpdp
 * points to. */
static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
{
	unsigned long idx = ((addr >> HPAGE_SHIFT) &
			     (PTRS_PER_HUGEPTE-1));
	pte_t *dir = hugepd_page(*hpdp);

	return dir + idx;
}

/* Allocate a fresh hugepte table and install it in *hpdp, unless
 * another thread raced us and installed one first (in which case the
 * new table is freed again).  Returns 0 on success, -ENOMEM on
 * allocation failure.  Takes mm->page_table_lock for the install. */
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address)
{
	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
				      GFP_KERNEL|__GFP_REPEAT);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (!hugepd_none(*hpdp))
		/* lost the race: someone else installed a table */
		kmem_cache_free(huge_pgtable_cache, new);
	else
		hpdp->pd = (unsigned long)new | HUGEPD_OK;
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/* Modelled after find_linux_pte() */
/* Walk the page tables and return the huge pte for @addr, or NULL if
 * no hugepte table is present at any level.  Does not allocate. */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;

	BUG_ON(!in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = pud_offset(pg, addr);
		if (!pud_none(*pu)) {
#ifdef CONFIG_PPC_64K_PAGES
			/* with 64K base pages the hugepd lives at PMD level */
			pmd_t *pm;
			pm = pmd_offset(pu, addr);
			if (!pmd_none(*pm))
				return hugepte_offset((hugepd_t *)pm, addr);
#else
			/* with 4K base pages the hugepd lives at PUD level */
			return hugepte_offset((hugepd_t *)pu, addr);
#endif
		}
	}

	return NULL;
}

/* Like huge_pte_offset(), but allocate intermediate levels and the
 * hugepte table as needed.  Returns NULL on allocation failure. */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	hugepd_t *hpdp = NULL;

	BUG_ON(!in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	pu = pud_alloc(mm, pg, addr);

	if (pu) {
#ifdef CONFIG_PPC_64K_PAGES
		pmd_t *pm;
		pm = pmd_alloc(mm, pu, addr);
		if (pm)
			hpdp = (hugepd_t *)pm;
#else
		hpdp = (hugepd_t *)pu;
#endif
	}

	if (!hpdp)
		return NULL;

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
		return NULL;

	return hugepte_offset(hpdp, addr);
}

/* Detach the hugepte table from *hpdp and queue it for RCU-style
 * freeing via the mmu_gather, after the TLB flush it forces. */
static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
{
	pte_t *hugepte = hugepd_page(*hpdp);

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte,
						 HUGEPTE_CACHE_NUM,
						 HUGEPTE_TABLE_SIZE-1));
}

#ifdef CONFIG_PPC_64K_PAGES
/* Free all hugepte tables hanging off @pud in [addr, end), then free
 * the PMD page itself if [floor, ceiling) shows no other user of it.
 * The floor/ceiling "- 1" comparisons follow free_pgd_range(); see the
 * long comment in hugetlb_free_pgd_range() below. */
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
		free_hugepte_range(tlb, (hugepd_t *)pmd);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}
#endif

/* Free hugepte tables (4K base pages) or descend into PMDs (64K base
 * pages) for [addr, end) under @pgd, then free the PUD page itself if
 * [floor, ceiling) shows no other user of it. */
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
#ifdef CONFIG_PPC_64K_PAGES
		if (pud_none_or_clear_bad(pud))
			continue;
		hugetlb_free_pmd_range(tlb, pud, addr, next,
				       floor, ceiling);
#else
		if (pud_none(*pud))
			continue;
		free_hugepte_range(tlb, (hugepd_t *)pud);
#endif
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather **tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * Comments below take from the normal free_pgd_range().  They
	 * apply here too.  The tests against HUGEPD_MASK below are
	 * essential, because we *don't* test for this at the bottom
	 * level.  Without them we'll attempt to free a hugepte table
	 * when we unmap just part of it, even if there are other
	 * active mappings using it.
	 *
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing HUGEPD* at this top level?  Because
	 * often there will be no work to do at all, and we'd prefer
	 * not to go all the way down to the bottom just to discover
	 * that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we
	 * must be careful to reject "the opposite 0" before it
	 * confuses the subsequent tests.  But what about where end is
	 * brought down by HUGEPD_SIZE below? no, end can't go down to
	 * 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= HUGEPD_MASK;
	if (addr < floor) {
		addr += HUGEPD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= HUGEPD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= HUGEPD_SIZE;
	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset((*tlb)->mm, addr);
	do {
		BUG_ON(!in_hugepage_area((*tlb)->mm->context, addr));
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		hugetlb_free_pud_range(*tlb, pgd, addr, next,
				       floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

/* Install @pte at *ptep, first tearing down any existing mapping
 * (including its hash-table entry) so the hardware hash stays in sync. */
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_update (huge / !huge) */
		unsigned long old = pte_update(ptep, ~0UL);
		if (old & _PAGE_HASHPTE)
			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
		flush_tlb_pending();
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

/* Atomically clear the huge pte at *ptep (flushing its hash entry if
 * present) and return the old pte value. */
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(ptep, ~0UL);

	if (old & _PAGE_HASHPTE)
		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
	*ptep = __pte(0);

	return __pte(old);
}

/* Argument bundle for the cross-CPU SLB flush callbacks below. */
struct slb_flush_info {
	struct mm_struct *mm;
	u16 newareas;	/* bitmask of areas whose SLB entries must go */
};

/* IPI callback: invalidate SLB entries for each newly-opened low
 * (below-4GB) huge-page segment, on CPUs running fi->mm. */
static void flush_low_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);

	if (current->active_mm != fi->mm)
		return;

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_LOW_AREAS; i++) {
		if (!(fi->newareas & (1U << i)))
			continue;
		asm volatile("slbie %0"
			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

/* IPI callback: invalidate SLB entries for each newly-opened high
 * (above-4GB) huge-page area; each area spans several segments. */
static void flush_high_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i, j;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);

	if (current->active_mm != fi->mm)
		return;

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_HIGH_AREAS; i++) {
		if (!(fi->newareas & (1U << i)))
			continue;
		/* one slbie per 256MB segment within the area */
		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
			asm volatile("slbie %0"
				     :: "r" (((i << HTLB_AREA_SHIFT)
					      + (j << SID_SHIFT)) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

/* Check that low area @area holds no existing VMAs, so it is safe to
 * convert it to a huge-page segment.  Returns 0 or -EBUSY. */
static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << SID_SHIFT;
	unsigned long end = (area+1) << SID_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_LOW_AREAS);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

/* Check that high area @area holds no existing VMAs.  Returns 0 or
 * -EBUSY. */
static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << HTLB_AREA_SHIFT;
	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_HIGH_AREAS);

	/* Hack, so that each addresses is controlled by exactly one
	 * of the high or low area bitmaps, the first high area starts
	 * at 4GB, not 0 */
	if (start == 0)
		start = 0x100000000UL;

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

/* Mark the low areas in @newareas as huge-page areas in mm->context
 * and flush stale SLB entries on every CPU running this mm.
 * Returns 0, or -EBUSY if any requested area already has VMAs. */
static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	unsigned long i;
	struct slb_flush_info fi;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8)
		     != NUM_LOW_AREAS);

	newareas &= ~(mm->context.low_htlb_areas);
	if (!newareas)
		return 0; /* The segments we want are already open */

	for (i = 0; i < NUM_LOW_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_low_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.low_htlb_areas |= newareas;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_low_segments, &fi, 0, 1);

	return 0;
}

/* High-area counterpart of open_low_hpage_areas(); additionally
 * refreshes this CPU's paca copy of the context before the IPI. */
static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	struct slb_flush_info fi;
	unsigned long i;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
		     != NUM_HIGH_AREAS);

	newareas &= ~(mm->context.high_htlb_areas);
	if (!newareas)
		return 0; /* The areas we want are already open */

	for (i = 0; i < NUM_HIGH_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_high_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.high_htlb_areas |= newareas;

	/* update the paca copy of the context struct */
	get_paca()->context = mm->context;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_high_segments, &fi, 0, 1);

	return 0;
}

/* Open whatever low/high huge-page areas the range [addr, addr+len)
 * touches.  Returns 0 on success or a negative errno. */
int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
	int err = 0;

	if ( (addr+len) < addr )	/* reject wrap-around */
		return -EINVAL;

	if (addr < 0x100000000UL)
		err = open_low_hpage_areas(current->mm,
					   LOW_ESID_MASK(addr, len));
	/* NOTE(review): if the range straddles 4GB, a failure from the
	 * low-area open above is overwritten by the high-area result
	 * here — verify against later upstream fixes. */
	if ((addr + len) > 0x100000000UL)
		err = open_high_hpage_areas(current->mm,
					    HTLB_AREA_MASK(addr, len));

	if (err) {
		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
		       addr, len,
		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
		return err;
	}

	return 0;
}

/* Resolve @address (which must lie in a huge-page area) to its struct
 * page, for follow_page()-style callers. */
struct page *follow_huge_addr(struct mm_struct *mm,
			      unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (!
in_hugepage_area(mm->context, address)) return ERR_PTR(-EINVAL); ptep = huge_pte_offset(mm, address); page = pte_page(*ptep); if (page) page += (address % HPAGE_SIZE) / PAGE_SIZE;
⌨️ Keyboard shortcuts
Copy code: Ctrl + C
Search code: Ctrl + F
Full-screen mode: F11
Increase font size: Ctrl + =
Decrease font size: Ctrl + -
Show shortcuts: ?