hugetlbpage.c

来自「LINUX 2.6.17.4的源码」· C语言 代码 · 共 1,067 行 · 第 1/2 页

C
1,067
字号
/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>

#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)

#ifdef CONFIG_PPC_64K_PAGES
#define HUGEPTE_INDEX_SIZE	(PMD_SHIFT-HPAGE_SHIFT)
#else
#define HUGEPTE_INDEX_SIZE	(PUD_SHIFT-HPAGE_SHIFT)
#endif
#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << HUGEPTE_INDEX_SIZE)

#define HUGEPD_SHIFT		(HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
#define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
#define HUGEPD_MASK		(~(HUGEPD_SIZE-1))

#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])

/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
#define HUGEPD_OK	0x1

typedef struct { unsigned long pd; } hugepd_t;

#define hugepd_none(hpd)	((hpd).pd == 0)

/* Return the hugepte table a huge PD entry points at; the entry must
 * carry the HUGEPD_OK marker bit, which is stripped off here. */
static inline pte_t *hugepd_page(hugepd_t hpd)
{
	BUG_ON(!(hpd.pd & HUGEPD_OK));
	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
}

/* Locate the hugepte slot for addr within the table *hpdp points at. */
static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
{
	unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
	pte_t *dir = hugepd_page(*hpdp);

	return dir + idx;
}

/* Allocate a fresh hugepte table and install it in *hpdp, unless
 * another thread raced us and installed one first (in which case the
 * new table is freed again).  Returns 0 or -ENOMEM. */
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address)
{
	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
				      GFP_KERNEL|__GFP_REPEAT);

	if (!new)
		return -ENOMEM;

	/* page_table_lock serializes against a concurrent installer */
	spin_lock(&mm->page_table_lock);
	if (!hugepd_none(*hpdp))
		kmem_cache_free(huge_pgtable_cache, new);
	else
		hpdp->pd = (unsigned long)new | HUGEPD_OK;
	spin_unlock(&mm->page_table_lock);
	return 0;
}

/* Modelled after find_linux_pte() */
/* Walk the page tables and return the hugepte for addr, or NULL if no
 * hugepte table is present.  addr must lie in a hugepage area. */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;

	BUG_ON(!in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	if (!pgd_none(*pg)) {
		pu = pud_offset(pg, addr);
		if (!pud_none(*pu)) {
#ifdef CONFIG_PPC_64K_PAGES
			/* with 64K base pages the huge PD lives at pmd level */
			pmd_t *pm;
			pm = pmd_offset(pu, addr);
			if (!pmd_none(*pm))
				return hugepte_offset((hugepd_t *)pm, addr);
#else
			return hugepte_offset((hugepd_t *)pu, addr);
#endif
		}
	}

	return NULL;
}

/* Like huge_pte_offset() but allocate any missing intermediate levels
 * and the hugepte table itself.  Returns NULL on allocation failure. */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pg;
	pud_t *pu;
	hugepd_t *hpdp = NULL;

	BUG_ON(!in_hugepage_area(mm->context, addr));

	addr &= HPAGE_MASK;

	pg = pgd_offset(mm, addr);
	pu = pud_alloc(mm, pg, addr);

	if (pu) {
#ifdef CONFIG_PPC_64K_PAGES
		pmd_t *pm;
		pm = pmd_alloc(mm, pu, addr);
		if (pm)
			hpdp = (hugepd_t *)pm;
#else
		hpdp = (hugepd_t *)pu;
#endif
	}

	if (!hpdp)
		return NULL;

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
		return NULL;

	return hugepte_offset(hpdp, addr);
}

/* Detach the hugepte table from *hpdp and queue it for freeing via the
 * mmu_gather, so it is not reused until after the TLB flush. */
static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
{
	pte_t *hugepte = hugepd_page(*hpdp);

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
						 HUGEPTE_TABLE_SIZE-1));
}

#ifdef CONFIG_PPC_64K_PAGES
/* Free the hugepte tables under one pud for [addr, end), then free the
 * pmd page itself if no part of it is still needed by neighbouring
 * mappings (the floor/ceiling tests, as in free_pmd_range()). */
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
		free_hugepte_range(tlb, (hugepd_t *)pmd);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd);
}
#endif

/* Free everything under one pgd for [addr, end), recursing to the pmd
 * level on 64K-page configs, then free the pud page itself if the
 * floor/ceiling tests show no neighbouring mapping still needs it. */
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
#ifdef CONFIG_PPC_64K_PAGES
		if (pud_none_or_clear_bad(pud))
			continue;
		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
#else
		if (pud_none(*pud))
			continue;
		free_hugepte_range(tlb, (hugepd_t *)pud);
#endif
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud);
}

/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather **tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;
	unsigned long start;

	/*
	 * Comments below take from the normal free_pgd_range().  They
	 * apply here too.  The tests against HUGEPD_MASK below are
	 * essential, because we *don't* test for this at the bottom
	 * level.  Without them we'll attempt to free a hugepte table
	 * when we unmap just part of it, even if there are other
	 * active mappings using it.
	 *
	 * The next few lines have given us lots of grief...
	 *
	 * Why are we testing HUGEPD* at this top level?  Because
	 * often there will be no work to do at all, and we'd prefer
	 * not to go all the way down to the bottom just to discover
	 * that.
	 *
	 * Why all these "- 1"s?  Because 0 represents both the bottom
	 * of the address space and the top of it (using -1 for the
	 * top wouldn't help much: the masks would do the wrong thing).
	 * The rule is that addr 0 and floor 0 refer to the bottom of
	 * the address space, but end 0 and ceiling 0 refer to the top
	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
	 * that end 0 case should be mythical).
	 *
	 * Wherever addr is brought up or ceiling brought down, we
	 * must be careful to reject "the opposite 0" before it
	 * confuses the subsequent tests.  But what about where end is
	 * brought down by HUGEPD_SIZE below? no, end can't go down to
	 * 0 there.
	 *
	 * Whereas we round start (addr) and ceiling down, by different
	 * masks at different levels, in order to test whether a table
	 * now has no other vmas using it, so can be freed, we don't
	 * bother to round floor or end up - the tests don't need that.
	 */

	addr &= HUGEPD_MASK;
	if (addr < floor) {
		addr += HUGEPD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= HUGEPD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= HUGEPD_SIZE;
	if (addr > end - 1)
		return;

	start = addr;
	pgd = pgd_offset((*tlb)->mm, addr);
	do {
		BUG_ON(!in_hugepage_area((*tlb)->mm->context, addr));
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(pgd))
			continue;
		hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
	} while (pgd++, addr = next, addr != end);
}

/* Install a huge pte, clearing any previous translation (including its
 * hash-table entry) first. */
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_update (huge / !huge)
		 */
		unsigned long old = pte_update(ptep, ~0UL);
		if (old & _PAGE_HASHPTE)
			hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
		flush_tlb_pending();
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}

/* Atomically clear a huge pte (invalidating its hash-table entry if
 * present) and return the old value. */
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(ptep, ~0UL);

	if (old & _PAGE_HASHPTE)
		hpte_update(mm, addr & HPAGE_MASK, ptep, old, 1);
	*ptep = __pte(0);

	return __pte(old);
}

/* Per-call argument block for the cross-CPU SLB flush helpers below. */
struct slb_flush_info {
	struct mm_struct *mm;
	u16 newareas;
};

/* IPI handler: invalidate the SLB entries for each newly-opened low
 * (below 4GB) area, on CPUs currently running the affected mm. */
static void flush_low_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_LOW_AREAS);

	if (current->active_mm != fi->mm)
		return;

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_LOW_AREAS; i++) {
		if (!(fi->newareas & (1U << i)))
			continue;
		asm volatile("slbie %0"
			     : : "r" ((i << SID_SHIFT) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

/* IPI handler: as flush_low_segments(), but for the high (above 4GB)
 * areas; each area spans several segments, hence the inner loop. */
static void flush_high_segments(void *parm)
{
	struct slb_flush_info *fi = parm;
	unsigned long i, j;

	BUILD_BUG_ON((sizeof(fi->newareas)*8) != NUM_HIGH_AREAS);

	if (current->active_mm != fi->mm)
		return;

	/* Only need to do anything if this CPU is working in the same
	 * mm as the one which has changed */

	/* update the paca copy of the context struct */
	get_paca()->context = current->active_mm->context;

	asm volatile("isync" : : : "memory");
	for (i = 0; i < NUM_HIGH_AREAS; i++) {
		if (!(fi->newareas & (1U << i)))
			continue;
		for (j = 0; j < (1UL << (HTLB_AREA_SHIFT-SID_SHIFT)); j++)
			asm volatile("slbie %0"
				     :: "r" (((i << HTLB_AREA_SHIFT)
					      + (j << SID_SHIFT)) | SLBIE_C));
	}
	asm volatile("isync" : : : "memory");
}

/* Check that a low area contains no existing VMAs before it is
 * switched over to hugepage use.  Returns 0 or -EBUSY. */
static int prepare_low_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << SID_SHIFT;
	unsigned long end = (area+1) << SID_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_LOW_AREAS);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

/* As prepare_low_area_for_htlb(), for a high area. */
static int prepare_high_area_for_htlb(struct mm_struct *mm, unsigned long area)
{
	unsigned long start = area << HTLB_AREA_SHIFT;
	unsigned long end = (area+1) << HTLB_AREA_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(area >= NUM_HIGH_AREAS);

	/* Hack, so that each addresses is controlled by exactly one
	 * of the high or low area bitmaps, the first high area starts
	 * at 4GB, not 0 */
	if (start == 0)
		start = 0x100000000UL;

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}

/* Mark the requested low areas as hugepage areas in the mm context and
 * flush stale SLB entries on all CPUs running the mm. */
static int open_low_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	unsigned long i;
	struct slb_flush_info fi;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_LOW_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.low_htlb_areas)*8) != NUM_LOW_AREAS);

	newareas &= ~(mm->context.low_htlb_areas);
	if (!newareas)
		return 0; /* The segments we want are already open */

	for (i = 0; i < NUM_LOW_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_low_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.low_htlb_areas |= newareas;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_low_segments, &fi, 0, 1);

	return 0;
}

/* As open_low_hpage_areas(), for the high areas. */
static int open_high_hpage_areas(struct mm_struct *mm, u16 newareas)
{
	struct slb_flush_info fi;
	unsigned long i;

	BUILD_BUG_ON((sizeof(newareas)*8) != NUM_HIGH_AREAS);
	BUILD_BUG_ON((sizeof(mm->context.high_htlb_areas)*8)
		     != NUM_HIGH_AREAS);

	newareas &= ~(mm->context.high_htlb_areas);
	if (!newareas)
		return 0; /* The areas we want are already open */

	for (i = 0; i < NUM_HIGH_AREAS; i++)
		if ((1 << i) & newareas)
			if (prepare_high_area_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.high_htlb_areas |= newareas;

	/* update the paca copy of the context struct */
	get_paca()->context = mm->context;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	fi.mm = mm;
	fi.newareas = newareas;
	on_each_cpu(flush_high_segments, &fi, 0, 1);

	return 0;
}

/* Open whichever low/high hugepage areas the range [addr, addr+len)
 * touches.  Returns 0 on success or a negative errno. */
int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
	int err = 0;

	if ( (addr+len) < addr )
		return -EINVAL;

	if (addr < 0x100000000UL)
		err = open_low_hpage_areas(current->mm,
					  LOW_ESID_MASK(addr, len));
	/* Don't let a successful high-area open clobber a low-area
	 * failure: only try the high areas if the low ones succeeded. */
	if ((err == 0) && ((addr + len) > 0x100000000UL))
		err = open_high_hpage_areas(current->mm,
					    HTLB_AREA_MASK(addr, len));
	if (err) {
		printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
		       " failed (lowmask: 0x%04hx, highmask: 0x%04hx)\n",
		       addr, len,
		       LOW_ESID_MASK(addr, len), HTLB_AREA_MASK(addr, len));
		return err;
	}

	return 0;
}

struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (! in_hugepage_area(mm->context, address))
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page)
		page += (address % HPAGE_SIZE) / PAGE_SIZE;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?