page_tables.c

来自「linux 内核源代码」· C语言 代码 · 共 734 行 · 第 1/2 页

C
734
字号
/* We walk down the guest page tables to get a guest-physical address */
unsigned long guest_pa(struct lguest *lg, unsigned long vaddr)
{
	pgd_t gpgd;
	pte_t gpte;

	/* First step: get the top-level Guest page table entry. */
	gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t);
	/* Toplevel not present?  We can't map it in. */
	if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
		kill_guest(lg, "Bad address %#lx", vaddr);

	gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t);
	if (!(pte_flags(gpte) & _PAGE_PRESENT))
		kill_guest(lg, "Bad address %#lx", vaddr);

	/* Physical address: page frame times page size, plus the offset
	 * within the page. */
	return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
}

/* We keep several page tables.  This is a simple routine to find the page
 * table (if any) corresponding to this top-level address the Guest has given
 * us.  Returns ARRAY_SIZE(lg->pgdirs) if there is no match. */
static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].gpgdir == pgtable)
			break;
	return i;
}

/*H:435 And this is us, creating the new page directory.  If we really do
 * allocate a new one (and so the kernel parts are not there), we set
 * blank_pgdir. */
static unsigned int new_pgdir(struct lguest *lg,
			      unsigned long gpgdir,
			      int *blank_pgdir)
{
	unsigned int next;

	/* We pick one entry at random to throw out.  Choosing the Least
	 * Recently Used might be better, but this is easy. */
	next = random32() % ARRAY_SIZE(lg->pgdirs);
	/* If it's never been allocated at all before, try now. */
	if (!lg->pgdirs[next].pgdir) {
		lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
		/* If the allocation fails, just keep using the one we have */
		if (!lg->pgdirs[next].pgdir)
			next = lg->pgdidx;
		else
			/* This is a blank page, so there are no kernel
			 * mappings: caller must map the stack! */
			*blank_pgdir = 1;
	}
	/* Record which Guest toplevel this shadows. */
	lg->pgdirs[next].gpgdir = gpgdir;
	/* Release all the non-kernel mappings. */
	flush_user_mappings(lg, next);

	return next;
}

/*H:430 (iv) Switching page tables
 *
 * Now we've seen all the page table setting and manipulation, let's see
 * what happens when the Guest changes page tables (ie. changes the top-level
 * pgdir).  This occurs on almost every context switch. */
void guest_new_pagetable(struct lguest *lg, unsigned long pgtable)
{
	int newpgdir, repin = 0;

	/* Look to see if we have this one already. */
	newpgdir = find_pgdir(lg, pgtable);
	/* If not, we allocate or mug an existing one: if it's a fresh one,
	 * repin gets set to 1. */
	if (newpgdir == ARRAY_SIZE(lg->pgdirs))
		newpgdir = new_pgdir(lg, pgtable, &repin);
	/* Change the current pgd index to the new one. */
	lg->pgdidx = newpgdir;
	/* If it was completely blank, we map in the Guest kernel stack */
	if (repin)
		pin_stack_pages(lg);
}

/*H:470 Finally, a routine which throws away everything: all PGD entries in all
 * the shadow page tables, including the Guest's kernel mappings.  This is used
 * when we destroy the Guest. */
static void release_all_pagetables(struct lguest *lg)
{
	unsigned int i, j;

	/* Every shadow pagetable this Guest has */
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		if (lg->pgdirs[i].pgdir)
			/* Every PGD entry except the Switcher at the top */
			for (j = 0; j < SWITCHER_PGD_INDEX; j++)
				release_pgd(lg, lg->pgdirs[i].pgdir + j);
}

/* We also throw away everything when a Guest tells us it's changed a kernel
 * mapping.  Since kernel mappings are in every page table, it's easiest to
 * throw them all away.  This traps the Guest in amber for a while as
 * everything faults back in, but it's rare. */
void guest_pagetable_clear_all(struct lguest *lg)
{
	release_all_pagetables(lg);
	/* We need the Guest kernel stack mapped again. */
	pin_stack_pages(lg);
}
/*:*/

/*M:009 Since we throw away all mappings when a kernel mapping changes, our
 * performance sucks for guests using highmem.  In fact, a guest with
 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
 * usually slower than a Guest with less memory.
 *
 * This, of course, cannot be fixed.  It would take some kind of... well, I
 * don't know, but the term "puissant code-fu" comes to mind. :*/

/*H:420 This is the routine which actually sets the page table entry for the
 * "idx"'th shadow page table.
 *
 * Normally, we can just throw out the old entry and replace it with 0: if they
 * use it demand_page() will put the new entry in.  We need to do this anyway:
 * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page
 * is read from, and _PAGE_DIRTY when it's written to.
 *
 * But Avi Kivity pointed out that most Operating Systems (Linux included) set
 * these bits on PTEs immediately anyway.  This is done to save the CPU from
 * having to update them, but it helps us the same way: if they set
 * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if
 * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. */
static void do_set_pte(struct lguest *lg, int idx,
		       unsigned long vaddr, pte_t gpte)
{
	/* Look up the matching shadow page directory entry. */
	pgd_t *spgd = spgd_addr(lg, idx, vaddr);

	/* If the top level isn't present, there's no entry to update. */
	if (pgd_flags(*spgd) & _PAGE_PRESENT) {
		/* Otherwise, we start by releasing the existing entry. */
		pte_t *spte = spte_addr(lg, *spgd, vaddr);
		release_pte(*spte);

		/* If they're setting this entry as dirty or accessed, we might
		 * as well put that entry they've given us in now.  This shaves
		 * 10% off a copy-on-write micro-benchmark. */
		if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
			check_gpte(lg, gpte);
			*spte = gpte_to_spte(lg, gpte,
					     pte_flags(gpte) & _PAGE_DIRTY);
		} else
			/* Otherwise kill it and we can demand_page() it in
			 * later. */
			*spte = __pte(0);
	}
}

/*H:410 Updating a PTE entry is a little trickier.
 *
 * We keep track of several different page tables (the Guest uses one for each
 * process, so it makes sense to cache at least a few).  Each of these have
 * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for
 * all processes.  So when the page table above that address changes, we update
 * all the page tables, not just the current one.  This is rare.
 *
 * The benefit is that when we have to track a new page table, we can keep all
 * the kernel mappings.  This speeds up context switch immensely. */
void guest_set_pte(struct lguest *lg,
		   unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
	/* Kernel mappings must be changed on all top levels.  Slow, but
	 * doesn't happen often. */
	if (vaddr >= lg->kernel_address) {
		unsigned int i;
		for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
			if (lg->pgdirs[i].pgdir)
				do_set_pte(lg, i, vaddr, gpte);
	} else {
		/* Is this page table one we have a shadow for? */
		int pgdir = find_pgdir(lg, gpgdir);
		if (pgdir != ARRAY_SIZE(lg->pgdirs))
			/* If so, do the update. */
			do_set_pte(lg, pgdir, vaddr, gpte);
	}
}

/*H:400
 * (iii) Setting up a page table entry when the Guest tells us one has changed.
 *
 * Just like we did in interrupts_and_traps.c, it makes sense for us to deal
 * with the other side of page tables while we're here: what happens when the
 * Guest asks for a page table to be updated?
 *
 * We already saw that demand_page() will fill in the shadow page tables when
 * needed, so we can simply remove shadow page table entries whenever the Guest
 * tells us they've changed.  When the Guest tries to use the new entry it will
 * fault and demand_page() will fix it up.
 *
 * So with that in mind here's our code to update a (top-level) PGD entry: */
void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
	int pgdir;

	/* The kernel seems to try to initialize this early on: we ignore its
	 * attempts to map over the Switcher. */
	if (idx >= SWITCHER_PGD_INDEX)
		return;

	/* If they're talking about a page table we have a shadow for... */
	pgdir = find_pgdir(lg, gpgdir);
	if (pgdir < ARRAY_SIZE(lg->pgdirs))
		/* ... throw it away. */
		release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
}

/*H:500 (vii) Setting up the page tables initially.
 *
 * When a Guest is first created, the Launcher tells us where the toplevel of
 * its first page table is.  We set some things up here: */
int init_guest_pagetable(struct lguest *lg, unsigned long pgtable)
{
	/* We start on the first shadow page table, and give it a blank PGD
	 * page. */
	lg->pgdidx = 0;
	lg->pgdirs[lg->pgdidx].gpgdir = pgtable;
	lg->pgdirs[lg->pgdidx].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
	if (!lg->pgdirs[lg->pgdidx].pgdir)
		return -ENOMEM;
	return 0;
}

/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lguest *lg)
{
	/* We get the kernel address: above this is all kernel memory. */
	if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address)
	    /* We tell the Guest that it can't use the top 4MB of virtual
	     * addresses used by the Switcher. */
	    || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem)
	    || put_user(lg->pgdirs[lg->pgdidx].gpgdir, &lg->lguest_data->pgdir))
		kill_guest(lg, "bad guest page %p", lg->lguest_data);

	/* In flush_user_mappings() we loop from 0 to
	 * "pgd_index(lg->kernel_address)".  This assumes it won't hit the
	 * Switcher mappings, so check that now. */
	if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX)
		kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);
}

/* When a Guest dies, our cleanup is fairly simple. */
void free_guest_pagetable(struct lguest *lg)
{
	unsigned int i;

	/* Throw away all page table pages. */
	release_all_pagetables(lg);
	/* Now free the top levels: free_page() can handle 0 just fine. */
	for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
		free_page((long)lg->pgdirs[i].pgdir);
}

/*H:480 (vi) Mapping the Switcher when the Guest is about to run.
 *
 * The Switcher and the two pages for this CPU need to be visible in the
 * Guest (and not the pages for other CPUs).  We have the appropriate PTE pages
 * for each CPU already set up, we just need to hook them in now we know which
 * Guest is about to run on this CPU. */
void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages)
{
	pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
	pgd_t switcher_pgd;
	pte_t regs_pte;

	/* Make the last PGD entry for this Guest point to the Switcher's PTE
	 * page for this CPU (with appropriate flags). */
	switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL);
	lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;

	/* We also change the Switcher PTE page.  When we're running the Guest,
	 * we want the Guest's "regs" page to appear where the first Switcher
	 * page for this CPU is.  This is an optimization: when the Switcher
	 * saves the Guest registers, it saves them into the first page of this
	 * CPU's "struct lguest_pages": if we make sure the Guest's register
	 * page is already mapped there, we don't have to copy them out
	 * again. */
	regs_pte = pfn_pte(__pa(lg->regs_page) >> PAGE_SHIFT,
			   __pgprot(_PAGE_KERNEL));
	switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;
}
/*:*/

/* Free each CPU's Switcher PTE page: free_page() can handle 0 just fine. */
static void free_switcher_pte_pages(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		free_page((long)switcher_pte_page(i));
}

/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given
 * the CPU number and the "struct page"s for the Switcher code itself.
 *
 * Currently the Switcher is less than a page long, so "pages" is always 1. */
static __init void populate_switcher_pte_page(unsigned int cpu,
					      struct page *switcher_page[],
					      unsigned int pages)
{
	unsigned int i;
	pte_t *pte = switcher_pte_page(cpu);

	/* The first entries are easy: they map the Switcher code. */
	for (i = 0; i < pages; i++) {
		pte[i] = mk_pte(switcher_page[i],
				__pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
	}

	/* The only other thing we map is this CPU's pair of pages. */
	i = pages + cpu*2;

	/* First page (Guest registers) is writable from the Guest */
	pte[i] = pfn_pte(page_to_pfn(switcher_page[i]),
			 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW));
	/* The second page contains the "struct lguest_ro_state", and is
	 * read-only. */
	pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]),
			   __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));
}

/* We've made it through the page table code.  Perhaps our tired brains are
 * still processing the details, or perhaps we're simply glad it's over.
 *
 * If nothing else, note that all this complexity in juggling shadow page
 * tables in sync with the Guest's page tables is for one reason: for most
 * Guests this page table dance determines how bad performance will be.  This
 * is why Xen uses exotic direct Guest pagetable manipulation, and why both
 * Intel and AMD have implemented shadow page table support directly into
 * hardware.
 *
 * There is just one file remaining in the Host. */

/*H:510 At boot or module load time, init_pagetables() allocates and populates
 * the Switcher PTE page for each CPU. */
__init int init_pagetables(struct page **switcher_page, unsigned int pages)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
		if (!switcher_pte_page(i)) {
			free_switcher_pte_pages();
			return -ENOMEM;
		}
		populate_switcher_pte_page(i, switcher_page, pages);
	}
	return 0;
}
/*:*/

/* Cleaning up simply involves freeing the PTE page for each CPU. */
void free_pagetables(void)
{
	free_switcher_pte_pages();
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?