/*
 * page_tables.c — lguest shadow page-table management
 * (From the Linux kernel source, drivers/lguest; this is page 1 of 2
 * of a 734-line file, extracted from a web code viewer.)
 */
/* We walk down the guest page tables to get a guest-physical address */unsigned long guest_pa(struct lguest *lg, unsigned long vaddr){ pgd_t gpgd; pte_t gpte; /* First step: get the top-level Guest page table entry. */ gpgd = lgread(lg, gpgd_addr(lg, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) kill_guest(lg, "Bad address %#lx", vaddr); gpte = lgread(lg, gpte_addr(lg, gpgd, vaddr), pte_t); if (!(pte_flags(gpte) & _PAGE_PRESENT)) kill_guest(lg, "Bad address %#lx", vaddr); return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);}/* We keep several page tables. This is a simple routine to find the page * table (if any) corresponding to this top-level address the Guest has given * us. */static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable){ unsigned int i; for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) if (lg->pgdirs[i].gpgdir == pgtable) break; return i;}/*H:435 And this is us, creating the new page directory. If we really do * allocate a new one (and so the kernel parts are not there), we set * blank_pgdir. */static unsigned int new_pgdir(struct lguest *lg, unsigned long gpgdir, int *blank_pgdir){ unsigned int next; /* We pick one entry at random to throw out. Choosing the Least * Recently Used might be better, but this is easy. */ next = random32() % ARRAY_SIZE(lg->pgdirs); /* If it's never been allocated at all before, try now. */ if (!lg->pgdirs[next].pgdir) { lg->pgdirs[next].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); /* If the allocation fails, just keep using the one we have */ if (!lg->pgdirs[next].pgdir) next = lg->pgdidx; else /* This is a blank page, so there are no kernel * mappings: caller must map the stack! */ *blank_pgdir = 1; } /* Record which Guest toplevel this shadows. */ lg->pgdirs[next].gpgdir = gpgdir; /* Release all the non-kernel mappings. 
*/ flush_user_mappings(lg, next); return next;}/*H:430 (iv) Switching page tables * * Now we've seen all the page table setting and manipulation, let's see what * what happens when the Guest changes page tables (ie. changes the top-level * pgdir). This occurs on almost every context switch. */void guest_new_pagetable(struct lguest *lg, unsigned long pgtable){ int newpgdir, repin = 0; /* Look to see if we have this one already. */ newpgdir = find_pgdir(lg, pgtable); /* If not, we allocate or mug an existing one: if it's a fresh one, * repin gets set to 1. */ if (newpgdir == ARRAY_SIZE(lg->pgdirs)) newpgdir = new_pgdir(lg, pgtable, &repin); /* Change the current pgd index to the new one. */ lg->pgdidx = newpgdir; /* If it was completely blank, we map in the Guest kernel stack */ if (repin) pin_stack_pages(lg);}/*H:470 Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used * when we destroy the Guest. */static void release_all_pagetables(struct lguest *lg){ unsigned int i, j; /* Every shadow pagetable this Guest has */ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) if (lg->pgdirs[i].pgdir) /* Every PGD entry except the Switcher at the top */ for (j = 0; j < SWITCHER_PGD_INDEX; j++) release_pgd(lg, lg->pgdirs[i].pgdir + j);}/* We also throw away everything when a Guest tells us it's changed a kernel * mapping. Since kernel mappings are in every page table, it's easiest to * throw them all away. This traps the Guest in amber for a while as * everything faults back in, but it's rare. */void guest_pagetable_clear_all(struct lguest *lg){ release_all_pagetables(lg); /* We need the Guest kernel stack mapped again. */ pin_stack_pages(lg);}/*:*//*M:009 Since we throw away all mappings when a kernel mapping changes, our * performance sucks for guests using highmem. 
In fact, a guest with * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is * usually slower than a Guest with less memory. * * This, of course, cannot be fixed. It would take some kind of... well, I * don't know, but the term "puissant code-fu" comes to mind. :*//*H:420 This is the routine which actually sets the page table entry for then * "idx"'th shadow page table. * * Normally, we can just throw out the old entry and replace it with 0: if they * use it demand_page() will put the new entry in. We need to do this anyway: * The Guest expects _PAGE_ACCESSED to be set on its PTE the first time a page * is read from, and _PAGE_DIRTY when it's written to. * * But Avi Kivity pointed out that most Operating Systems (Linux included) set * these bits on PTEs immediately anyway. This is done to save the CPU from * having to update them, but it helps us the same way: if they set * _PAGE_ACCESSED then we can put a read-only PTE entry in immediately, and if * they set _PAGE_DIRTY then we can put a writable PTE entry in immediately. */static void do_set_pte(struct lguest *lg, int idx, unsigned long vaddr, pte_t gpte){ /* Look up the matching shadow page directory entry. */ pgd_t *spgd = spgd_addr(lg, idx, vaddr); /* If the top level isn't present, there's no entry to update. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { /* Otherwise, we start by releasing the existing entry. */ pte_t *spte = spte_addr(lg, *spgd, vaddr); release_pte(*spte); /* If they're setting this entry as dirty or accessed, we might * as well put that entry they've given us in now. This shaves * 10% off a copy-on-write micro-benchmark. */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { check_gpte(lg, gpte); *spte = gpte_to_spte(lg, gpte, pte_flags(gpte) & _PAGE_DIRTY); } else /* Otherwise kill it and we can demand_page() it in * later. */ *spte = __pte(0); }}/*H:410 Updating a PTE entry is a little trickier. 
* * We keep track of several different page tables (the Guest uses one for each * process, so it makes sense to cache at least a few). Each of these have * identical kernel parts: ie. every mapping above PAGE_OFFSET is the same for * all processes. So when the page table above that address changes, we update * all the page tables, not just the current one. This is rare. * * The benefit is that when we have to track a new page table, we can copy keep * all the kernel mappings. This speeds up context switch immensely. */void guest_set_pte(struct lguest *lg, unsigned long gpgdir, unsigned long vaddr, pte_t gpte){ /* Kernel mappings must be changed on all top levels. Slow, but * doesn't happen often. */ if (vaddr >= lg->kernel_address) { unsigned int i; for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) if (lg->pgdirs[i].pgdir) do_set_pte(lg, i, vaddr, gpte); } else { /* Is this page table one we have a shadow for? */ int pgdir = find_pgdir(lg, gpgdir); if (pgdir != ARRAY_SIZE(lg->pgdirs)) /* If so, do the update. */ do_set_pte(lg, pgdir, vaddr, gpte); }}/*H:400 * (iii) Setting up a page table entry when the Guest tells us one has changed. * * Just like we did in interrupts_and_traps.c, it makes sense for us to deal * with the other side of page tables while we're here: what happens when the * Guest asks for a page table to be updated? * * We already saw that demand_page() will fill in the shadow page tables when * needed, so we can simply remove shadow page table entries whenever the Guest * tells us they've changed. When the Guest tries to use the new entry it will * fault and demand_page() will fix it up. * * So with that in mind here's our code to to update a (top-level) PGD entry: */void guest_set_pmd(struct lguest *lg, unsigned long gpgdir, u32 idx){ int pgdir; /* The kernel seems to try to initialize this early on: we ignore its * attempts to map over the Switcher. 
*/ if (idx >= SWITCHER_PGD_INDEX) return; /* If they're talking about a page table we have a shadow for... */ pgdir = find_pgdir(lg, gpgdir); if (pgdir < ARRAY_SIZE(lg->pgdirs)) /* ... throw it away. */ release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);}/*H:500 (vii) Setting up the page tables initially. * * When a Guest is first created, the Launcher tells us where the toplevel of * its first page table is. We set some things up here: */int init_guest_pagetable(struct lguest *lg, unsigned long pgtable){ /* We start on the first shadow page table, and give it a blank PGD * page. */ lg->pgdidx = 0; lg->pgdirs[lg->pgdidx].gpgdir = pgtable; lg->pgdirs[lg->pgdidx].pgdir = (pgd_t*)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[lg->pgdidx].pgdir) return -ENOMEM; return 0;}/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */void page_table_guest_data_init(struct lguest *lg){ /* We get the kernel address: above this is all kernel memory. */ if (get_user(lg->kernel_address, &lg->lguest_data->kernel_address) /* We tell the Guest that it can't use the top 4MB of virtual * addresses used by the Switcher. */ || put_user(4U*1024*1024, &lg->lguest_data->reserve_mem) || put_user(lg->pgdirs[lg->pgdidx].gpgdir,&lg->lguest_data->pgdir)) kill_guest(lg, "bad guest page %p", lg->lguest_data); /* In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the * Switcher mappings, so check that now. */ if (pgd_index(lg->kernel_address) >= SWITCHER_PGD_INDEX) kill_guest(lg, "bad kernel address %#lx", lg->kernel_address);}/* When a Guest dies, our cleanup is fairly simple. */void free_guest_pagetable(struct lguest *lg){ unsigned int i; /* Throw away all page table pages. */ release_all_pagetables(lg); /* Now free the top levels: free_page() can handle 0 just fine. */ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) free_page((long)lg->pgdirs[i].pgdir);}/*H:480 (vi) Mapping the Switcher when the Guest is about to run. 
* * The Switcher and the two pages for this CPU need to be visible in the * Guest (and not the pages for other CPUs). We have the appropriate PTE pages * for each CPU already set up, we just need to hook them in now we know which * Guest is about to run on this CPU. */void map_switcher_in_guest(struct lguest *lg, struct lguest_pages *pages){ pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); pgd_t switcher_pgd; pte_t regs_pte; /* Make the last PGD entry for this Guest point to the Switcher's PTE * page for this CPU (with appropriate flags). */ switcher_pgd = __pgd(__pa(switcher_pte_page) | _PAGE_KERNEL); lg->pgdirs[lg->pgdidx].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; /* We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher * saves the Guest registers, it saves them into the first page of this * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out * again. */ regs_pte = pfn_pte (__pa(lg->regs_page) >> PAGE_SHIFT, __pgprot(_PAGE_KERNEL)); switcher_pte_page[(unsigned long)pages/PAGE_SIZE%PTRS_PER_PTE] = regs_pte;}/*:*/static void free_switcher_pte_pages(void){ unsigned int i; for_each_possible_cpu(i) free_page((long)switcher_pte_page(i));}/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given * the CPU number and the "struct page"s for the Switcher code itself. * * Currently the Switcher is less than a page long, so "pages" is always 1. */static __init void populate_switcher_pte_page(unsigned int cpu, struct page *switcher_page[], unsigned int pages){ unsigned int i; pte_t *pte = switcher_pte_page(cpu); /* The first entries are easy: they map the Switcher code. 
*/ for (i = 0; i < pages; i++) { pte[i] = mk_pte(switcher_page[i], __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)); } /* The only other thing we map is this CPU's pair of pages. */ i = pages + cpu*2; /* First page (Guest registers) is writable from the Guest */ pte[i] = pfn_pte(page_to_pfn(switcher_page[i]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)); /* The second page contains the "struct lguest_ro_state", and is * read-only. */ pte[i+1] = pfn_pte(page_to_pfn(switcher_page[i+1]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED));}/* We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * * If nothing else, note that all this complexity in juggling shadow page * tables in sync with the Guest's page tables is for one reason: for most * Guests this page table dance determines how bad performance will be. This * is why Xen uses exotic direct Guest pagetable manipulation, and why both * Intel and AMD have implemented shadow page table support directly into * hardware. * * There is just one file remaining in the Host. *//*H:510 At boot or module load time, init_pagetables() allocates and populates * the Switcher PTE page for each CPU. */__init int init_pagetables(struct page **switcher_page, unsigned int pages){ unsigned int i; for_each_possible_cpu(i) { switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!switcher_pte_page(i)) { free_switcher_pte_pages(); return -ENOMEM; } populate_switcher_pte_page(i, switcher_page, pages); } return 0;}/*:*//* Cleaning up simply involves freeing the PTE page for each CPU. */void free_pagetables(void){ free_switcher_pte_pages();}
/* (End of extracted excerpt: page 1 of 2 of the original 734-line file.) */