📄 boot.c
字号:
/*P:010
 * A hypervisor allows multiple Operating Systems to run on a single machine.
 * To quote David Wheeler: "Any problem in computer science can be solved with
 * another layer of indirection."
 *
 * We keep things simple in two ways.  First, we start with a normal Linux
 * kernel and insert a module (lg.ko) which allows us to run other Linux
 * kernels the same way we'd run processes.  We call the first kernel the Host,
 * and the others the Guests.  The program which sets up and configures Guests
 * (such as the example in Documentation/lguest/lguest.c) is called the
 * Launcher.
 *
 * Secondly, we only run specially modified Guests, not normal kernels.  When
 * you set CONFIG_LGUEST to 'y' or 'm', this automatically sets
 * CONFIG_LGUEST_GUEST=y, which compiles this file into the kernel so it knows
 * how to be a Guest.  This means that you can use the same kernel you boot
 * normally (ie. as a Host) as a Guest.
 *
 * These Guests know that they cannot do privileged operations, such as disable
 * interrupts, and that they have to ask the Host to do such things explicitly.
 * This file consists of all the replacements for such low-level native
 * hardware operations: these special Guest versions call the Host.
 *
 * So how does the kernel know it's a Guest?  The Guest starts at a special
 * entry point marked with a magic string, which sets up a few things then
 * calls here.  We replace the native functions in various "paravirt"
 * structures with our Guest versions, then boot like normal. :*/
/*
 * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
* * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or * NON INFRINGEMENT. See the GNU General Public License for more * details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */#include <linux/kernel.h>#include <linux/start_kernel.h>#include <linux/string.h>#include <linux/console.h>#include <linux/screen_info.h>#include <linux/irq.h>#include <linux/interrupt.h>#include <linux/clocksource.h>#include <linux/clockchips.h>#include <linux/lguest.h>#include <linux/lguest_launcher.h>#include <linux/virtio_console.h>#include <linux/pm.h>#include <asm/paravirt.h>#include <asm/param.h>#include <asm/page.h>#include <asm/pgtable.h>#include <asm/desc.h>#include <asm/setup.h>#include <asm/e820.h>#include <asm/mce.h>#include <asm/io.h>#include <asm/i387.h>/*G:010 Welcome to the Guest! * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the * same kernel as the Host (or at least, built from the same source code). :*//* Declarations for definitions in lguest_guest.S */extern char lguest_noirq_start[], lguest_noirq_end[];extern const char lgstart_cli[], lgend_cli[];extern const char lgstart_sti[], lgend_sti[];extern const char lgstart_popf[], lgend_popf[];extern const char lgstart_pushf[], lgend_pushf[];extern const char lgstart_iret[], lgend_iret[];extern void lguest_iret(void);struct lguest_data lguest_data = { .hcall_status = { [0 ... 
LHCALL_RING_SIZE-1] = 0xFF }, .noirq_start = (u32)lguest_noirq_start, .noirq_end = (u32)lguest_noirq_end, .kernel_address = PAGE_OFFSET, .blocked_interrupts = { 1 }, /* Block timer interrupts */ .syscall_vec = SYSCALL_VECTOR,};static cycle_t clock_base;/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, * and 255 once the Host has finished with it. * * If we come around to a slot which hasn't been finished, then the table is * full and we just make the hypercall directly. This has the nice side * effect of causing the Host to run all the stored calls in the ring buffer * which empties it for next time! */static void async_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3){ /* Note: This code assumes we're uniprocessor. */ static unsigned int next_call; unsigned long flags; /* Disable interrupts if not already disabled: we don't want an * interrupt handler making a hypercall while we're already doing * one! */ local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ hcall(call, arg1, arg2, arg3); } else { lguest_data.hcalls[next_call].arg0 = call; lguest_data.hcalls[next_call].arg1 = arg1; lguest_data.hcalls[next_call].arg2 = arg2; lguest_data.hcalls[next_call].arg3 = arg3; /* Arguments must all be written before we mark it to go */ wmb(); lguest_data.hcall_status[next_call] = 0; if (++next_call == LHCALL_RING_SIZE) next_call = 0; } local_irq_restore(flags);}/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first * real optimization trick! 
* * When lazy_mode is set, it means we're allowed to defer all hypercalls and do * them as a batch when lazy_mode is eventually turned off. Because hypercalls * are reasonably expensive, batching them up makes sense. For example, a * large munmap might update dozens of page table entries: that code calls * paravirt_enter_lazy_mmu(), does the dozen updates, then calls * lguest_leave_lazy_mode(). * * So, when we're in lazy mode, we call async_hcall() to store the call for * future processing. */static void lazy_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3){ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) hcall(call, arg1, arg2, arg3); else async_hcall(call, arg1, arg2, arg3);}/* When lazy mode is turned off reset the per-cpu lazy mode variable and then * issue a hypercall to flush any stored calls. */static void lguest_leave_lazy_mode(void){ paravirt_leave_lazy(paravirt_get_lazy_mode()); hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);}/*G:033 * After that diversion we return to our first native-instruction * replacements: four functions for interrupt control. * * The simplest way of implementing these would be to have "turn interrupts * off" and "turn interrupts on" hypercalls. Unfortunately, this is too slow: * these are by far the most commonly called functions of those we override. * * So instead we keep an "irq_enabled" field inside our "struct lguest_data", * which the Guest can update with a single instruction. The Host knows to * check there when it wants to deliver an interrupt. *//* save_flags() is expected to return the processor state (ie. "eflags"). The * eflags word contains all kind of stuff, but in practice Linux only cares * about the interrupt flag. Our "save_flags()" just returns that. */static unsigned long save_fl(void){ return lguest_data.irq_enabled;}/* restore_flags() just sets the flags back to the value given. 
*/static void restore_fl(unsigned long flags){ lguest_data.irq_enabled = flags;}/* Interrupts go off... */static void irq_disable(void){ lguest_data.irq_enabled = 0;}/* Interrupts go on... */static void irq_enable(void){ lguest_data.irq_enabled = X86_EFLAGS_IF;}/*:*//*M:003 Note that we don't check for outstanding interrupts when we re-enable * them (or when we unmask an interrupt). This seems to work for the moment, * since interrupts are rare and we'll just get the interrupt on the next timer * tick, but when we turn on CONFIG_NO_HZ, we should revisit this. One way * would be to put the "irq_enabled" field in a page by itself, and have the * Host write-protect it when an interrupt comes in when irqs are disabled. * There will then be a page fault as soon as interrupts are re-enabled. :*//*G:034 * The Interrupt Descriptor Table (IDT). * * The IDT tells the processor what to do when an interrupt comes in. Each * entry in the table is a 64-bit descriptor: this holds the privilege level, * address of the handler, and... well, who cares? The Guest just asks the * Host to make the change anyway, because the Host controls the real IDT. */static void lguest_write_idt_entry(struct desc_struct *dt, int entrynum, u32 low, u32 high){ /* Keep the local copy up to date. */ write_dt_entry(dt, entrynum, low, high); /* Tell Host about this new entry. */ hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, low, high);}/* Changing to a different IDT is very rare: we keep the IDT up-to-date every * time it is written, so we can simply loop through all entries and tell the * Host about them. */static void lguest_load_idt(const struct Xgt_desc_struct *desc){ unsigned int i; struct desc_struct *idt = (void *)desc->address; for (i = 0; i < (desc->size+1)/8; i++) hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b);}/* * The Global Descriptor Table. * * The Intel architecture defines another table, called the Global Descriptor * Table (GDT). 
You tell the CPU where it is (and its size) using the "lgdt" * instruction, and then several other instructions refer to entries in the * table. There are three entries which the Switcher needs, so the Host simply * controls the entire thing and the Guest asks it to make changes using the * LOAD_GDT hypercall. * * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY * hypercall and use that repeatedly to load a new IDT. I don't think it * really matters, but wouldn't it be nice if they were the same? */static void lguest_load_gdt(const struct Xgt_desc_struct *desc){ BUG_ON((desc->size+1)/8 != GDT_ENTRIES); hcall(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES, 0);}/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, * then tell the Host to reload the entire thing. This operation is so rare * that this naive implementation is reasonable. */static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, u32 low, u32 high){ write_dt_entry(dt, entrynum, low, high); hcall(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES, 0);}/* OK, I lied. There are three "thread local storage" GDT entries which change * on every context switch (these three entries are how glibc implements * __thread variables). So we have a hypercall specifically for this case. */static void lguest_load_tls(struct thread_struct *t, unsigned int cpu){ /* There's one problem which normal hardware doesn't have: the Host * can't handle us removing entries we're currently using. So we clear * the GS register here: if it's needed it'll be reloaded anyway. */ loadsegment(gs, 0); lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);}/*G:038 That's enough excitement for now, back to ploughing through each of * the different pv_ops structures (we're about 1/3 of the way through). * * This is the Local Descriptor Table, another weird Intel thingy. Linux only * uses this for some strange applications like Wine. 
We don't do anything * here, so they'll get an informative and friendly Segmentation Fault. */static void lguest_set_ldt(const void *addr, unsigned entries){}/* This loads a GDT entry into the "Task Register": that entry points to a * structure called the Task State Segment. Some comments scattered though the * kernel code indicate that this used for task switching in ages past, along * with blood sacrifice and astrology. * * Now there's nothing interesting in here that we don't get told elsewhere. * But the native version uses the "ltr" instruction, which makes the Host * complain to the Guest about a Segmentation Fault and it'll oops. So we * override the native version with a do-nothing version. */static void lguest_load_tr_desc(void){}/* The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the * Pentium in 1993 and keeps getting extended by both Intel and AMD. As you * might imagine, after a decade and a half this treatment, it is now a giant * ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry * has been translated into 4 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned * off feature bits until the Guest booted. (Don't say that: you'll damage * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is * hardly future proof.) Noone's listening! They don't like you anyway, * parenthetic weirdo! * * Replacing the cpuid so we can turn features off is great for the kernel, but * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get * too worked up about it. 
*/
/* lguest_cpuid - run the real "cpuid", then filter what the kernel sees.
 *
 * @eax: in: the cpuid function (leaf) to query; out: the resulting EAX.
 * @ebx, @ecx, @edx: out: the remaining result registers.
 *
 * All four are passed straight to native_cpuid(); only functions 1 and
 * 0x80000000 have their results adjusted afterwards. */
static void lguest_cpuid(unsigned int *eax, unsigned int *ebx,
			 unsigned int *ecx, unsigned int *edx)
{
	/* Remember which function was asked for: native_cpuid() overwrites
	 * *eax.  This is unsigned because 0x80000000 does not fit in a signed
	 * int, and matching it in the switch below would otherwise depend on
	 * implementation-defined conversion. */
	unsigned int function = *eax;

	native_cpuid(eax, ebx, ecx, edx);
	switch (function) {
	case 1:	/* Basic feature request. */
		/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
		*ecx &= 0x00002201;
		/* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, FPU. */
		*edx &= 0x07808101;
		/* The Host can do a nice optimization if it knows that the
		 * kernel mappings (addresses above 0xC0000000 or whatever
		 * PAGE_OFFSET is set to) haven't changed.  But Linux calls
		 * flush_tlb_user() for both user and kernel mappings unless
		 * the Page Global Enable (PGE) feature bit is set. */
		*edx |= 0x00002000;
		break;
	case 0x80000000:
		/* Futureproof this a little: if they ask how much extended
		 * processor information there is, limit it to known fields. */
		if (*eax > 0x80000008)
			*eax = 0x80000008;
		break;
	default:
		/* Every other function is passed through unfiltered. */
		break;
	}
}

/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4.
 * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother
 * it.  The Host needs to know when the Guest wants to change them, so we have
 * a whole series of functions like read_cr0() and write_cr0().
 *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -