📄 boot.c
字号:
* We start with cr0. cr0 allows you to turn on and off all kinds of basic * features, but Linux only really cares about one: the horrifically-named Task * Switched (TS) bit at bit 3 (ie. 8) * * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if * the floating point unit is used. Which allows us to restore FPU state * lazily after a task switch, and Linux uses that gratefully, but wouldn't a * name like "FPUTRAP bit" be a little less cryptic? * * We store cr0 (and cr3) locally, because the Host never changes it. The * Guest sometimes wants to read it and we'd prefer not to bother the Host * unnecessarily. */static unsigned long current_cr0, current_cr3;static void lguest_write_cr0(unsigned long val){ lazy_hcall(LHCALL_TS, val & X86_CR0_TS, 0, 0); current_cr0 = val;}static unsigned long lguest_read_cr0(void){ return current_cr0;}/* Intel provided a special instruction to clear the TS bit for people too cool * to use write_cr0() to do it. This "clts" instruction is faster, because all * the vowels have been optimized out. */static void lguest_clts(void){ lazy_hcall(LHCALL_TS, 0, 0, 0); current_cr0 &= ~X86_CR0_TS;}/* cr2 is the virtual address of the last page fault, which the Guest only ever * reads. The Host kindly writes this into our "struct lguest_data", so we * just read it out of there. */static unsigned long lguest_read_cr2(void){ return lguest_data.cr2;}/* cr3 is the current toplevel pagetable page: the principle is the same as * cr0. Keep a local copy, and tell the Host when it changes. */static void lguest_write_cr3(unsigned long cr3){ lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0); current_cr3 = cr3;}static unsigned long lguest_read_cr3(void){ return current_cr3;}/* cr4 is used to enable and disable PGE, but we don't care. */static unsigned long lguest_read_cr4(void){ return 0;}static void lguest_write_cr4(unsigned long val){}/* * Page Table Handling. 
 *
 * Now would be a good time to take a rest and grab a coffee or similarly
 * relaxing stimulant. The easy parts are behind us, and the trek gradually
 * winds uphill from here.
 *
 * Quick refresher: memory is divided into "pages" of 4096 bytes each. The CPU
 * maps virtual addresses to physical addresses using "page tables". We could
 * use one huge index of 1 million entries: each address is 4 bytes, so that's
 * 1024 pages just to hold the page tables. But since most virtual addresses
 * are unused, we use a two level index which saves space. The cr3 register
 * contains the physical address of the top level "page directory" page, which
 * contains physical addresses of up to 1024 second-level pages. Each of these
 * second level pages contains up to 1024 physical addresses of actual pages,
 * or Page Table Entries (PTEs).
 *
 * Here's a diagram, where arrows indicate physical addresses:
 *
 * cr3 ---> +---------+
 *          |      --------->+---------+
 *          |         |      | PADDR1  |
 *        Top-level   |      | PADDR2  |
 *        (PMD) page  |      |         |
 *          |         |    Lower-level |
 *          |         |    (PTE) page  |
 *          |         |      |         |
 *            ....               ....
 *
 * So to convert a virtual address to a physical address, we look up the top
 * level, which points us to the second level, which gives us the physical
 * address of that page. If the top level entry was not present, or the second
 * level entry was not present, then the virtual address is invalid (we
 * say "the page was not mapped").
 *
 * Put another way, a 32-bit virtual address is divided up like so:
 *
 *  1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
 *    Index into top     Index into second      Offset within page
 *  page directory page    pagetable page
 *
 * The kernel spends a lot of time changing both the top-level page directory
 * and lower-level pagetable pages.
The Guest doesn't know physical addresses, * so while it maintains these page tables exactly like normal, it also needs * to keep the Host informed whenever it makes a change: the Host will create * the real page tables based on the Guests'. *//* The Guest calls this to set a second-level entry (pte), ie. to map a page * into a process' address space. We set the entry then tell the Host the * toplevel and address this corresponds to. The Guest uses one pagetable per * process, so we need to tell the Host which one we're changing (mm->pgd). */static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval){ *ptep = pteval; lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);}/* The Guest calls this to set a top-level entry. Again, we set the entry then * tell the Host which top-level page we changed, and the index of the entry we * changed. */static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval){ *pmdp = pmdval; lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK, (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);}/* There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't * know which pagetable is changing or what address, so we just tell the Host * to forget all of them. Fortunately, this is very rare. * * ... except in early boot when the kernel sets up the initial pagetables, * which makes booting astonishingly slow. So we don't even tell the Host * anything changed until we've done the first page table switch. */static void lguest_set_pte(pte_t *ptep, pte_t pteval){ *ptep = pteval; /* Don't bother with hypercall before initial setup. */ if (current_cr3) lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);}/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. 
On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). * * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only * called when a valid entry is written, not when it's removed (ie. marked not * present). Instead, this is where we come when the Guest wants to remove a * page table entry: we tell the Host to set that entry to 0 (ie. the present * bit is zero). */static void lguest_flush_tlb_single(unsigned long addr){ /* Simply set it to zero: if it was not, it will fault back in. */ lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);}/* This is what happens after the Guest has removed a large number of entries. * This tells the Host that any of the page table entries for userspace might * have changed, ie. virtual addresses below PAGE_OFFSET. */static void lguest_flush_tlb_user(void){ lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);}/* This is called when the kernel page tables have changed. That's not very * common (unless the Guest is using highmem, which makes the Guest extremely * slow), so it's worth separating this from the user flushing above. */static void lguest_flush_tlb_kernel(void){ lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);}/* * The Unadvanced Programmable Interrupt Controller. * * This is an attempt to implement the simplest possible interrupt controller. * I spent some time looking though routines like set_irq_chip_and_handler, * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and * I *think* this is as simple as it gets. * * We can tell the Host what interrupts we want blocked ready for using the * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as * simple as setting a bit. We don't actually "ack" interrupts as such, we * just mask and unmask them. I wonder if we should be cleverer? 
*/static void disable_lguest_irq(unsigned int irq){ set_bit(irq, lguest_data.blocked_interrupts);}static void enable_lguest_irq(unsigned int irq){ clear_bit(irq, lguest_data.blocked_interrupts);}/* This structure describes the lguest IRQ controller. */static struct irq_chip lguest_irq_controller = { .name = "lguest", .mask = disable_lguest_irq, .mask_ack = disable_lguest_irq, .unmask = enable_lguest_irq,};/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the * Linux infrastructure that each interrupt is controlled by our level-based * lguest interrupt controller. */static void __init lguest_init_IRQ(void){ unsigned int i; for (i = 0; i < LGUEST_IRQS; i++) { int vector = FIRST_EXTERNAL_VECTOR + i; if (vector != SYSCALL_VECTOR) { set_intr_gate(vector, interrupt[i]); set_irq_chip_and_handler(i, &lguest_irq_controller, handle_level_irq); } } /* This call is required to set up for 4k stacks, where we have * separate stacks for hard and soft interrupts. */ irq_ctx_init(smp_processor_id());}/* * Time. * * It would be far better for everyone if the Guest had its own clock, but * until then the Host gives us the time on every interrupt. */static unsigned long lguest_get_wallclock(void){ return lguest_data.time.tv_sec;}static cycle_t lguest_clock_read(void){ unsigned long sec, nsec; /* If the Host tells the TSC speed, we can trust that. */ if (lguest_data.tsc_khz) return native_read_tsc(); /* If we can't use the TSC, we read the time value written by the Host. * Since it's in two parts (seconds and nanoseconds), we risk reading * it just as it's changing from 99 & 0.999999999 to 100 and 0, and * getting 99 and 0. As Linux tends to come apart under the stress of * time travel, we must be careful: */ do { /* First we read the seconds part. 
*/ sec = lguest_data.time.tv_sec; /* This read memory barrier tells the compiler and the CPU that * this can't be reordered: we have to complete the above * before going on. */ rmb(); /* Now we read the nanoseconds part. */ nsec = lguest_data.time.tv_nsec; /* Make sure we've done that. */ rmb(); /* Now if the seconds part has changed, try again. */ } while (unlikely(lguest_data.time.tv_sec != sec)); /* Our non-TSC clock is in real nanoseconds. */ return sec*1000000000ULL + nsec;}/* This is what we tell the kernel is our clocksource. */static struct clocksource lguest_clock = { .name = "lguest", .rating = 400, .read = lguest_clock_read, .mask = CLOCKSOURCE_MASK(64), .mult = 1 << 22, .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS,};/* The "scheduler clock" is just our real clock, adjusted to start at zero */static unsigned long long lguest_sched_clock(void){ return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base);}/* We also need a "struct clock_event_device": Linux asks us to set it to go * off some time in the future. Actually, James Morris figured all this out, I * just applied the patch. */static int lguest_clockevent_set_next_event(unsigned long delta, struct clock_event_device *evt){ if (delta < LG_CLOCK_MIN_DELTA) { if (printk_ratelimit()) printk(KERN_DEBUG "%s: small delta %lu ns\n", __FUNCTION__, delta); return -ETIME; } hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0); return 0;}static void lguest_clockevent_set_mode(enum clock_event_mode mode, struct clock_event_device *evt){ switch (mode) { case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: /* A 0 argument shuts the clock down. */ hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0); break; case CLOCK_EVT_MODE_ONESHOT: /* This is what we expect. */ break; case CLOCK_EVT_MODE_PERIODIC: BUG(); case CLOCK_EVT_MODE_RESUME: break; }}/* This describes our primitive timer chip. 
*/
static struct clock_event_device lguest_clockevent = {
	.name = "lguest",
	/* One-shot only: lguest_clockevent_set_mode() BUG()s on periodic. */
	.features = CLOCK_EVT_FEAT_ONESHOT,
	.set_next_event = lguest_clockevent_set_next_event,
	.set_mode = lguest_clockevent_set_mode,
	.rating = INT_MAX,
	/* NOTE(review): mult 1 / shift 0 presumably means the delta handed to
	 * set_next_event is already in nanoseconds (its debug message prints
	 * "ns") -- confirm against the clockevents core. */
	.mult = 1,
	.shift = 0,
	.min_delta_ns = LG_CLOCK_MIN_DELTA,
	.max_delta_ns = LG_CLOCK_MAX_DELTA,
};

/* This is the Guest timer interrupt handler (hardware interrupt 0). We just
 * call the clockevent infrastructure and it does whatever needs doing.
 *
 * Neither @irq nor @desc is used by the body. */
static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
{
	unsigned long flags;

	/* Don't interrupt us while this is running. */
	local_irq_save(flags);
	/* event_handler is not set in the initializer above, so it is
	 * presumably installed elsewhere before this interrupt can fire --
	 * TODO confirm. */
	lguest_clockevent.event_handler(&lguest_clockevent);
	local_irq_restore(flags);
}

/* At some point in the boot process, we get asked to set up our timing
 * infrastructure. The kernel doesn't expect timer interrupts before this, but
 * we cleverly initialized the "blocked_interrupts" field of "struct
 * lguest_data" so that timer interrupts were blocked until now. */
static void lguest_time_init(void)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -