📄 boot.c
字号:
{ /* Set up the timer interrupt (0) to go to our simple timer routine */ set_irq_handler(0, lguest_time_irq); /* Our clock structure looks like arch/x86/kernel/tsc_32.c if we can * use the TSC, otherwise it's a dumb nanosecond-resolution clock. * Either way, the "rating" is set so high that it's always chosen over * any other clocksource. */ if (lguest_data.tsc_khz) lguest_clock.mult = clocksource_khz2mult(lguest_data.tsc_khz, lguest_clock.shift); clock_base = lguest_clock_read(); clocksource_register(&lguest_clock); /* Now we've set up our clock, we can use it as the scheduler clock */ pv_time_ops.sched_clock = lguest_sched_clock; /* We can't set cpumask in the initializer: damn C limitations! Set it * here and register our timer device. */ lguest_clockevent.cpumask = cpumask_of_cpu(0); clockevents_register_device(&lguest_clockevent); /* Finally, we unblock the timer interrupt. */ enable_lguest_irq(0);}/* * Miscellaneous bits and pieces. * * Here is an oddball collection of functions which the Guest needs for things * to work. They're pretty simple. *//* The Guest needs to tell the Host what stack it expects traps to use. For * native hardware, this is part of the Task State Segment mentioned above in * lguest_load_tr_desc(), but to help hypervisors there's this special call. * * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data * segment), the privilege level (we're privilege level 1, the Host is 0 and * will not tolerate us trying to use that), the stack pointer, and the number * of pages in the stack. */static void lguest_load_esp0(struct tss_struct *tss, struct thread_struct *thread){ lazy_hcall(LHCALL_SET_STACK, __KERNEL_DS|0x1, thread->esp0, THREAD_SIZE/PAGE_SIZE);}/* Let's just say, I wouldn't do debugging under a Guest. */static void lguest_set_debugreg(int regno, unsigned long value){ /* FIXME: Implement */}/* There are times when the kernel wants to make sure that no memory writes are * caught in the cache (that they've all reached real hardware devices). This * doesn't matter for the Guest which has virtual hardware. * * On the Pentium 4 and above, cpuid() indicates that the Cache Line Flush * (clflush) instruction is available and the kernel uses that. Otherwise, it * uses the older "Write Back and Invalidate Cache" (wbinvd) instruction. * Unlike clflush, wbinvd can only be run at privilege level 0. So we can * ignore clflush, but replace wbinvd. */static void lguest_wbinvd(void){}/* If the Guest expects to have an Advanced Programmable Interrupt Controller, * we play dumb by ignoring writes and returning 0 for reads. So it's no * longer Programmable nor Controlling anything, and I don't think 8 lines of * code qualifies for Advanced. It will also never interrupt anything. It * does, however, allow us to get through the Linux boot code. */#ifdef CONFIG_X86_LOCAL_APICstatic void lguest_apic_write(unsigned long reg, unsigned long v){}static unsigned long lguest_apic_read(unsigned long reg){ return 0;}#endif/* STOP! Until an interrupt comes in. */static void lguest_safe_halt(void){ hcall(LHCALL_HALT, 0, 0, 0);}/* Perhaps CRASH isn't the best name for this hypercall, but we use it to get a * message out when we're crashing as well as elegant termination like powering * off. * * Note that the Host always prefers that the Guest speak in physical addresses * rather than virtual addresses, so we use __pa() here. */static void lguest_power_off(void){ hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);}/* * Panicing. * * Don't. But if you did, this is what happens. */static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p){ hcall(LHCALL_CRASH, __pa(p), 0, 0); /* The hcall won't return, but to keep gcc happy, we're "done". */ return NOTIFY_DONE;}static struct notifier_block paniced = { .notifier_call = lguest_panic};/* Setting up memory is fairly easy. */static __init char *lguest_memory_setup(void){ /* We do this here and not earlier because lockcheck barfs if we do it * before start_kernel() */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); /* The Linux bootloader header contains an "e820" memory map: the * Launcher populated the first entry with our memory limit. */ add_memory_region(boot_params.e820_map[0].addr, boot_params.e820_map[0].size, boot_params.e820_map[0].type); /* This string is for the boot messages. */ return "LGUEST";}/* We will eventually use the virtio console device to produce console output, * but before that is set up we use LHCALL_NOTIFY on normal memory to produce * console output. */static __init int early_put_chars(u32 vtermno, const char *buf, int count){ char scratch[17]; unsigned int len = count; /* We use a nul-terminated string, so we have to make a copy. Icky, * huh? */ if (len > sizeof(scratch) - 1) len = sizeof(scratch) - 1; scratch[len] = '\0'; memcpy(scratch, buf, len); hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0); /* This routine returns the number of bytes actually written. */ return len;}/*G:050 * Patching (Powerfully Placating Performance Pedants) * * We have already seen that pv_ops structures let us replace simple * native instructions with calls to the appropriate back end all throughout * the kernel. This allows the same kernel to run as a Guest and as a native * kernel, but it's slow because of all the indirect branches. * * Remember that David Wheeler quote about "Any problem in computer science can * be solved with another layer of indirection"? The rest of that quote is * "... But that usually will create another problem." This is the first of * those problems. * * Our current solution is to allow the paravirt back end to optionally patch * over the indirect calls to replace them with something more efficient. We * patch the four most commonly called functions: disable interrupts, enable * interrupts, restore interrupts and save interrupts. We usually have 6 or 10 * bytes to patch into: the Guest versions of these operations are small enough * that we can fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, * and these are in lguest_asm.S. *//*G:060 We construct a table from the assembler templates: */static const struct lguest_insns{ const char *start, *end;} lguest_insns[] = { [PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli }, [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti }, [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf }, [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },};/* Now our patch routine is fairly simple (based on the native one in * paravirt.c). If we have a replacement, we copy it in and return how much of * the available space we used. */static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, unsigned long addr, unsigned len){ unsigned int insn_len; /* Don't do anything special if we don't have a replacement */ if (type >= ARRAY_SIZE(lguest_insns) || !lguest_insns[type].start) return paravirt_patch_default(type, clobber, ibuf, addr, len); insn_len = lguest_insns[type].end - lguest_insns[type].start; /* Similarly if we can't fit replacement (shouldn't happen, but let's * be thorough). */ if (len < insn_len) return paravirt_patch_default(type, clobber, ibuf, addr, len); /* Copy in our instructions. */ memcpy(ibuf, lguest_insns[type].start, insn_len); return insn_len;}/*G:030 Once we get to lguest_init(), we know we're a Guest. The pv_ops * structures in the kernel provide points for (almost) every routine we have * to override to avoid privileged instructions. */__init void lguest_init(void){ /* We're under lguest, paravirt is enabled, and we're running at * privilege level 1, not 0 as normal. */ pv_info.name = "lguest"; pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = 1; /* We set up all the lguest overrides for sensitive operations. These * are detailed with the operations themselves. */ /* interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = save_fl; pv_irq_ops.restore_fl = restore_fl; pv_irq_ops.irq_disable = irq_disable; pv_irq_ops.irq_enable = irq_enable; pv_irq_ops.safe_halt = lguest_safe_halt; /* init-time operations */ pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; /* Intercepts of various cpu instructions */ pv_cpu_ops.load_gdt = lguest_load_gdt; pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; pv_cpu_ops.iret = lguest_iret; pv_cpu_ops.load_esp0 = lguest_load_esp0; pv_cpu_ops.load_tr_desc = lguest_load_tr_desc; pv_cpu_ops.set_ldt = lguest_set_ldt; pv_cpu_ops.load_tls = lguest_load_tls; pv_cpu_ops.set_debugreg = lguest_set_debugreg; pv_cpu_ops.clts = lguest_clts; pv_cpu_ops.read_cr0 = lguest_read_cr0; pv_cpu_ops.write_cr0 = lguest_write_cr0; pv_cpu_ops.read_cr4 = lguest_read_cr4; pv_cpu_ops.write_cr4 = lguest_write_cr4; pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; pv_cpu_ops.wbinvd = lguest_wbinvd; pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; /* pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; pv_mmu_ops.flush_tlb_kernel = lguest_flush_tlb_kernel; pv_mmu_ops.set_pte = lguest_set_pte; pv_mmu_ops.set_pte_at = lguest_set_pte_at; pv_mmu_ops.set_pmd = lguest_set_pmd; pv_mmu_ops.read_cr2 = lguest_read_cr2; pv_mmu_ops.read_cr3 = lguest_read_cr3; pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;#ifdef CONFIG_X86_LOCAL_APIC /* apic read/write intercepts */ pv_apic_ops.apic_write = lguest_apic_write; pv_apic_ops.apic_write_atomic = lguest_apic_write; pv_apic_ops.apic_read = lguest_apic_read;#endif /* time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; /* Now is a good time to look at the implementations of these functions * before returning to the rest of lguest_init(). */ /*G:070 Now we've seen all the paravirt_ops, we return to * lguest_init() where the rest of the fairly chaotic boot setup * occurs. */ /* The native boot code sets up initial page tables immediately after * the kernel itself, and sets init_pg_tables_end so they're not * clobbered. The Launcher places our initial pagetables somewhere at * the top of our physical memory, so we don't need extra space: set * init_pg_tables_end to the end of the kernel. */ init_pg_tables_end = __pa(pg0); /* Load the %fs segment register (the per-cpu segment register) with * the normal data segment to get through booting. */ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); /* The Host uses the top of the Guest's virtual address space for the * Host<->Guest Switcher, and it tells us how big that is in * lguest_data.reserve_mem, set up on the LGUEST_INIT hypercall. */ reserve_top_address(lguest_data.reserve_mem); /* If we don't initialize the lock dependency checker now, it crashes * paravirt_disable_iospace. */ lockdep_init(); /* The IDE code spends about 3 seconds probing for disks: if we reserve * all the I/O ports up front it can't get them and so doesn't probe. * Other device drivers are similar (but less severe). This cuts the * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ paravirt_disable_iospace(); /* This is messy CPU setup stuff which the native boot code does before * start_kernel, so we have to do, too: */ cpu_detect(&new_cpu_data); /* head.S usually sets up the first capability word, so do it here. */ new_cpu_data.x86_capability[0] = cpuid_edx(1); /* Math is always hard! */ new_cpu_data.hard_math = 1;#ifdef CONFIG_X86_MCE mce_disabled = 1;#endif#ifdef CONFIG_ACPI acpi_disabled = 1; acpi_ht = 0;#endif /* We set the perferred console to "hvc". This is the "hypervisor * virtual console" driver written by the PowerPC people, which we also * adapted for lguest's use. */ add_preferred_console("hvc", 0, NULL); /* Register our very early console. */ virtio_cons_early_init(early_put_chars); /* Last of all, we set the power management poweroff hook to point to * the Guest routine to power off. */ pm_power_off = lguest_power_off; /* Now we're set up, call start_kernel() in init/main.c and we proceed * to boot as normal. It never returns. */ start_kernel();}/* * This marks the end of stage II of our journey, The Guest. * * It is now time for us to explore the layer of virtual drivers and complete * our understanding of the Guest in "make Drivers". */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -