📄 boot.c

📁 linux 内核源代码
💻 C
📖 第 1 页 / 共 3 页
字号:
 * We start with cr0.  cr0 allows you to turn on and off all kinds of basic * features, but Linux only really cares about one: the horrifically-named Task * Switched (TS) bit at bit 3 (ie. 8) * * What does the TS bit do?  Well, it causes the CPU to trap (interrupt 7) if * the floating point unit is used.  Which allows us to restore FPU state * lazily after a task switch, and Linux uses that gratefully, but wouldn't a * name like "FPUTRAP bit" be a little less cryptic? * * We store cr0 (and cr3) locally, because the Host never changes it.  The * Guest sometimes wants to read it and we'd prefer not to bother the Host * unnecessarily. */static unsigned long current_cr0, current_cr3;static void lguest_write_cr0(unsigned long val){	lazy_hcall(LHCALL_TS, val & X86_CR0_TS, 0, 0);	current_cr0 = val;}static unsigned long lguest_read_cr0(void){	return current_cr0;}/* Intel provided a special instruction to clear the TS bit for people too cool * to use write_cr0() to do it.  This "clts" instruction is faster, because all * the vowels have been optimized out. */static void lguest_clts(void){	lazy_hcall(LHCALL_TS, 0, 0, 0);	current_cr0 &= ~X86_CR0_TS;}/* cr2 is the virtual address of the last page fault, which the Guest only ever * reads.  The Host kindly writes this into our "struct lguest_data", so we * just read it out of there. */static unsigned long lguest_read_cr2(void){	return lguest_data.cr2;}/* cr3 is the current toplevel pagetable page: the principle is the same as * cr0.  Keep a local copy, and tell the Host when it changes. */static void lguest_write_cr3(unsigned long cr3){	lazy_hcall(LHCALL_NEW_PGTABLE, cr3, 0, 0);	current_cr3 = cr3;}static unsigned long lguest_read_cr3(void){	return current_cr3;}/* cr4 is used to enable and disable PGE, but we don't care. */static unsigned long lguest_read_cr4(void){	return 0;}static void lguest_write_cr4(unsigned long val){}/* * Page Table Handling. 
 *
 * Now would be a good time to take a rest and grab a coffee or similarly
 * relaxing stimulant.  The easy parts are behind us, and the trek gradually
 * winds uphill from here.
 *
 * Quick refresher: memory is divided into "pages" of 4096 bytes each.  The CPU
 * maps virtual addresses to physical addresses using "page tables".  We could
 * use one huge index of 1 million entries: each address is 4 bytes, so that's
 * 1024 pages just to hold the page tables.   But since most virtual addresses
 * are unused, we use a two level index which saves space.  The cr3 register
 * contains the physical address of the top level "page directory" page, which
 * contains physical addresses of up to 1024 second-level pages.  Each of these
 * second level pages contains up to 1024 physical addresses of actual pages,
 * or Page Table Entries (PTEs).
 *
 * Here's a diagram, where arrows indicate physical addresses:
 *
 * cr3 ---> +---------+
 *          |      --------->+---------+
 *          |         |      | PADDR1  |
 *        Top-level   |      | PADDR2  |
 *        (PMD) page  |      |         |
 *          |         |    Lower-level |
 *          |         |    (PTE) page  |
 *          |         |      |         |
 *            ....               ....
 *
 * So to convert a virtual address to a physical address, we look up the top
 * level, which points us to the second level, which gives us the physical
 * address of that page.  If the top level entry was not present, or the second
 * level entry was not present, then the virtual address is invalid (we
 * say "the page was not mapped").
 *
 * Put another way, a 32-bit virtual address is divided up like so:
 *
 *  1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 * |<---- 10 bits ---->|<---- 10 bits ---->|<------ 12 bits ------>|
 *    Index into top     Index into second      Offset within page
 *  page directory page    pagetable page
 *
 * The kernel spends a lot of time changing both the top-level page directory
 * and lower-level pagetable pages.
The Guest doesn't know physical addresses, * so while it maintains these page tables exactly like normal, it also needs * to keep the Host informed whenever it makes a change: the Host will create * the real page tables based on the Guests'. *//* The Guest calls this to set a second-level entry (pte), ie. to map a page * into a process' address space.  We set the entry then tell the Host the * toplevel and address this corresponds to.  The Guest uses one pagetable per * process, so we need to tell the Host which one we're changing (mm->pgd). */static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,			      pte_t *ptep, pte_t pteval){	*ptep = pteval;	lazy_hcall(LHCALL_SET_PTE, __pa(mm->pgd), addr, pteval.pte_low);}/* The Guest calls this to set a top-level entry.  Again, we set the entry then * tell the Host which top-level page we changed, and the index of the entry we * changed. */static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval){	*pmdp = pmdval;	lazy_hcall(LHCALL_SET_PMD, __pa(pmdp)&PAGE_MASK,		   (__pa(pmdp)&(PAGE_SIZE-1))/4, 0);}/* There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more.  This is useless for us, since we don't * know which pagetable is changing or what address, so we just tell the Host * to forget all of them.  Fortunately, this is very rare. * * ... except in early boot when the kernel sets up the initial pagetables, * which makes booting astonishingly slow.  So we don't even tell the Host * anything changed until we've done the first page table switch. */static void lguest_set_pte(pte_t *ptep, pte_t pteval){	*ptep = pteval;	/* Don't bother with hypercall before initial setup. */	if (current_cr3)		lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);}/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations.  
On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). * * So the lguest_set_pte_at() and lguest_set_pmd() functions above are only * called when a valid entry is written, not when it's removed (ie. marked not * present).  Instead, this is where we come when the Guest wants to remove a * page table entry: we tell the Host to set that entry to 0 (ie. the present * bit is zero). */static void lguest_flush_tlb_single(unsigned long addr){	/* Simply set it to zero: if it was not, it will fault back in. */	lazy_hcall(LHCALL_SET_PTE, current_cr3, addr, 0);}/* This is what happens after the Guest has removed a large number of entries. * This tells the Host that any of the page table entries for userspace might * have changed, ie. virtual addresses below PAGE_OFFSET. */static void lguest_flush_tlb_user(void){	lazy_hcall(LHCALL_FLUSH_TLB, 0, 0, 0);}/* This is called when the kernel page tables have changed.  That's not very * common (unless the Guest is using highmem, which makes the Guest extremely * slow), so it's worth separating this from the user flushing above. */static void lguest_flush_tlb_kernel(void){	lazy_hcall(LHCALL_FLUSH_TLB, 1, 0, 0);}/* * The Unadvanced Programmable Interrupt Controller. * * This is an attempt to implement the simplest possible interrupt controller. * I spent some time looking though routines like set_irq_chip_and_handler, * set_irq_chip_and_handler_name, set_irq_chip_data and set_phasers_to_stun and * I *think* this is as simple as it gets. * * We can tell the Host what interrupts we want blocked ready for using the * lguest_data.interrupts bitmap, so disabling (aka "masking") them is as * simple as setting a bit.  We don't actually "ack" interrupts as such, we * just mask and unmask them.  I wonder if we should be cleverer? 
*/static void disable_lguest_irq(unsigned int irq){	set_bit(irq, lguest_data.blocked_interrupts);}static void enable_lguest_irq(unsigned int irq){	clear_bit(irq, lguest_data.blocked_interrupts);}/* This structure describes the lguest IRQ controller. */static struct irq_chip lguest_irq_controller = {	.name		= "lguest",	.mask		= disable_lguest_irq,	.mask_ack	= disable_lguest_irq,	.unmask		= enable_lguest_irq,};/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the * Linux infrastructure that each interrupt is controlled by our level-based * lguest interrupt controller. */static void __init lguest_init_IRQ(void){	unsigned int i;	for (i = 0; i < LGUEST_IRQS; i++) {		int vector = FIRST_EXTERNAL_VECTOR + i;		if (vector != SYSCALL_VECTOR) {			set_intr_gate(vector, interrupt[i]);			set_irq_chip_and_handler(i, &lguest_irq_controller,						 handle_level_irq);		}	}	/* This call is required to set up for 4k stacks, where we have	 * separate stacks for hard and soft interrupts. */	irq_ctx_init(smp_processor_id());}/* * Time. * * It would be far better for everyone if the Guest had its own clock, but * until then the Host gives us the time on every interrupt. */static unsigned long lguest_get_wallclock(void){	return lguest_data.time.tv_sec;}static cycle_t lguest_clock_read(void){	unsigned long sec, nsec;	/* If the Host tells the TSC speed, we can trust that. */	if (lguest_data.tsc_khz)		return native_read_tsc();	/* If we can't use the TSC, we read the time value written by the Host.	 * Since it's in two parts (seconds and nanoseconds), we risk reading	 * it just as it's changing from 99 & 0.999999999 to 100 and 0, and	 * getting 99 and 0.  As Linux tends to come apart under the stress of	 * time travel, we must be careful: */	do {		/* First we read the seconds part. 
*/		sec = lguest_data.time.tv_sec;		/* This read memory barrier tells the compiler and the CPU that		 * this can't be reordered: we have to complete the above		 * before going on. */		rmb();		/* Now we read the nanoseconds part. */		nsec = lguest_data.time.tv_nsec;		/* Make sure we've done that. */		rmb();		/* Now if the seconds part has changed, try again. */	} while (unlikely(lguest_data.time.tv_sec != sec));	/* Our non-TSC clock is in real nanoseconds. */	return sec*1000000000ULL + nsec;}/* This is what we tell the kernel is our clocksource.  */static struct clocksource lguest_clock = {	.name		= "lguest",	.rating		= 400,	.read		= lguest_clock_read,	.mask		= CLOCKSOURCE_MASK(64),	.mult		= 1 << 22,	.shift		= 22,	.flags		= CLOCK_SOURCE_IS_CONTINUOUS,};/* The "scheduler clock" is just our real clock, adjusted to start at zero */static unsigned long long lguest_sched_clock(void){	return cyc2ns(&lguest_clock, lguest_clock_read() - clock_base);}/* We also need a "struct clock_event_device": Linux asks us to set it to go * off some time in the future.  Actually, James Morris figured all this out, I * just applied the patch. */static int lguest_clockevent_set_next_event(unsigned long delta,                                           struct clock_event_device *evt){	if (delta < LG_CLOCK_MIN_DELTA) {		if (printk_ratelimit())			printk(KERN_DEBUG "%s: small delta %lu ns\n",			       __FUNCTION__, delta);		return -ETIME;	}	hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0);	return 0;}static void lguest_clockevent_set_mode(enum clock_event_mode mode,                                      struct clock_event_device *evt){	switch (mode) {	case CLOCK_EVT_MODE_UNUSED:	case CLOCK_EVT_MODE_SHUTDOWN:		/* A 0 argument shuts the clock down. */		hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0);		break;	case CLOCK_EVT_MODE_ONESHOT:		/* This is what we expect. */		break;	case CLOCK_EVT_MODE_PERIODIC:		BUG();	case CLOCK_EVT_MODE_RESUME:		break;	}}/* This describes our primitive timer chip. 
*/static struct clock_event_device lguest_clockevent = {	.name                   = "lguest",	.features               = CLOCK_EVT_FEAT_ONESHOT,	.set_next_event         = lguest_clockevent_set_next_event,	.set_mode               = lguest_clockevent_set_mode,	.rating                 = INT_MAX,	.mult                   = 1,	.shift                  = 0,	.min_delta_ns           = LG_CLOCK_MIN_DELTA,	.max_delta_ns           = LG_CLOCK_MAX_DELTA,};/* This is the Guest timer interrupt handler (hardware interrupt 0).  We just * call the clockevent infrastructure and it does whatever needs doing. */static void lguest_time_irq(unsigned int irq, struct irq_desc *desc){	unsigned long flags;	/* Don't interrupt us while this is running. */	local_irq_save(flags);	lguest_clockevent.event_handler(&lguest_clockevent);	local_irq_restore(flags);}/* At some point in the boot process, we get asked to set up our timing * infrastructure.  The kernel doesn't expect timer interrupts before this, but * we cleverly initialized the "blocked_interrupts" field of "struct * lguest_data" so that timer interrupts were blocked until now. */static void lguest_time_init(void)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -