📄 domain.c
        rc = -EFAULT;
        if ( copy_to_guest(arg, &cpu_id, 1) )
            break;

        rc = 0;
        break;
    }

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}

#ifdef __x86_64__

#define loadsegment(seg,value) ({               \
    int __r = 1;                                \
    asm volatile (                              \
        "1: movl %k1,%%" #seg "\n2:\n"          \
        ".section .fixup,\"ax\"\n"              \
        "3: xorl %k0,%k0\n"                     \
        "   movl %k0,%%" #seg "\n"              \
        "   jmp 2b\n"                           \
        ".previous\n"                           \
        ".section __ex_table,\"a\"\n"           \
        "   .align 8\n"                         \
        "   .quad 1b,3b\n"                      \
        ".previous"                             \
        : "=r" (__r) : "r" (value), "0" (__r) );\
    __r; })

/*
 * save_segments() writes a mask of segments which are dirty (non-zero),
 * allowing load_segments() to avoid some expensive segment loads and
 * MSR writes.
 */
static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
#define DIRTY_DS           0x01
#define DIRTY_ES           0x02
#define DIRTY_FS           0x04
#define DIRTY_GS           0x08
#define DIRTY_FS_BASE      0x10
#define DIRTY_GS_BASE_USER 0x20

static void load_segments(struct vcpu *n)
{
    struct vcpu_guest_context *nctxt = &n->arch.guest_context;
    int all_segs_okay = 1;
    unsigned int dirty_segment_mask, cpu = smp_processor_id();

    /* Load and clear the dirty segment mask. */
    dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
    per_cpu(dirty_segment_mask, cpu) = 0;

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_DS) | nctxt->user_regs.ds) )
        all_segs_okay &= loadsegment(ds, nctxt->user_regs.ds);

    /* Either selector != 0 ==> reload. */
    if ( unlikely((dirty_segment_mask & DIRTY_ES) | nctxt->user_regs.es) )
        all_segs_okay &= loadsegment(es, nctxt->user_regs.es);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset FS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_FS | DIRTY_FS_BASE)) |
                  nctxt->user_regs.fs) )
        all_segs_okay &= loadsegment(fs, nctxt->user_regs.fs);

    /*
     * Either selector != 0 ==> reload.
     * Also reload to reset GS_BASE if it was non-zero.
     */
    if ( unlikely((dirty_segment_mask & (DIRTY_GS | DIRTY_GS_BASE_USER)) |
                  nctxt->user_regs.gs) )
    {
        /* Reset GS_BASE with user %gs? */
        if ( (dirty_segment_mask & DIRTY_GS) || !nctxt->gs_base_user )
            all_segs_okay &= loadsegment(gs, nctxt->user_regs.gs);
    }

    if ( !is_pv_32on64_domain(n->domain) )
    {
        /* This can only be non-zero if selector is NULL. */
        if ( nctxt->fs_base )
            wrmsr(MSR_FS_BASE, nctxt->fs_base, nctxt->fs_base>>32);

        /* Most kernels have non-zero GS base, so don't bother testing. */
        /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
        wrmsr(MSR_SHADOW_GS_BASE,
              nctxt->gs_base_kernel, nctxt->gs_base_kernel>>32);

        /* This can only be non-zero if selector is NULL. */
        if ( nctxt->gs_base_user )
            wrmsr(MSR_GS_BASE, nctxt->gs_base_user, nctxt->gs_base_user>>32);

        /* If in kernel mode then switch the GS bases around. */
        if ( (n->arch.flags & TF_kernel_mode) )
            asm volatile ( "swapgs" );
    }

    if ( unlikely(!all_segs_okay) )
    {
        struct cpu_user_regs *regs = guest_cpu_user_regs();
        unsigned long *rsp =
            (n->arch.flags & TF_kernel_mode) ?
            (unsigned long *)regs->rsp :
            (unsigned long *)nctxt->kernel_sp;
        unsigned long cs_and_mask, rflags;

        if ( is_pv_32on64_domain(n->domain) )
        {
            unsigned int *esp = ring_1(regs) ?
                                (unsigned int *)regs->rsp :
                                (unsigned int *)nctxt->kernel_sp;
            unsigned int cs_and_mask, eflags;
            int ret = 0;

            /* CS longword also contains full evtchn_upcall_mask. */
            cs_and_mask = (unsigned short)regs->cs |
                ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
            /* Fold upcall mask into RFLAGS.IF. */
            eflags  = regs->_eflags & ~X86_EFLAGS_IF;
            eflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;

            if ( !ring_1(regs) )
            {
                ret  = put_user(regs->ss,       esp-1);
                ret |= put_user(regs->_esp,     esp-2);
                esp -= 2;
            }

            if ( ret |
                 put_user(eflags,              esp-1) |
                 put_user(cs_and_mask,         esp-2) |
                 put_user(regs->_eip,          esp-3) |
                 put_user(nctxt->user_regs.gs, esp-4) |
                 put_user(nctxt->user_regs.fs, esp-5) |
                 put_user(nctxt->user_regs.es, esp-6) |
                 put_user(nctxt->user_regs.ds, esp-7) )
            {
                gdprintk(XENLOG_ERR, "Error while creating compat "
                         "failsafe callback frame.\n");
                domain_crash(n->domain);
            }

            if ( test_bit(_VGCF_failsafe_disables_events,
                          &n->arch.guest_context.flags) )
                vcpu_info(n, evtchn_upcall_mask) = 1;

            regs->entry_vector  = TRAP_syscall;
            regs->_eflags      &= 0xFFFCBEFFUL;
            regs->ss            = FLAT_COMPAT_KERNEL_SS;
            regs->_esp          = (unsigned long)(esp-7);
            regs->cs            = FLAT_COMPAT_KERNEL_CS;
            regs->_eip          = nctxt->failsafe_callback_eip;
            return;
        }

        if ( !(n->arch.flags & TF_kernel_mode) )
            toggle_guest_mode(n);
        else
            regs->cs &= ~3;

        /* CS longword also contains full evtchn_upcall_mask. */
        cs_and_mask = (unsigned long)regs->cs |
            ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);

        /* Fold upcall mask into RFLAGS.IF. */
        rflags  = regs->rflags & ~X86_EFLAGS_IF;
        rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;

        if ( put_user(regs->ss,            rsp- 1) |
             put_user(regs->rsp,           rsp- 2) |
             put_user(rflags,              rsp- 3) |
             put_user(cs_and_mask,         rsp- 4) |
             put_user(regs->rip,           rsp- 5) |
             put_user(nctxt->user_regs.gs, rsp- 6) |
             put_user(nctxt->user_regs.fs, rsp- 7) |
             put_user(nctxt->user_regs.es, rsp- 8) |
             put_user(nctxt->user_regs.ds, rsp- 9) |
             put_user(regs->r11,           rsp-10) |
             put_user(regs->rcx,           rsp-11) )
        {
            gdprintk(XENLOG_ERR, "Error while creating failsafe "
                     "callback frame.\n");
            domain_crash(n->domain);
        }

        if ( test_bit(_VGCF_failsafe_disables_events,
                      &n->arch.guest_context.flags) )
            vcpu_info(n, evtchn_upcall_mask) = 1;

        regs->entry_vector  = TRAP_syscall;
        regs->rflags       &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
                                X86_EFLAGS_NT|X86_EFLAGS_TF);
        regs->ss            = FLAT_KERNEL_SS;
        regs->rsp           = (unsigned long)(rsp-11);
        regs->cs            = FLAT_KERNEL_CS;
        regs->rip           = nctxt->failsafe_callback_eip;
    }
}

static void save_segments(struct vcpu *v)
{
    struct vcpu_guest_context *ctxt = &v->arch.guest_context;
    struct cpu_user_regs      *regs = &ctxt->user_regs;
    unsigned int dirty_segment_mask = 0;

    regs->ds = read_segment_register(ds);
    regs->es = read_segment_register(es);
    regs->fs = read_segment_register(fs);
    regs->gs = read_segment_register(gs);

    if ( regs->ds )
        dirty_segment_mask |= DIRTY_DS;

    if ( regs->es )
        dirty_segment_mask |= DIRTY_ES;

    if ( regs->fs || is_pv_32on64_domain(v->domain) )
    {
        dirty_segment_mask |= DIRTY_FS;
        ctxt->fs_base = 0; /* != 0 selector kills fs_base */
    }
    else if ( ctxt->fs_base )
    {
        dirty_segment_mask |= DIRTY_FS_BASE;
    }

    if ( regs->gs || is_pv_32on64_domain(v->domain) )
    {
        dirty_segment_mask |= DIRTY_GS;
        ctxt->gs_base_user = 0; /* != 0 selector kills gs_base_user */
    }
    else if ( ctxt->gs_base_user )
    {
        dirty_segment_mask |= DIRTY_GS_BASE_USER;
    }

    this_cpu(dirty_segment_mask) = dirty_segment_mask;
}

#define switch_kernel_stack(v) ((void)0)

#elif defined(__i386__)

#define load_segments(n) ((void)0)
#define save_segments(p) ((void)0)

static inline void switch_kernel_stack(struct vcpu *v)
{
    struct tss_struct *tss = &init_tss[smp_processor_id()];
    tss->esp1 = v->arch.guest_context.kernel_sp;
    tss->ss1  = v->arch.guest_context.kernel_ss;
}

#endif /* __i386__ */

static void paravirt_ctxt_switch_from(struct vcpu *v)
{
    save_segments(v);

    /*
     * Disable debug breakpoints. We do this aggressively because if we switch
     * to an HVM guest we may load DR0-DR3 with values that can cause #DE
     * inside Xen, before we get a chance to reload DR7, and this cannot always
     * safely be handled.
     */
    if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
        write_debugreg(7, 0);
}

static void paravirt_ctxt_switch_to(struct vcpu *v)
{
    unsigned long cr4;

    set_int80_direct_trap(v);
    switch_kernel_stack(v);

    cr4 = pv_guest_cr4_to_real_cr4(v->arch.guest_context.ctrlreg[4]);
    if ( unlikely(cr4 != read_cr4()) )
        write_cr4(cr4);

    if ( unlikely(v->arch.guest_context.debugreg[7] & DR7_ACTIVE_MASK) )
    {
        write_debugreg(0, v->arch.guest_context.debugreg[0]);
        write_debugreg(1, v->arch.guest_context.debugreg[1]);
        write_debugreg(2, v->arch.guest_context.debugreg[2]);
        write_debugreg(3, v->arch.guest_context.debugreg[3]);
        write_debugreg(6, v->arch.guest_context.debugreg[6]);
        write_debugreg(7, v->arch.guest_context.debugreg[7]);
    }
}

static void __context_switch(void)
{
    struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
    unsigned int          cpu = smp_processor_id();
    struct vcpu          *p = per_cpu(curr_vcpu, cpu);
    struct vcpu          *n = current;

    ASSERT(p != n);
    ASSERT(cpus_empty(n->vcpu_dirty_cpumask));

    if ( !is_idle_vcpu(p) )
    {
        memcpy(&p->arch.guest_context.user_regs,
               stack_regs,
               CTXT_SWITCH_STACK_BYTES);
        unlazy_fpu(p);
        p->arch.ctxt_switch_from(p);
    }

    if ( !is_idle_vcpu(n) )
    {
        memcpy(stack_regs,
               &n->arch.guest_context.user_regs,
               CTXT_SWITCH_STACK_BYTES);
        n->arch.ctxt_switch_to(n);
    }

    if ( p->domain != n->domain )
        cpu_set(cpu, n->domain->domain_dirty_cpumask);
    cpu_set(cpu, n->vcpu_dirty_cpumask);

    write_ptbase(n);

    if ( p->vcpu_id != n->vcpu_id )
    {
        char gdt_load[10];
        *(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
        *(unsigned long  *)(&gdt_load[2]) = GDT_VIRT_START(n);
        asm volatile ( "lgdt %0" : "=m" (gdt_load) );
    }

    if ( p->domain != n->domain )
        cpu_clear(cpu, p->domain->domain_dirty_cpumask);
    cpu_clear(cpu, p->vcpu_dirty_cpumask);

    per_cpu(curr_vcpu, cpu) = n;
}

void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();
    cpumask_t dirty_mask = next->vcpu_dirty_cpumask;

    ASSERT(local_irq_is_enabled());

    /* Allow at most one CPU at a time to be dirty. */
    ASSERT(cpus_weight(dirty_mask) <= 1);
    if ( unlikely(!cpu_isset(cpu, dirty_mask) && !cpus_empty(dirty_mask)) )
    {
        /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
        if ( !cpus_empty(next->vcpu_dirty_cpumask) )
            flush_tlb_mask(next->vcpu_dirty_cpumask);
    }

    local_irq_disable();

    if ( is_hvm_vcpu(prev) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
        pt_save_timer(prev);

    set_current(next);

    if ( (per_cpu(curr_vcpu, cpu) == next) || is_idle_vcpu(next) )
    {
        local_irq_enable();
    }
    else
    {
        __context_switch();

#ifdef CONFIG_COMPAT
        if ( !is_hvm_vcpu(next) &&
             (is_idle_vcpu(prev) ||
              is_hvm_vcpu(prev) ||
              is_pv_32on64_vcpu(prev) != is_pv_32on64_vcpu(next)) )
        {
            uint64_t efer = read_efer();
            if ( !(efer & EFER_SCE) )
                write_efer(efer | EFER_SCE);
            flush_tlb_one_local(GDT_VIRT_START(next) +
                                FIRST_RESERVED_GDT_BYTE);
        }
#endif

        /* Re-enable interrupts before restoring state which may fault. */
        local_irq_enable();

        if ( !is_hvm_vcpu(next) )
        {
            load_LDT(next);
            load_segments(next);
        }
    }

    context_saved(prev);

    /* Update per-VCPU guest runstate shared memory area (if registered). */
    if ( !guest_handle_is_null(runstate_guest(next)) )
    {
        if ( !is_pv_32on64_domain(next->domain) )
            __copy_to_guest(runstate_guest(next), &next->runstate, 1);
#ifdef CONFIG_COMPAT
        else
        {
            struct compat_vcpu_runstate_info info;

            XLAT_vcpu_runstate_info(&info, &next->runstate);
            __copy_to_guest(next->runstate_guest.compat, &info, 1);
        }
#endif
    }

    schedule_tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    schedule_tail(same);
    BUG();
}

int __sync_lazy_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (this_cpu(curr_vcpu) != current);

    if ( switch_required )
    {
        ASSERT(current == idle_vcpu[smp_processor_id()]);
        __context_switch();
    }

    local_irq_restore(flags);

    return switch_required;
}

void sync_vcpu_execstate(struct vcpu *v)
{
    if ( cpu_isset(smp_processor_id(), v->vcpu_dirty_cpumask) )
        (void)__sync_lazy_execstate();

    /* Other cpus call __sync_lazy_execstate from flush ipi handler. */
    flush_tlb_mask(v->vcpu_dirty_cpumask);
}

struct migrate_info {
    long (*func)(void *data);
    void *data;
    void (*saved_schedule_tail)(struct vcpu *);
    cpumask_t saved_affinity;
};

static void continue_hypercall_on_cpu_helper(struct vcpu *v)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct migrate_info *info = v->arch.continue_info;
    cpumask_t mask = info->saved_affinity;
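/*
 * Illustrative aside (not part of domain.c): a minimal, self-contained
 * sketch of the dirty-segment-mask handshake that save_segments() and
 * load_segments() above rely on.  save_segments() records which selector
 * registers held non-null values when the outgoing vCPU was descheduled;
 * load_segments() then reloads a selector only if the register is dirty
 * or the incoming context wants a non-null value, skipping needless
 * segment loads.  All names below (toy_*) are hypothetical, chosen only
 * for this sketch; it compiles as a stand-alone program.
 */
#include <stdio.h>

#define DIRTY_DS 0x01
#define DIRTY_ES 0x02

struct toy_ctx { unsigned short ds, es; };

/* Stands in for the per-CPU dirty_segment_mask in the real code. */
static unsigned int toy_dirty_mask;

static void toy_save_segments(const struct toy_ctx *prev)
{
    unsigned int mask = 0;

    if ( prev->ds ) mask |= DIRTY_DS;
    if ( prev->es ) mask |= DIRTY_ES;

    toy_dirty_mask = mask;
}

static void toy_load_segments(const struct toy_ctx *next)
{
    /* Load and clear the dirty mask, as load_segments() does per CPU. */
    unsigned int mask = toy_dirty_mask;
    toy_dirty_mask = 0;

    /* Register dirty, or new selector != 0 ==> reload. */
    if ( (mask & DIRTY_DS) | next->ds )
        printf("reload %%ds with %#x\n", (unsigned int)next->ds);
    if ( (mask & DIRTY_ES) | next->es )
        printf("reload %%es with %#x\n", (unsigned int)next->es);
}

int main(void)
{
    struct toy_ctx prev = { .ds = 0x2b, .es = 0 };
    struct toy_ctx next = { .ds = 0,    .es = 0 };

    toy_save_segments(&prev);
    toy_load_segments(&next);   /* reloads %ds (dirty); skips %es (clean, null) */
    return 0;
}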