/*
 * kvm_main.c — from the Linux kernel source tree.
 * (Code-viewer banner: C code, 2,825 lines total; this chunk is page 1 of 5.)
 */
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "kvm.h"
#include "x86_emulate.h"
#include "segment_descriptor.h"
#include "irq.h"

#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>

#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/desc.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* kvm_lock protects vm_list (taken in kvm_create_vm()/kvm_destroy_vm()). */
static DEFINE_SPINLOCK(kvm_lock);
static LIST_HEAD(vm_list);

static cpumask_t cpus_hardware_enabled;

/* Arch backend (vmx or svm), installed at module init. */
struct kvm_x86_ops *kvm_x86_ops;
struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

/* Byte offset of a per-vcpu statistic counter inside struct kvm_vcpu. */
#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)

/*
 * One debugfs file per statistic; the table is NULL-terminated.
 * The dentry is filled in when the file is created.
 */
static struct kvm_stats_debugfs_item {
	const char *name;
	int offset;
	struct dentry *dentry;
} debugfs_entries[] = {
	{ "pf_fixed", STAT_OFFSET(pf_fixed) },
	{ "pf_guest", STAT_OFFSET(pf_guest) },
	{ "tlb_flush", STAT_OFFSET(tlb_flush) },
	{ "invlpg", STAT_OFFSET(invlpg) },
	{ "exits", STAT_OFFSET(exits) },
	{ "io_exits", STAT_OFFSET(io_exits) },
	{ "mmio_exits", STAT_OFFSET(mmio_exits) },
	{ "signal_exits", STAT_OFFSET(signal_exits) },
	{ "irq_window", STAT_OFFSET(irq_window_exits) },
	{ "halt_exits", STAT_OFFSET(halt_exits) },
	{ "halt_wakeup",
	  STAT_OFFSET(halt_wakeup) },
	{ "request_irq", STAT_OFFSET(request_irq_exits) },
	{ "irq_exits", STAT_OFFSET(irq_exits) },
	{ "light_exits", STAT_OFFSET(light_exits) },
	{ "efer_reload", STAT_OFFSET(efer_reload) },
	{ NULL }
};

static struct dentry *debugfs_dir;

#define MAX_IO_MSRS 256

/* Bits that must be zero when the guest writes CR0/CR4/CR8/EFER. */
#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
#define EFER_RESERVED_BITS 0xfffffffffffff2fe

#ifdef CONFIG_X86_64
/* LDT or TSS descriptor in the GDT.  16 bytes. */
struct segment_descriptor_64 {
	struct segment_descriptor s;
	u32 base_higher;
	u32 pad_zero;
};

#endif

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);

/*
 * Decode the linear base address of the host segment named by @selector.
 * Reads the GDT via sgdt; for an LDT-relative selector (bit 2 set) it
 * recurses once to find the LDT's own base.  Returns 0 for a null selector.
 * On 64-bit, system descriptors (LDT/TSS types) carry 32 extra base bits.
 */
unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct segment_descriptor *d;
	unsigned long table_base;
	typedef unsigned long ul;
	unsigned long v;

	if (selector == 0)
		return 0;

	asm ("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) {           /* from ldt */
		u16 ldt_selector;

		asm ("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	d = (struct segment_descriptor *)(table_base + (selector & ~7));
	v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
#ifdef CONFIG_X86_64
	if (d->system == 0
	    && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

/* Range check for a user-supplied vcpu slot index. */
static inline int valid_vcpu(int n)
{
	return likely(n >= 0 && n < KVM_MAX_VCPUS);
}

/*
 * Swap the host FPU image out and the guest's in, unless the guest FPU is
 * inactive or already loaded.
 */
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
		return;

	vcpu->guest_fpu_loaded = 1;
	fx_save(&vcpu->host_fx_image);
	fx_restore(&vcpu->guest_fx_image);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);

/* Save the guest FPU image and restore the host's, if the guest's is loaded. */
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->guest_fpu_loaded)
		return;

	vcpu->guest_fpu_loaded = 0;
	fx_save(&vcpu->guest_fx_image);
	fx_restore(&vcpu->host_fx_image);
}
EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);

/*
 * Switches to specified vcpu, until a matching vcpu_put().  Takes the vcpu
 * mutex, registers the preempt notifier, and loads arch state for this cpu.
 */
static void vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	mutex_lock(&vcpu->mutex);
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_x86_ops->vcpu_load(vcpu, cpu);
	put_cpu();
}

/* Undo vcpu_load(): save arch state, drop the notifier and the mutex. */
static void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_x86_ops->vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}

/* No-op callback for the cross-cpu call in kvm_flush_remote_tlbs(). */
static void ack_flush(void *_completed)
{
}

/*
 * Post a KVM_TLB_FLUSH request on every vcpu of @kvm, then IPI (via
 * smp_call_function_mask) each cpu currently hosting one of those vcpus,
 * excluding the calling cpu.  Vcpus that already had the request pending
 * are skipped.
 */
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	int i, cpu;
	cpumask_t cpus;
	struct kvm_vcpu *vcpu;

	cpus_clear(cpus);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		vcpu = kvm->vcpus[i];
		if (!vcpu)
			continue;
		if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
			continue;
		cpu = vcpu->cpu;
		if (cpu != -1 && cpu != raw_smp_processor_id())
			cpu_set(cpu, cpus);
	}
	smp_call_function_mask(cpus, ack_flush, NULL, 1);
}

/*
 * Initialize a vcpu: basic fields, the run and pio_data shared pages, and
 * the mmu.  Vcpu 0, or any vcpu when the irqchip is userspace-emulated,
 * starts RUNNABLE; others wait UNINITIALIZED (for a startup IPI,
 * presumably — confirm against the local APIC code).
 * Returns 0 on success, negative errno on failure.
 */
int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->mmu.root_hpa = INVALID_PAGE;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	if (!irqchip_in_kernel(kvm) || id == 0)
		vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
	else
		vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
	init_waitqueue_head(&vcpu->wq);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail_free_run;
	}
	vcpu->pio_data = page_address(page);

	r = kvm_mmu_create(vcpu);
	if (r < 0)
		goto fail_free_pio_data;

	return 0;

fail_free_pio_data:
	free_page((unsigned long)vcpu->pio_data);
fail_free_run:
	free_page((unsigned
		  long)vcpu->run);
fail:
	/*
	 * NOTE(review): this discards the r set by kvm_mmu_create() and
	 * always reports -ENOMEM; "return r" would preserve the real
	 * error code — confirm against callers before changing.
	 */
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

/* Tear down what kvm_vcpu_init() built, plus the in-kernel APIC if any. */
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	kvm_mmu_destroy(vcpu);
	if (vcpu->apic)
		hrtimer_cancel(&vcpu->apic->timer.dev);
	kvm_free_apic(vcpu->apic);
	free_page((unsigned long)vcpu->pio_data);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

/*
 * Allocate and minimally initialize a vm, and link it on the global
 * vm_list (under kvm_lock).  Returns ERR_PTR(-ENOMEM) on failure.
 */
static struct kvm *kvm_create_vm(void)
{
	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	kvm_io_bus_init(&kvm->pio_bus);
	mutex_init(&kvm->lock);
	INIT_LIST_HEAD(&kvm->active_mmu_pages);
	kvm_io_bus_init(&kvm->mmio_bus);
	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);
	return kvm;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
				  struct kvm_memory_slot *dont)
{
	int i;

	if (!dont || free->phys_mem != dont->phys_mem)
		if (free->phys_mem) {
			for (i = 0; i < free->npages; ++i)
				if (free->phys_mem[i])
					__free_page(free->phys_mem[i]);
			vfree(free->phys_mem);
		}

	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		vfree(free->dirty_bitmap);

	free->phys_mem = NULL;
	free->npages = 0;
	free->dirty_bitmap = NULL;
}

/* Release every memory slot of @kvm. */
static void kvm_free_physmem(struct kvm *kvm)
{
	int i;

	for (i = 0; i < kvm->nmemslots; ++i)
		kvm_free_physmem_slot(&kvm->memslots[i], NULL);
}

/* Drop and clear any pages pinned for an in-flight guest pio operation. */
static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
		if (vcpu->pio.guest_pages[i]) {
			__free_page(vcpu->pio.guest_pages[i]);
			vcpu->pio.guest_pages[i] = NULL;
		}
}

/* Unload a vcpu's mmu context with the vcpu properly loaded around it. */
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
{
	vcpu_load(vcpu);
	kvm_mmu_unload(vcpu);
	vcpu_put(vcpu);
}

/* Free every vcpu of @kvm, unloading all mmu state first. */
static void kvm_free_vcpus(struct kvm *kvm)
{
	unsigned int i;

	/*
	 * Unpin any mmu pages first.
	 */
	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		if (kvm->vcpus[i])
			kvm_unload_vcpu_mmu(kvm->vcpus[i]);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		if (kvm->vcpus[i]) {
			kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
			kvm->vcpus[i] = NULL;
		}
	}
}

/* Unlink @kvm from vm_list and release all its resources. */
static void kvm_destroy_vm(struct kvm *kvm)
{
	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_io_bus_destroy(&kvm->pio_bus);
	kvm_io_bus_destroy(&kvm->mmio_bus);
	kfree(kvm->vpic);
	kfree(kvm->vioapic);
	kvm_free_vcpus(kvm);
	kvm_free_physmem(kvm);
	kfree(kvm);
}

/* fops->release for the vm fd: the last reference destroys the vm. */
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_destroy_vm(kvm);
	return 0;
}

/* Queue a #GP(0) fault for the guest via the arch backend. */
static void inject_gp(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->inject_gp(vcpu, 0);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	/* Four 8-byte pdptes live at the 32-byte-aligned offset cr3 names. */
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	u64 *pdpt;
	int ret;
	struct page *page;
	u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];

	mutex_lock(&vcpu->kvm->lock);
	page = gfn_to_page(vcpu->kvm, pdpt_gfn);
	if (!page) {
		ret = 0;
		goto out;
	}

	pdpt = kmap_atomic(page, KM_USER0);
	memcpy(pdpte, pdpt+offset, sizeof(pdpte));
	kunmap_atomic(pdpt, KM_USER0);

	/* Reject any present pdpte with reserved bits set. */
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
out:
	mutex_unlock(&vcpu->kvm->lock);

	return ret;
}

/*
 * Emulate a guest CR0 write: validate reserved bits and the paging/long
 * mode transition rules, injecting #GP on violation, then hand the value
 * to the arch backend and reset the mmu context.
 */
void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->cr0);
		inject_gp(vcpu);
		return;
	}

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		inject_gp(vcpu);
		return;
	}

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		inject_gp(vcpu);
		return;
	}

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef \
CONFIG_X86_64
		/* Entering paging with EFER.LME set activates long mode. */
		if ((vcpu->shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				inject_gp(vcpu);
				return;
			}
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				inject_gp(vcpu);
				return;
			}
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			inject_gp(vcpu);
			return;
		}
	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
	vcpu->cr0 = cr0;

	mutex_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	mutex_unlock(&vcpu->kvm->lock);
	return;
}
EXPORT_SYMBOL_GPL(set_cr0);

/* lmsw only touches the low four bits of CR0. */
void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(lmsw);

/*
 * Emulate a guest CR4 write: check reserved bits, the PAE requirement in
 * long mode, pdptr validity when enabling PAE under paging, and forbid
 * VMXE; inject #GP on violation, otherwise update and reset the mmu.
 */
void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & CR4_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		inject_gp(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			inject_gp(vcpu);
			return;
		}
	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
		   && !load_pdptrs(vcpu, vcpu->cr3)) {
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		inject_gp(vcpu);
		return;
	}

	if (cr4 & X86_CR4_VMXE) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		inject_gp(vcpu);
		return;
	}
	kvm_x86_ops->set_cr4(vcpu, cr4);
	vcpu->cr4 = cr4;
	mutex_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	mutex_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr4);

/*
 * Emulate a guest CR3 write: validate the mode-dependent reserved bits
 * (and the pdptrs when PAE paging is active), injecting #GP on violation.
 * (Continues on the next source page.)
 */
void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			inject_gp(vcpu);
			return;
		}
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				inject_gp(vcpu);
				return;
			}
			if (is_paging(vcpu) &&
			    !load_pdptrs(vcpu, cr3)) {
				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
				       "reserved bits\n");
				inject_gp(vcpu);
				return;
			}
		} else {
			/* Non-PAE 32-bit paging. */
			if (cr3 & CR3_NONPAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				inject_gp(vcpu);
				return;
			}
		}
	}
/*
 * (Code-viewer keyboard-shortcut help, not part of the source:
 *  copy code Ctrl+C · search code Ctrl+F · full screen F11 ·
 *  increase font Ctrl+= · decrease font Ctrl+- · show shortcuts ?)
 */