kvm_main.c

From "Linux kernel source code" · C code · 2,825 lines total · page 1 of 5

C
2,825
字号
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include "kvm.h"
#include "x86_emulate.h"
#include "segment_descriptor.h"
#include "irq.h"

#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>

#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/io.h>
#include <asm/uaccess.h>
#include <asm/desc.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Protects vm_list (and, per its other users, hardware-enable state). */
static DEFINE_SPINLOCK(kvm_lock);
/* All VMs on the host, linked through kvm->vm_list. */
static LIST_HEAD(vm_list);

static cpumask_t cpus_hardware_enabled;

/* Architecture-specific (VMX or SVM) callback table, set by the arch module. */
struct kvm_x86_ops *kvm_x86_ops;
struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

/* Byte offset of a per-vcpu statistics counter within struct kvm_vcpu. */
#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)

/* Per-vcpu counters exported through debugfs; terminated by a NULL name. */
static struct kvm_stats_debugfs_item {
	const char *name;
	int offset;
	struct dentry *dentry;
} debugfs_entries[] = {
	{ "pf_fixed", STAT_OFFSET(pf_fixed) },
	{ "pf_guest", STAT_OFFSET(pf_guest) },
	{ "tlb_flush", STAT_OFFSET(tlb_flush) },
	{ "invlpg", STAT_OFFSET(invlpg) },
	{ "exits", STAT_OFFSET(exits) },
	{ "io_exits", STAT_OFFSET(io_exits) },
	{ "mmio_exits", STAT_OFFSET(mmio_exits) },
	{ "signal_exits", STAT_OFFSET(signal_exits) },
	{ "irq_window", STAT_OFFSET(irq_window_exits) },
	{ "halt_exits", STAT_OFFSET(halt_exits) },
	{ "halt_wakeup", STAT_OFFSET(halt_wakeup) },
	{ "request_irq", STAT_OFFSET(request_irq_exits) },
	{ "irq_exits", STAT_OFFSET(irq_exits) },
	{ "light_exits", STAT_OFFSET(light_exits) },
	{ "efer_reload", STAT_OFFSET(efer_reload) },
	{ NULL }
};

static struct dentry *debugfs_dir;

#define MAX_IO_MSRS 256

/* Bits that must be zero when the guest writes CR0/CR4/CR8; a set
 * reserved bit is answered with #GP in set_cr0()/set_cr4() below. */
#define CR0_RESERVED_BITS						\
	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR4_RESERVED_BITS						\
	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))

#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
#define EFER_RESERVED_BITS 0xfffffffffffff2fe

#ifdef CONFIG_X86_64
/* LDT or TSS descriptor in the GDT. 16 bytes. */
struct segment_descriptor_64 {
	struct segment_descriptor s;
	u32 base_higher;
	u32 pad_zero;
};

#endif

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);

/*
 * Return the linear base address of the segment named by @selector,
 * reading its descriptor from the host GDT (or from the LDT when the
 * selector's TI bit is set).  A null selector yields base 0.
 */
unsigned long segment_base(u16 selector)
{
	struct descriptor_table gdt;
	struct segment_descriptor *d;
	unsigned long table_base;
	typedef unsigned long ul;
	unsigned long v;

	if (selector == 0)
		return 0;

	asm ("sgdt %0" : "=m"(gdt));
	table_base = gdt.base;

	if (selector & 4) {           /* from ldt */
		u16 ldt_selector;

		/* The LDT is itself described by a GDT entry; recurse once. */
		asm ("sldt %0" : "=g"(ldt_selector));
		table_base = segment_base(ldt_selector);
	}
	/* selector & ~7 strips RPL/TI to get the byte index into the table. */
	d = (struct segment_descriptor *)(table_base + (selector & ~7));
	v = d->base_low | ((ul)d->base_mid << 16) | ((ul)d->base_high << 24);
#ifdef CONFIG_X86_64
	/* System descriptors of type 2 (LDT) or 9/11 (TSS) are 16 bytes
	 * wide in long mode and carry an extra high base word. */
	if (d->system == 0
	    && (d->type == 2 || d->type == 9 || d->type == 11))
		v |= ((ul)((struct segment_descriptor_64 *)d)->base_higher) << 32;
#endif
	return v;
}
EXPORT_SYMBOL_GPL(segment_base);

/* Range-check a userspace-supplied vcpu slot index. */
static inline int valid_vcpu(int n)
{
	return likely(n >= 0 && n < KVM_MAX_VCPUS);
}

/*
 * Swap the host FPU state out and the guest's in, unless the guest is
 * not using the FPU or its state is already loaded.
 */
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
	if (!vcpu->fpu_active ||
vcpu->guest_fpu_loaded)		return;	vcpu->guest_fpu_loaded = 1;	fx_save(&vcpu->host_fx_image);	fx_restore(&vcpu->guest_fx_image);}EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);void kvm_put_guest_fpu(struct kvm_vcpu *vcpu){	if (!vcpu->guest_fpu_loaded)		return;	vcpu->guest_fpu_loaded = 0;	fx_save(&vcpu->guest_fx_image);	fx_restore(&vcpu->host_fx_image);}EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);/* * Switches to specified vcpu, until a matching vcpu_put() */static void vcpu_load(struct kvm_vcpu *vcpu){	int cpu;	mutex_lock(&vcpu->mutex);	cpu = get_cpu();	preempt_notifier_register(&vcpu->preempt_notifier);	kvm_x86_ops->vcpu_load(vcpu, cpu);	put_cpu();}static void vcpu_put(struct kvm_vcpu *vcpu){	preempt_disable();	kvm_x86_ops->vcpu_put(vcpu);	preempt_notifier_unregister(&vcpu->preempt_notifier);	preempt_enable();	mutex_unlock(&vcpu->mutex);}static void ack_flush(void *_completed){}void kvm_flush_remote_tlbs(struct kvm *kvm){	int i, cpu;	cpumask_t cpus;	struct kvm_vcpu *vcpu;	cpus_clear(cpus);	for (i = 0; i < KVM_MAX_VCPUS; ++i) {		vcpu = kvm->vcpus[i];		if (!vcpu)			continue;		if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))			continue;		cpu = vcpu->cpu;		if (cpu != -1 && cpu != raw_smp_processor_id())			cpu_set(cpu, cpus);	}	smp_call_function_mask(cpus, ack_flush, NULL, 1);}int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id){	struct page *page;	int r;	mutex_init(&vcpu->mutex);	vcpu->cpu = -1;	vcpu->mmu.root_hpa = INVALID_PAGE;	vcpu->kvm = kvm;	vcpu->vcpu_id = id;	if (!irqchip_in_kernel(kvm) || id == 0)		vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;	else		vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;	init_waitqueue_head(&vcpu->wq);	page = alloc_page(GFP_KERNEL | __GFP_ZERO);	if (!page) {		r = -ENOMEM;		goto fail;	}	vcpu->run = page_address(page);	page = alloc_page(GFP_KERNEL | __GFP_ZERO);	if (!page) {		r = -ENOMEM;		goto fail_free_run;	}	vcpu->pio_data = page_address(page);	r = kvm_mmu_create(vcpu);	if (r < 0)		goto fail_free_pio_data;	return 0;fail_free_pio_data:	
free_page((unsigned long)vcpu->pio_data);fail_free_run:	free_page((unsigned long)vcpu->run);fail:	return -ENOMEM;}EXPORT_SYMBOL_GPL(kvm_vcpu_init);void kvm_vcpu_uninit(struct kvm_vcpu *vcpu){	kvm_mmu_destroy(vcpu);	if (vcpu->apic)		hrtimer_cancel(&vcpu->apic->timer.dev);	kvm_free_apic(vcpu->apic);	free_page((unsigned long)vcpu->pio_data);	free_page((unsigned long)vcpu->run);}EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);static struct kvm *kvm_create_vm(void){	struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);	if (!kvm)		return ERR_PTR(-ENOMEM);	kvm_io_bus_init(&kvm->pio_bus);	mutex_init(&kvm->lock);	INIT_LIST_HEAD(&kvm->active_mmu_pages);	kvm_io_bus_init(&kvm->mmio_bus);	spin_lock(&kvm_lock);	list_add(&kvm->vm_list, &vm_list);	spin_unlock(&kvm_lock);	return kvm;}/* * Free any memory in @free but not in @dont. */static void kvm_free_physmem_slot(struct kvm_memory_slot *free,				  struct kvm_memory_slot *dont){	int i;	if (!dont || free->phys_mem != dont->phys_mem)		if (free->phys_mem) {			for (i = 0; i < free->npages; ++i)				if (free->phys_mem[i])					__free_page(free->phys_mem[i]);			vfree(free->phys_mem);		}	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)		vfree(free->dirty_bitmap);	free->phys_mem = NULL;	free->npages = 0;	free->dirty_bitmap = NULL;}static void kvm_free_physmem(struct kvm *kvm){	int i;	for (i = 0; i < kvm->nmemslots; ++i)		kvm_free_physmem_slot(&kvm->memslots[i], NULL);}static void free_pio_guest_pages(struct kvm_vcpu *vcpu){	int i;	for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)		if (vcpu->pio.guest_pages[i]) {			__free_page(vcpu->pio.guest_pages[i]);			vcpu->pio.guest_pages[i] = NULL;		}}static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu){	vcpu_load(vcpu);	kvm_mmu_unload(vcpu);	vcpu_put(vcpu);}static void kvm_free_vcpus(struct kvm *kvm){	unsigned int i;	/*	 * Unpin any mmu pages first.	 
	 */
	for (i = 0; i < KVM_MAX_VCPUS; ++i)
		if (kvm->vcpus[i])
			kvm_unload_vcpu_mmu(kvm->vcpus[i]);
	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
		if (kvm->vcpus[i]) {
			kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
			kvm->vcpus[i] = NULL;
		}
	}
}

/* Unlink @kvm from vm_list and release all of its resources. */
static void kvm_destroy_vm(struct kvm *kvm)
{
	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_io_bus_destroy(&kvm->pio_bus);
	kvm_io_bus_destroy(&kvm->mmio_bus);
	kfree(kvm->vpic);
	kfree(kvm->vioapic);
	kvm_free_vcpus(kvm);
	kvm_free_physmem(kvm);
	kfree(kvm);
}

/* fops release for the VM fd: the last reference destroys the VM. */
static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_destroy_vm(kvm);
	return 0;
}

/* Queue a #GP(0) fault for the guest via the arch hook. */
static void inject_gp(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->inject_gp(vcpu, 0);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid (i.e. no
 * present entry has reserved bits set); on success the entries are
 * cached in vcpu->pdptrs.
 */
static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	/* CR3 bits 5..11 select one of the 32-byte-aligned PDPT groups
	 * within the page; each entry is 8 bytes (<<2 on u64 index *4...
	 * i.e. byte offset of the 4-entry table). */
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	u64 *pdpt;
	int ret;
	struct page *page;
	u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];

	mutex_lock(&vcpu->kvm->lock);
	page = gfn_to_page(vcpu->kvm, pdpt_gfn);
	if (!page) {
		ret = 0;
		goto out;
	}

	pdpt = kmap_atomic(page, KM_USER0);
	memcpy(pdpte, pdpt+offset, sizeof(pdpte));
	kunmap_atomic(pdpt, KM_USER0);

	/* Reject any present entry (bit 0) with reserved bits set. */
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
out:
	mutex_unlock(&vcpu->kvm->lock);

	return ret;
}

/*
 * Emulate a guest write to CR0: validate the new value, injecting #GP
 * on reserved bits or illegal flag combinations, then hand it to the
 * arch hook and rebuild the shadow mmu context.
 */
void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	if (cr0 & CR0_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
		       cr0, vcpu->cr0);
		inject_gp(vcpu);
		return;
	}

	/* NW set without CD is an invalid cache-control combination. */
	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
		printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
		inject_gp(vcpu);
		return;
	}

	/* Paging requires protected mode. */
	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
		printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
		       "and a clear PE flag\n");
		inject_gp(vcpu);
		return;
	}

	/* Extra checks when the guest is just now enabling paging. */
	if
(!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->shadow_efer & EFER_LME)) {
			int cs_db, cs_l;

			/* Entering long mode requires PAE... */
			if (!is_pae(vcpu)) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while PAE is disabled\n");
				inject_gp(vcpu);
				return;
			}
			/* ...and CS.L must still be clear at activation. */
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l) {
				printk(KERN_DEBUG "set_cr0: #GP, start paging "
				       "in long mode while CS.L == 1\n");
				inject_gp(vcpu);
				return;

			}
		} else
#endif
		/* 32-bit PAE paging: the PDPTRs must be loadable and valid. */
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
			printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
			       "reserved bits\n");
			inject_gp(vcpu);
			return;
		}

	}

	kvm_x86_ops->set_cr0(vcpu, cr0);
	vcpu->cr0 = cr0;

	/* Paging mode may have changed; rebuild the shadow mmu. */
	mutex_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	mutex_unlock(&vcpu->kvm->lock);
	return;
}
EXPORT_SYMBOL_GPL(set_cr0);

/*
 * Emulate the LMSW instruction: it can only update the low four bits
 * of CR0 (PE/MP/EM/TS); the rest is preserved.
 */
void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(lmsw);

/*
 * Emulate a guest write to CR4: reserved bits, long-mode PAE
 * requirement, PDPTR revalidation and the VMXE bit (not supported for
 * guests) all yield #GP; otherwise the value is handed to the arch
 * hook and the shadow mmu context is rebuilt.
 */
void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & CR4_RESERVED_BITS) {
		printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
		inject_gp(vcpu);
		return;
	}

	if (is_long_mode(vcpu)) {
		/* PAE cannot be cleared while long mode is active. */
		if (!(cr4 & X86_CR4_PAE)) {
			printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
			       "in long mode\n");
			inject_gp(vcpu);
			return;
		}
	} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
		   && !load_pdptrs(vcpu, vcpu->cr3)) {
		/* Turning PAE on while paging: PDPTRs must be valid. */
		printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
		inject_gp(vcpu);
		return;
	}

	/* Nested VMX is not supported: refuse CR4.VMXE. */
	if (cr4 & X86_CR4_VMXE) {
		printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
		inject_gp(vcpu);
		return;
	}
	kvm_x86_ops->set_cr4(vcpu, cr4);
	vcpu->cr4 = cr4;
	mutex_lock(&vcpu->kvm->lock);
	kvm_mmu_reset_context(vcpu);
	mutex_unlock(&vcpu->kvm->lock);
}
EXPORT_SYMBOL_GPL(set_cr4);

/*
 * Emulate a guest write to CR3, checking the reserved bits of the
 * active paging mode (long / PAE / non-PAE) and, for PAE, reloading
 * the PDPTRs.  Invalid values are answered with #GP.
 * (Function continues beyond this chunk.)
 */
void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (is_long_mode(vcpu)) {
		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
			printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
			inject_gp(vcpu);
			return;
		}
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				inject_gp(vcpu);
				return;
			}
			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
				printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
				       "reserved bits\n");
				inject_gp(vcpu);
				return;
			}
		} else {
			if (cr3 & CR3_NONPAE_RESERVED_BITS) {
				printk(KERN_DEBUG
				       "set_cr3: #GP, reserved bits\n");
				inject_gp(vcpu);
				return;
			}
		}
	}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?