/******************************************************************************
 * arch/x86/mm.c
 *
 * Copyright (c) 2002-2005 K A Fraser
 * Copyright (c) 2004 Christian Limpach
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 * A description of the x86 page table API:
 *
 * Domains trap to do_mmu_update with a list of update requests.
 * This is a list of (ptr, val) pairs, where the requested operation
 * is *ptr = val.
 *
 * Reference counting of pages:
 * ----------------------------
 * Each page has two refcounts: tot_count and type_count.
 *
 * TOT_COUNT is the obvious reference count. It counts all uses of a
 * physical page frame by a domain, including uses as a page directory,
 * a page table, or simple mappings via a PTE. This count prevents a
 * domain from releasing a frame back to the free pool when it still holds
 * a reference to it.
 *
 * TYPE_COUNT is more subtle. A frame can be put to one of three
 * mutually-exclusive uses: it might be used as a page directory, or a
 * page table, or it may be mapped writable by the domain [of course, a
 * frame may not be used in any of these three ways!].
 * So, type_count is a count of the number of times a frame is being
 * referred to in its current incarnation. Therefore, a page can only
 * change its type when its type count is zero.
 *
 * Pinning the page type:
 * ----------------------
 * The type of a page can be pinned/unpinned with the commands
 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
 * pinning is not reference counted, so it can't be nested).
 * This is useful to prevent a page's type count falling to zero, at which
 * point safety checks would need to be carried out next time the count
 * is increased again.
 *
 * A further note on writable page mappings:
 * -----------------------------------------
 * For simplicity, the count of writable mappings for a page may not
 * correspond to reality. The 'writable count' is incremented for every
 * PTE which maps the page with the _PAGE_RW flag set. However, for
 * write access to be possible the page directory entry must also have
 * its _PAGE_RW bit set. We do not check this as it complicates the
 * reference counting considerably [consider the case of multiple
 * directory entries referencing a single page table, some with the RW
 * bit set, others not -- it starts getting a bit messy].
 * In normal use, this simplification shouldn't be a problem.
 * However, the logic can be added if required.
 *
 * One more note on read-only page mappings:
 * -----------------------------------------
 * We want domains to be able to map pages for read-only access. The
 * main reason is that page tables and directories should be readable
 * by a domain, but it would not be safe for them to be writable.
 * However, domains have free access to rings 1 & 2 of the Intel
 * privilege model. In terms of page protection, these are considered
 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
 * read-only restrictions are respected in supervisor mode -- if the
 * bit is clear then any mapped page is writable.
 *
 * We get round this by always setting the WP bit and disallowing
 * updates to it. This is very unlikely to cause a problem for guest
 * OS's, which will generally use the WP bit to simplify copy-on-write
 * implementation (in that case, OS wants a fault when it writes to
 * an application-supplied buffer).
 */
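/*
 * Illustrative guest-side sketch of the API described above (not code from
 * this file): one batched PTE update through do_mmu_update, then pinning an
 * L1 table.  It assumes the public interface from <public/xen.h> and the
 * usual HYPERVISOR_* hypercall wrappers a guest provides; 'pte_maddr',
 * 'new_pte', 'l1_mfn' and 'fail()' are hypothetical guest-supplied values
 * and error handling.
 *
 *     mmu_update_t u;
 *     u.ptr = pte_maddr | MMU_NORMAL_PT_UPDATE;   (machine address of PTE)
 *     u.val = new_pte;                            (requested *ptr = val)
 *     if ( HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) != 0 )
 *         fail();
 *
 *     struct mmuext_op op;
 *     op.cmd = MMUEXT_PIN_L1_TABLE;               (pin the frame's type)
 *     op.arg1.mfn = l1_mfn;
 *     if ( HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) != 0 )
 *         fail();
 */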
#include <xen/config.h>
#include <xen/init.h>
#include <xen/kernel.h>
#include <xen/lib.h>
#include <xen/mm.h>
#include <xen/domain.h>
#include <xen/sched.h>
#include <xen/errno.h>
#include <xen/perfc.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/domain_page.h>
#include <xen/event.h>
#include <xen/iocap.h>
#include <xen/guest_access.h>
#include <asm/paging.h>
#include <asm/shadow.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/io.h>
#include <asm/ldt.h>
#include <asm/x86_emulate.h>
#include <asm/e820.h>
#include <asm/hypercall.h>
#include <asm/shared.h>
#include <public/memory.h>
#include <xsm/xsm.h>
#include <xen/trace.h>

#define MEM_LOG(_f, _a...) gdprintk(XENLOG_WARNING , _f "\n" , ## _a)

/*
 * PTE updates can be done with ordinary writes except:
 *  1. Debug builds get extra checking by using CMPXCHG[8B].
 *  2. PAE builds perform an atomic 8-byte store with CMPXCHG8B.
 */
#if !defined(NDEBUG) || defined(CONFIG_X86_PAE)
#define PTE_UPDATE_WITH_CMPXCHG
#endif

/* Used to defer flushing of memory structures. */
struct percpu_mm_info {
#define DOP_FLUSH_TLB      (1<<0) /* Flush the local TLB.                    */
#define DOP_FLUSH_ALL_TLBS (1<<1) /* Flush TLBs of all VCPUs of current dom. */
#define DOP_RELOAD_LDT     (1<<2) /* Reload the LDT shadow mapping.          */
    unsigned int   deferred_ops;
    /* If non-NULL, specifies a foreign subject domain for some operations. */
    struct domain *foreign;
};
static DEFINE_PER_CPU(struct percpu_mm_info, percpu_mm_info);

/*
 * Returns the current foreign domain; defaults to the currently-executing
 * domain if a foreign override hasn't been specified.
 */
#define FOREIGNDOM (this_cpu(percpu_mm_info).foreign ?: current->domain)

/* Private domain structs for DOMID_XEN and DOMID_IO. */
static struct domain *dom_xen, *dom_io;

/* Frame table and its size in pages. */
struct page_info *frame_table;
unsigned long max_page;
unsigned long total_pages;

#define PAGE_CACHE_ATTRS (_PAGE_PAT|_PAGE_PCD|_PAGE_PWT)

#define l1_disallow_mask(d)                                     \
    ((d != dom_io) &&                                           \
     (rangeset_is_empty((d)->iomem_caps) &&                     \
      rangeset_is_empty((d)->arch.ioport_caps)) ?               \
     L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))

#ifdef CONFIG_COMPAT
l2_pgentry_t *compat_idle_pg_table_l2 = NULL;
#define l3_disallow_mask(d) (!is_pv_32on64_domain(d) ?  \
                             L3_DISALLOW_MASK :         \
                             COMPAT_L3_DISALLOW_MASK)
#else
#define l3_disallow_mask(d) L3_DISALLOW_MASK
#endif
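/*
 * Illustrative sketch (not code from this file) of how a hypercall handler
 * is expected to use the per-CPU state above: install the foreign override
 * consulted by FOREIGNDOM, accumulate flush requests in deferred_ops rather
 * than flushing per update, and drop the override once the whole batch has
 * been processed.  The real producers and consumers live in the mmu-update
 * paths further down this file.
 *
 *     this_cpu(percpu_mm_info).foreign = rd;          (subject domain 'rd')
 *     ... apply the batched page-table updates for FOREIGNDOM ...
 *     this_cpu(percpu_mm_info).deferred_ops |= DOP_FLUSH_TLB;
 *     ... at the end of the batch: flush according to deferred_ops ...
 *     this_cpu(percpu_mm_info).foreign = NULL;
 */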
static void queue_deferred_ops(struct domain *d, unsigned int ops)
{
    ASSERT(d == current->domain);
    this_cpu(percpu_mm_info).deferred_ops |= ops;
}

void __init init_frametable(void)
{
    unsigned long nr_pages, page_step, i, mfn;

    frame_table = (struct page_info *)FRAMETABLE_VIRT_START;

    nr_pages  = PFN_UP(max_page * sizeof(*frame_table));
    page_step = (1 << L2_PAGETABLE_SHIFT) >> PAGE_SHIFT;

    for ( i = 0; i < nr_pages; i += page_step )
    {
        mfn = alloc_boot_pages(min(nr_pages - i, page_step), page_step);
        if ( mfn == 0 )
            panic("Not enough memory for frame table\n");
        map_pages_to_xen(
            FRAMETABLE_VIRT_START + (i << PAGE_SHIFT),
            mfn, page_step, PAGE_HYPERVISOR);
    }

    memset(frame_table, 0, nr_pages << PAGE_SHIFT);
}

void __init arch_init_memory(void)
{
    extern void subarch_init_memory(void);

    unsigned long i, pfn, rstart_pfn, rend_pfn;

    /*
     * Initialise our DOMID_XEN domain.
     * Any Xen-heap pages that we will allow to be mapped will have
     * their domain field set to dom_xen.
     */
    dom_xen = alloc_domain(DOMID_XEN);
    BUG_ON(dom_xen == NULL);

    /*
     * Initialise our DOMID_IO domain.
     * This domain owns I/O pages that are within the range of the page_info
     * array. Mappings occur at the priv of the caller.
     */
    dom_io = alloc_domain(DOMID_IO);
    BUG_ON(dom_io == NULL);

    /* First 1MB of RAM is historically marked as I/O. */
    for ( i = 0; i < 0x100; i++ )
        share_xen_page_with_guest(mfn_to_page(i), dom_io, XENSHARE_writable);

    /* Any areas not specified as RAM by the e820 map are considered I/O. */
    for ( i = 0, pfn = 0; pfn < max_page; i++ )
    {
        while ( (i < e820.nr_map) &&
                (e820.map[i].type != E820_RAM) &&
                (e820.map[i].type != E820_UNUSABLE) )
            i++;

        if ( i >= e820.nr_map )
        {
            /* No more RAM regions: mark as I/O right to end of memory map. */
            rstart_pfn = rend_pfn = max_page;
        }
        else
        {
            /* Mark as I/O just up as far as next RAM region. */
            rstart_pfn = min_t(unsigned long, max_page,
                               PFN_UP(e820.map[i].addr));
            rend_pfn   = max_t(unsigned long, rstart_pfn,
                               PFN_DOWN(e820.map[i].addr + e820.map[i].size));
        }

        /* Mark as I/O up to next RAM region. */
        for ( ; pfn < rstart_pfn; pfn++ )
        {
            BUG_ON(!mfn_valid(pfn));
            share_xen_page_with_guest(
                mfn_to_page(pfn), dom_io, XENSHARE_writable);
        }

        /* Skip the RAM region. */
        pfn = rend_pfn;
    }

    subarch_init_memory();
}

int memory_is_conventional_ram(paddr_t p)
{
    int i;

    for ( i = 0; i < e820.nr_map; i++ )
    {
        /* The region contains p iff addr <= p < addr + size. */
        if ( (e820.map[i].type == E820_RAM) &&
             (e820.map[i].addr <= p) &&
             ((e820.map[i].addr + e820.map[i].size) > p) )
            return 1;
    }

    return 0;
}

unsigned long domain_get_maximum_gpfn(struct domain *d)
{
    if ( is_hvm_domain(d) )
        return d->arch.p2m.max_mapped_pfn;
    /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
    return arch_get_max_pfn(d) - 1;
}

void share_xen_page_with_guest(
    struct page_info *page, struct domain *d, int readonly)
{
    if ( page_get_owner(page) == d )
        return;

    set_gpfn_from_mfn(page_to_mfn(page), INVALID_M2P_ENTRY);

    spin_lock(&d->page_alloc_lock);

    /* The incremented type count pins as writable or read-only. */
    page->u.inuse.type_info  = (readonly ? PGT_none : PGT_writable_page);
    page->u.inuse.type_info |= PGT_validated | 1;

    page_set_owner(page, d);
    wmb(); /* install valid domain ptr before updating refcnt. */
    ASSERT(page->count_info == 0);

    /* Only add to the allocation list if the domain isn't dying. */
    if ( !d->is_dying )
    {
        page->count_info |= PGC_allocated | 1;
        if ( unlikely(d->xenheap_pages++ == 0) )
            get_knownalive_domain(d);
        list_add_tail(&page->list, &d->xenpage_list);
    }

    spin_unlock(&d->page_alloc_lock);
}
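/*
 * Typical use of the above (a sketch based on callers elsewhere in the tree,
 * such as the domain-creation path, not on code in this file): Xen-heap
 * pages that a guest is allowed to map, e.g. its shared_info page, are
 * handed over like this:
 *
 *     share_xen_page_with_guest(
 *         virt_to_page(d->shared_info), d, XENSHARE_writable);
 *
 * Read-only sharing passes XENSHARE_readonly instead, which pins the type
 * as PGT_none rather than PGT_writable_page.
 */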
void share_xen_page_with_privileged_guests(
    struct page_info *page, int readonly)
{
    share_xen_page_with_guest(page, dom_xen, readonly);
}

#if defined(CONFIG_X86_PAE)

#ifdef NDEBUG
/* Only PDPTs above 4GB boundary need to be shadowed in low memory. */
#define l3tab_needs_shadow(mfn) ((mfn) >= 0x100000)
#else
/*
 * In debug builds we shadow a selection of <4GB PDPTs to exercise code paths.
 * We cannot safely shadow the idle page table, nor shadow (v1) page tables
 * (detected by lack of an owning domain). As required for correctness, we
 * always shadow PDPTs above 4GB.
 */
#define l3tab_needs_shadow(mfn)                         \
    (((((mfn) << PAGE_SHIFT) != __pa(idle_pg_table)) && \
      (page_get_owner(mfn_to_page(mfn)) != NULL) &&     \
      ((mfn) & 1)) || /* odd MFNs are shadowed */       \
     ((mfn) >= 0x100000))
#endif

static l1_pgentry_t *fix_pae_highmem_pl1e;

/* Cache the address of PAE high-memory fixmap page tables. */
static int __init cache_pae_fixmap_address(void)
{
    unsigned long fixmap_base = fix_to_virt(FIX_PAE_HIGHMEM_0);
    l2_pgentry_t *pl2e = virt_to_xen_l2e(fixmap_base);
    fix_pae_highmem_pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(fixmap_base);
    return 0;
}
__initcall(cache_pae_fixmap_address);

static DEFINE_PER_CPU(u32, make_cr3_timestamp);

void make_cr3(struct vcpu *v, unsigned long mfn)
/* Takes the MFN of a PAE l3 table, copies the contents to below 4GB if
 * necessary, and sets v->arch.cr3 to the value to load in CR3. */
{
    l3_pgentry_t *highmem_l3tab, *lowmem_l3tab;
    struct pae_l3_cache *cache = &v->arch.pae_l3_cache;
    unsigned int cpu = smp_processor_id();

    /* Fast path: does this mfn need a shadow at all? */
    if ( !l3tab_needs_shadow(mfn) )
    {
        v->arch.cr3 = mfn << PAGE_SHIFT;
        /* Cache is no longer in use or valid */
        cache->high_mfn = 0;
        return;
    }

    /* Caching logic is not interrupt safe. */
    ASSERT(!in_irq());

    /* Protects against pae_flush_pgd(). */
    spin_lock(&cache->lock);

    cache->inuse_idx ^= 1;
    cache->high_mfn   = mfn;

    /* Map the guest L3 table and copy to the chosen low-memory cache. */
    l1e_write(fix_pae_highmem_pl1e-cpu, l1e_from_pfn(mfn, __PAGE_HYPERVISOR));
    /* First check the previous high mapping can't be in the TLB.
     * (i.e. have we loaded CR3 since we last did this?) */
    if ( unlikely(this_cpu(make_cr3_timestamp) == this_cpu(tlbflush_time)) )
        flush_tlb_one_local(fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu));
    highmem_l3tab = (l3_pgentry_t *)fix_to_virt(FIX_PAE_HIGHMEM_0 + cpu);
    lowmem_l3tab  = cache->table[cache->inuse_idx];
    memcpy(lowmem_l3tab, highmem_l3tab, sizeof(cache->table[0]));
    l1e_write(fix_pae_highmem_pl1e-cpu, l1e_empty());
    this_cpu(make_cr3_timestamp) = this_cpu(tlbflush_time);

    v->arch.cr3 = __pa(lowmem_l3tab);

    spin_unlock(&cache->lock);
}

#else /* !CONFIG_X86_PAE */

void make_cr3(struct vcpu *v, unsigned long mfn)
{
    v->arch.cr3 = mfn << PAGE_SHIFT;
}

#endif /* !CONFIG_X86_PAE */

void write_ptbase(struct vcpu *v)
{
    write_cr3(v->arch.cr3);
}

/*
 * Should be called after CR3 is updated.
 *
 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
 * for HVM guests, arch.monitor_table and hvm's guest CR3.
 *
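/*
 * Illustrative sketch (not code from this file) of how the helpers above are
 * driven; pagetable_get_pfn() comes from the wider tree, not this excerpt.
 * When a vcpu's top-level table changes, the new MFN is folded into
 * v->arch.cr3 (shadowing a PAE L3 below 4GB if l3tab_needs_shadow() says so),
 * and the context-switch path later loads it on that vcpu's CPU:
 *
 *     make_cr3(v, pagetable_get_pfn(v->arch.guest_table));
 *     ...
 *     write_ptbase(v);
 */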