📄 mm.c
/*
 * Copyright (C) 2005 Intel Co
 *     Kun Tian (Kevin Tian) <kevin.tian@intel.com>
 *
 * 05/04/29 Kun Tian (Kevin Tian) <kevin.tian@intel.com> Add VTI domain support
 *
 * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp>
 *                    VA Linux Systems Japan K.K.
 *                    dom0 vp model support
 */

/*
 * NOTES on SMP
 *
 * * shared structures
 * There are some structures which are accessed by CPUs concurrently.
 * Here is the list of shared structures and the operations on them which
 * read/write the structures.
 *
 * - struct page_info
 *   This is a xen global resource. This structure can be accessed by
 *   any CPU.
 *
 *   operations on this structure:
 *   - get_page() and its variants
 *   - put_page() and its variants
 *
 * - vTLB
 *   vcpu->arch.{d, i}tlb: Software tlb cache. These are per-VCPU data.
 *   DEFINE_PER_CPU (unsigned long, vhpt_paddr): VHPT table per physical CPU.
 *
 *   domain_flush_vtlb_range() and domain_flush_vtlb_all()
 *   write vcpu->arch.{d, i}tlb and the VHPT table of a vcpu which isn't the
 *   current one, so there are potential races when reading/writing the VHPT
 *   and vcpu->arch.{d, i}tlb.
 *   Please note that the VHPT is read by the hardware page table walker.
 *
 *   operations on this structure:
 *   - global tlb purge
 *     vcpu_ptc_g(), vcpu_ptc_ga() and domain_page_flush_and_put(),
 *     i.e. the callers of domain_flush_vtlb_range() and
 *     domain_flush_vtlb_all().
 *     These functions invalidate the VHPT entry and vcpu->arch.{i, d}tlb.
 *
 *   - tlb insert and fc
 *     vcpu_itc_i()
 *     vcpu_itc_d()
 *     ia64_do_page_fault()
 *     vcpu_fc()
 *     These functions set the VHPT entry and vcpu->arch.{i, d}tlb.
 *     Actually vcpu_itc_no_srlz() does.
 *
 * - the P2M table
 *   domain->mm and the pgd, pud, pmd, pte table pages.
 *   This structure is used to convert a domain pseudo-physical address
 *   to a machine address. This is a per-domain resource.
 *
 *   operations on this structure:
 *   - populate the P2M table tree
 *     lookup_alloc_domain_pte() and its variants.
 *   - set p2m entry
 *     assign_new_domain_page() and its variants.
 *     assign_domain_page() and its variants.
 *   - xchg p2m entry
 *     assign_domain_page_replace()
 *   - cmpxchg p2m entry
 *     assign_domain_page_cmpxchg_rel()
 *     replace_grant_host_mapping()
 *     steal_page()
 *     zap_domain_page_one()
 *   - read p2m entry
 *     lookup_alloc_domain_pte() and its variants.
 *
 * - the M2P table
 *   mpt_table (or machine_to_phys_mapping)
 *   This is a table which converts from a machine address to a
 *   pseudo-physical address. This is a global structure.
 *
 *   operations on this structure:
 *   - set m2p entry
 *     set_gpfn_from_mfn()
 *   - zap m2p entry
 *     set_gpfn_from_mfn(INVALID_P2M_ENTRY)
 *   - get m2p entry
 *     get_gpfn_from_mfn()
 *
 *
 * * avoiding races
 * The resources which are shared by CPUs must be accessed carefully
 * to avoid races.
 * IA64 has weak memory ordering, so attention must be paid when accessing
 * shared structures. [SDM vol2 PartII chap. 2]
 *
 * - struct page_info memory ordering
 *   get_page() has acquire semantics.
 *   put_page() has release semantics.
 *
 * - populating the p2m table
 *   pgd, pud, pmd are append only.
 *
 * - races when updating the P2M tables and the M2P table
 *   P2M entries are shared by more than one vcpu, so they are accessed with
 *   atomic operations, i.e. xchg or cmpxchg must be used to update a p2m
 *   entry.
 *   NOTE: When creating/destructing a domain, we don't need to take care of
 *         this race.
 *
 *   The M2P table is the inverse of the P2M table,
 *   i.e. P2M(M2P(p)) = p and M2P(P2M(m)) = m.
 *   The M2P table and P2M table must be updated consistently.
 *   Here is the update sequence:
 *
 *   xchg or cmpxchg case
 *   - set_gpfn_from_mfn(new_mfn, gpfn)
 *   - memory barrier
 *   - atomic update of the p2m entry (xchg or cmpxchg the p2m entry),
 *     obtaining the old_mfn entry as a result
 *   - memory barrier
 *   - set_gpfn_from_mfn(old_mfn, INVALID_P2M_ENTRY)
 *
 *   Here the memory barrier can be provided by release semantics.
 *
 * - races between global tlb purge and tlb insert
 *   This is a race between reading/writing vcpu->arch.{d, i}tlb or a VHPT
 *   entry.
 *   When a vcpu is about to insert a tlb entry, another vcpu may purge the
 *   tlb cache globally. A tlb insert (vcpu_itc_no_srlz()) or a global tlb
 *   purge (domain_flush_vtlb_range() and domain_flush_vtlb_all()) can't
 *   update vcpu->arch.{d, i}tlb, the VHPT and the machine TLB atomically,
 *   so there is a race here.
 *
 *   Here the vcpu->arch.{d, i}tlb.p bit is checked: after inserting a tlb
 *   entry, check the p bit and retry the insert if it has been cleared.
 *   This means that when a global tlb purge and a tlb insert are issued
 *   simultaneously, the global tlb purge always takes effect after the tlb
 *   insert.
 *
 * - races between p2m entry update and tlb insert
 *   This is a race between reading/writing a p2m entry.
 *   reader: vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault(), vcpu_fc()
 *   writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
 *           steal_page(), zap_domain_page_one()
 *
 *   For example, vcpu_itc_i() is about to insert a tlb entry by calling
 *   vcpu_itc_no_srlz() after reading the p2m entry.
 *   At the same time, the p2m entry is replaced by xchg or cmpxchg and the
 *   tlb cache of the page is flushed.
 *   There is a possibility that the p2m entry no longer points to the old
 *   page, while the tlb cache still points to the old page.
 *   This can be detected, in a manner similar to a sequence lock, using the
 *   p2m entry itself: the reader remembers the value of the p2m entry it
 *   read and inserts the tlb entry, then reads the p2m entry again. If the
 *   new value is different from the value it used, it retries.
 *
 * - races between referencing a page and p2m entry update
 *   This is a race between reading/writing a p2m entry.
 *   reader: vcpu_get_domain_bundle(), vmx_get_domain_bundle(),
 *           efi_emulate_get_time()
 *   writer: assign_domain_page_cmpxchg_rel(), replace_grant_host_mapping(),
 *           steal_page(), zap_domain_page_one()
 *
 *   A page which is assigned to a domain can be de-assigned by another vcpu,
 *   so before reading from or writing to a domain page, the page's reference
 *   count must be incremented. vcpu_get_domain_bundle(),
 *   vmx_get_domain_bundle() and efi_emulate_get_time() do this.
 */

#include <xen/config.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <asm/xentypes.h>
#include <xen/mm.h>
#include <xen/errno.h>
#include <asm/pgalloc.h>
#include <asm/vhpt.h>
#include <asm/vcpu.h>
#include <asm/shadow.h>
#include <asm/p2m_entry.h>
#include <asm/tlb_track.h>
#include <linux/efi.h>
#include <linux/sort.h>
#include <xen/guest_access.h>
#include <asm/page.h>
#include <asm/dom_fw_common.h>
#include <public/memory.h>
#include <asm/event.h>
#include <asm/debugger.h>

static void domain_page_flush_and_put(struct domain* d, unsigned long mpaddr,
                                      volatile pte_t* ptep, pte_t old_pte,
                                      struct page_info* page);

extern unsigned long ia64_iobase;

struct domain *dom_xen, *dom_io;

/*
 * This number is bigger than DOMID_SELF, DOMID_XEN and DOMID_IO.
 * If more reserved domain ids are introduced, this might be increased.
 */
#define DOMID_P2M   (0x7FF8U)
static struct domain *dom_p2m;
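
/*
 * Illustrative sketch, not part of the original file: the xchg-based
 * P2M/M2P update sequence described in the NOTES comment above, written out
 * as code.  The function name is hypothetical; the real update paths are
 * assign_domain_page_replace() and the other writers listed in the comment.
 * wmb() stands in for the "memory barrier" steps, which (as the comment
 * notes) can instead be provided by release semantics of the atomic update.
 * The real code also checks that the old pte was present and its mfn valid
 * before zapping the old M2P entry.
 */
static inline unsigned long
p2m_update_sequence_sketch(volatile pte_t* ptep, pte_t npte, unsigned long gpfn)
{
    unsigned long new_mfn = pte_pfn(npte);
    unsigned long old_mfn;
    pte_t old_pte;

    /* 1. publish the M2P entry for the new page first */
    set_gpfn_from_mfn(new_mfn, gpfn);
    wmb();

    /* 2. atomically install the new p2m entry, getting the old one back */
    old_pte = __pte(xchg(&pte_val(*ptep), pte_val(npte)));
    old_mfn = pte_pfn(old_pte);
    wmb();

    /* 3. zap the M2P entry of the page which was replaced */
    set_gpfn_from_mfn(old_mfn, INVALID_M2P_ENTRY);

    return old_mfn;
}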
// The following is taken from arch_init_memory() @ xen/arch/x86/mm.c
void
alloc_dom_xen_and_dom_io(void)
{
    /*
     * Initialise our DOMID_XEN domain.
     * Any Xen-heap pages that we will allow to be mapped will have
     * their domain field set to dom_xen.
     */
    dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0);
    BUG_ON(dom_xen == NULL);

    /*
     * Initialise our DOMID_IO domain.
     * This domain owns I/O pages that are within the range of the page_info
     * array. Mappings occur at the privilege of the caller.
     */
    dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0);
    BUG_ON(dom_io == NULL);
}
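
/*
 * Illustrative sketch, not part of the original file: the sequence-lock
 * style re-check described in the NOTES comment, which closes the race
 * between a p2m entry update and a tlb insert.  The function name and the
 * placeholder for the actual insert are hypothetical; the real readers are
 * vcpu_itc_i(), vcpu_itc_d(), ia64_do_page_fault() and vcpu_fc(), which
 * end up in vcpu_itc_no_srlz().
 */
static inline void
p2m_tlb_insert_retry_sketch(volatile pte_t* ptep)
{
    pte_t old_pte;
    pte_t new_pte;

    do {
        old_pte = *ptep;            /* remember the p2m entry being used */
        if (!pte_present(old_pte))
            return;                 /* let the fault path handle it */

        /* ... insert the tlb entry translated from old_pte here ... */

        new_pte = *ptep;            /* re-read the p2m entry after insert */
        /* if it changed under us, the inserted entry may be stale: retry */
    } while (pte_val(old_pte) != pte_val(new_pte));
}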
static int
mm_teardown_can_skip(struct domain* d, unsigned long offset)
{
    return d->arch.mm_teardown_offset > offset;
}

static void
mm_teardown_update_offset(struct domain* d, unsigned long offset)
{
    d->arch.mm_teardown_offset = offset;
}

static void
mm_teardown_pte(struct domain* d, volatile pte_t* pte, unsigned long offset)
{
    pte_t old_pte;
    unsigned long mfn;
    struct page_info* page;

    old_pte = ptep_get_and_clear(&d->arch.mm, offset, pte); // acquire semantics

    // vmx domains use bits [58:56] to distinguish an io region from memory.
    // see vmx_build_physmap_table() in vmx_init.c
    if (!pte_mem(old_pte))
        return;

    // the domain might map IO space or acpi table pages. check it.
    mfn = pte_pfn(old_pte);
    if (!mfn_valid(mfn))
        return;
    page = mfn_to_page(mfn);
    BUG_ON(page_get_owner(page) == NULL);

    // a struct page_info corresponding to mfn may or may not exist depending
    // on CONFIG_VIRTUAL_FRAME_TABLE.
    // The above check is too simplistic; the right way is to check whether
    // this page belongs to an io area or to the acpi pages.

    if (pte_pgc_allocated(old_pte)) {
        BUG_ON(page_get_owner(page) != d);
        BUG_ON(get_gpfn_from_mfn(mfn) == INVALID_M2P_ENTRY);
        set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
        if (test_and_clear_bit(_PGC_allocated, &page->count_info))
            put_page(page);
    } else {
        put_page(page);
    }
}

static int
mm_teardown_pmd(struct domain* d, volatile pmd_t* pmd, unsigned long offset)
{
    unsigned long i;
    volatile pte_t* pte = pte_offset_map(pmd, offset);

    for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
        unsigned long cur_offset = offset + (i << PAGE_SHIFT);
        if (mm_teardown_can_skip(d, cur_offset + PAGE_SIZE))
            continue;
        if (!pte_present(*pte)) { // acquire semantics
            mm_teardown_update_offset(d, cur_offset);
            continue;
        }
        mm_teardown_update_offset(d, cur_offset);
        mm_teardown_pte(d, pte, cur_offset);
        if (hypercall_preempt_check())
            return -EAGAIN;
    }
    return 0;
}

static int
mm_teardown_pud(struct domain* d, volatile pud_t *pud, unsigned long offset)
{
    unsigned long i;
    volatile pmd_t *pmd = pmd_offset(pud, offset);

    for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
        unsigned long cur_offset = offset + (i << PMD_SHIFT);
        if (mm_teardown_can_skip(d, cur_offset + PMD_SIZE))
            continue;
        if (!pmd_present(*pmd)) { // acquire semantics
            mm_teardown_update_offset(d, cur_offset);
            continue;
        }
        if (mm_teardown_pmd(d, pmd, cur_offset))
            return -EAGAIN;
    }
    return 0;
}

static int
mm_teardown_pgd(struct domain* d, volatile pgd_t *pgd, unsigned long offset)
{
    unsigned long i;
    volatile pud_t *pud = pud_offset(pgd, offset);

    for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
        unsigned long cur_offset = offset + (i << PUD_SHIFT);
#ifndef __PAGETABLE_PUD_FOLDED
        if (mm_teardown_can_skip(d, cur_offset + PUD_SIZE))
            continue;
#endif
        if (!pud_present(*pud)) { // acquire semantics
#ifndef __PAGETABLE_PUD_FOLDED
            mm_teardown_update_offset(d, cur_offset);
#endif
            continue;
        }
        if (mm_teardown_pud(d, pud, cur_offset))
            return -EAGAIN;
    }
    return 0;
}

int
mm_teardown(struct domain* d)
{
    struct mm_struct* mm = &d->arch.mm;
    unsigned long i;
    volatile pgd_t* pgd;

    if (mm->pgd == NULL)
        return 0;

    pgd = pgd_offset(mm, 0);
    for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
        unsigned long cur_offset = i << PGDIR_SHIFT;

        if (mm_teardown_can_skip(d, cur_offset + PGDIR_SIZE))
            continue;
        if (!pgd_present(*pgd)) { // acquire semantics
            mm_teardown_update_offset(d, cur_offset);
            continue;
        }
        if (mm_teardown_pgd(d, pgd, cur_offset))
            return -EAGAIN;
    }

    foreign_p2m_destroy(d);
    return 0;
}
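
/*
 * Illustrative sketch, not part of the original file: how the -EAGAIN /
 * d->arch.mm_teardown_offset protocol above is meant to be driven.  The
 * loop and function name are hypothetical; in Xen the -EAGAIN is normally
 * propagated back so the domain-destruction hypercall can be continued and
 * retried, rather than spinning in a loop like this.
 */
static inline int
mm_teardown_drive_sketch(struct domain* d)
{
    int rc;

    /*
     * mm_teardown() records its progress in d->arch.mm_teardown_offset and
     * returns -EAGAIN when hypercall_preempt_check() fires; calling it again
     * skips the already-torn-down prefix via mm_teardown_can_skip().
     */
    do {
        rc = mm_teardown(d);
    } while (rc == -EAGAIN);

    return rc;
}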
static void
mm_p2m_teardown_pmd(struct domain* d, volatile pmd_t* pmd,
                    unsigned long offset)
{
    pte_free_kernel(pte_offset_map(pmd, offset));
}

static void
mm_p2m_teardown_pud(struct domain* d, volatile pud_t *pud,
                    unsigned long offset)
{
    unsigned long i;
    volatile pmd_t *pmd = pmd_offset(pud, offset);

    for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
        if (!pmd_present(*pmd))
            continue;
        mm_p2m_teardown_pmd(d, pmd, offset + (i << PMD_SHIFT));
    }
    pmd_free(pmd_offset(pud, offset));
}

static void
mm_p2m_teardown_pgd(struct domain* d, volatile pgd_t *pgd,
                    unsigned long offset)
{
    unsigned long i;
    volatile pud_t *pud = pud_offset(pgd, offset);

    for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
        if (!pud_present(*pud))
            continue;
        mm_p2m_teardown_pud(d, pud, offset + (i << PUD_SHIFT));
    }
    pud_free(pud_offset(pgd, offset));
}

static void
mm_p2m_teardown(struct domain* d)
{
    struct mm_struct* mm = &d->arch.mm;
    unsigned long i;
    volatile pgd_t* pgd;

    BUG_ON(mm->pgd == NULL);
    pgd = pgd_offset(mm, 0);
    for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
        if (!pgd_present(*pgd))
            continue;
        mm_p2m_teardown_pgd(d, pgd, i << PGDIR_SHIFT);
    }
    pgd_free(mm->pgd);
    mm->pgd = NULL;
}

void
mm_final_teardown(struct domain* d)
{
    if (d->arch.shadow_bitmap != NULL) {
        xfree(d->arch.shadow_bitmap);
        d->arch.shadow_bitmap = NULL;
    }