# HG changeset patch # User cegger # Date 1302011049 -7200 Implement Nested-on-Nested. This allows the guest to run nested guest with hap enabled. Signed-off-by: Christoph Egger diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/hvm/hvm.c --- a/xen/arch/x86/hvm/hvm.c +++ b/xen/arch/x86/hvm/hvm.c @@ -1170,21 +1170,50 @@ void hvm_inject_exception(unsigned int t hvm_funcs.inject_exception(trapnr, errcode, cr2); } -bool_t hvm_hap_nested_page_fault(unsigned long gpa, - bool_t gla_valid, - unsigned long gla, - bool_t access_valid, - bool_t access_r, - bool_t access_w, - bool_t access_x) +int hvm_hap_nested_page_fault(unsigned long gpa, + bool_t gla_valid, + unsigned long gla, + bool_t access_valid, + bool_t access_r, + bool_t access_w, + bool_t access_x) { unsigned long gfn = gpa >> PAGE_SHIFT; p2m_type_t p2mt; p2m_access_t p2ma; mfn_t mfn; struct vcpu *v = current; - struct p2m_domain *p2m = p2m_get_hostp2m(v->domain); - + struct p2m_domain *p2m = NULL; + + /* On Nested Virtualization, walk the guest page table. + * If this succeeds, all is fine. + * If this fails, inject a nested page fault into the guest. + */ + if ( nestedhvm_enabled(v->domain) + && nestedhvm_vcpu_in_guestmode(v) + && nestedhvm_paging_mode_hap(v) ) + { + int rv; + + /* The vcpu is in guest mode and the l1 guest + * uses hap. That means 'gpa' is in l2 guest + * physical address space. + * Fix the nested p2m or inject nested page fault + * into l1 guest if not fixable. The algorithm is + * the same as for shadow paging. + */ + rv = nestedhvm_hap_nested_page_fault(v, gpa); + switch (rv) { + case NESTEDHVM_PAGEFAULT_DONE: + return 1; + case NESTEDHVM_PAGEFAULT_ERROR: + return 0; + case NESTEDHVM_PAGEFAULT_INJECT: + return -1; + } + } + + p2m = p2m_get_hostp2m(v->domain); mfn = gfn_to_mfn_type_current(p2m, gfn, &p2mt, &p2ma, p2m_guest); /* Check access permissions first, then handle faults */ @@ -1328,6 +1357,15 @@ int hvm_set_efer(uint64_t value) return X86EMUL_EXCEPTION; } + if ( nestedhvm_enabled(v->domain) && cpu_has_svm && + ((value & EFER_SVME) == 0 ) && + ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) ) + { + /* Cleared EFER.SVME: Flush all nestedp2m tables */ + p2m_flush_nestedp2m(v->domain); + nestedhvm_vcpu_reset(v); + } + value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA; v->arch.hvm_vcpu.guest_efer = value; hvm_update_guest_efer(v); @@ -1478,8 +1516,12 @@ int hvm_set_cr0(unsigned long value) v->arch.hvm_vcpu.guest_cr[0] = value; hvm_update_guest_cr(v, 0); - if ( (value ^ old_value) & X86_CR0_PG ) - paging_update_paging_modes(v); + if ( (value ^ old_value) & X86_CR0_PG ) { + if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) ) + paging_update_nestedmode(v); + else + paging_update_paging_modes(v); + } return X86EMUL_OKAY; @@ -1546,8 +1588,12 @@ int hvm_set_cr4(unsigned long value) hvm_update_guest_cr(v, 4); /* Modifying CR4.{PSE,PAE,PGE} invalidates all TLB entries, inc. Global. 
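
The hvm_set_efer() hunk above flushes the nested p2m pool only when nested HVM is enabled and a previously-set EFER.SVME bit is being cleared. A minimal standalone sketch of that transition test (assuming EFER_SVME is bit 12, as on AMD CPUs; the helper name is illustrative and not part of the patch):

#include <stdint.h>
#include <stdio.h>

#define EFER_SVME (1ULL << 12)   /* SVM-enable bit in EFER on AMD CPUs */

/* 1 when the write clears an SVME bit that was previously set: only
 * then do the nested p2m tables have to be thrown away. */
static int svme_being_cleared(uint64_t old_efer, uint64_t new_efer)
{
    return ((new_efer & EFER_SVME) == 0) &&
           ((new_efer ^ old_efer) & EFER_SVME);
}

int main(void)
{
    printf("%d\n", svme_being_cleared(EFER_SVME, 0)); /* 1: flush nested p2m */
    printf("%d\n", svme_being_cleared(0, 0));         /* 0: SVME never set   */
    printf("%d\n", svme_being_cleared(0, EFER_SVME)); /* 0: SVME being set   */
    return 0;
}
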
*/ - if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) - paging_update_paging_modes(v); + if ( (old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE) ) { + if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) ) + paging_update_nestedmode(v); + else + paging_update_paging_modes(v); + } return X86EMUL_OKAY; @@ -2060,7 +2106,7 @@ static enum hvm_copy_result __hvm_copy( void *buf, paddr_t addr, int size, unsigned int flags, uint32_t pfec) { struct vcpu *curr = current; - struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain); + struct p2m_domain *p2m; unsigned long gfn, mfn; p2m_type_t p2mt; char *p; @@ -2082,6 +2128,8 @@ static enum hvm_copy_result __hvm_copy( return HVMCOPY_unhandleable; #endif + p2m = p2m_get_hostp2m(curr->domain); + while ( todo > 0 ) { count = min_t(int, PAGE_SIZE - (addr & ~PAGE_MASK), todo); diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/hvm/nestedhvm.c --- a/xen/arch/x86/hvm/nestedhvm.c +++ b/xen/arch/x86/hvm/nestedhvm.c @@ -20,6 +20,7 @@ #include #include /* for HVM_DELIVER_NO_ERROR_CODE */ #include +#include /* for struct p2m_domain */ #include #include /* for local_event_delivery_(en|dis)able */ #include /* for paging_mode_hap() */ @@ -96,6 +97,54 @@ nestedhvm_vcpu_destroy(struct vcpu *v) return nhvm_vcpu_destroy(v); } +static void +nestedhvm_flushtlb_ipi(void *info) +{ + struct vcpu *v = current; + struct domain *d = info; + + ASSERT(d != NULL); + if (v->domain != d) { + /* This cpu doesn't belong to the domain */ + return; + } + + /* Just flush the ASID (or request a new one). + * This is cheaper than flush_tlb_local() and has + * the same desired effect. + */ + hvm_asid_flush_core(); + vcpu_nestedhvm(v).nv_p2m = NULL; +} + +void +nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m) +{ + on_selected_cpus(&p2m->p2m_dirty_cpumask, nestedhvm_flushtlb_ipi, + p2m->domain, 1); + cpus_clear(p2m->p2m_dirty_cpumask); +} + +void +nestedhvm_vmcx_flushtlbdomain(struct domain *d) +{ + on_selected_cpus(&d->domain_dirty_cpumask, nestedhvm_flushtlb_ipi, d, 1); +} + +bool_t +nestedhvm_is_n2(struct vcpu *v) +{ + if (!nestedhvm_enabled(v->domain) + || nestedhvm_vmswitch_in_progress(v) + || !nestedhvm_paging_mode_hap(v)) + return 0; + + if (nestedhvm_vcpu_in_guestmode(v)) + return 1; + + return 0; +} + /* Common shadow IO Permission bitmap */ /* There four global patterns of io bitmap each guest can diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/hvm/svm/nestedsvm.c --- a/xen/arch/x86/hvm/svm/nestedsvm.c +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -26,6 +26,7 @@ #include #include /* paging_mode_hap */ #include /* for local_event_delivery_(en|dis)able */ +#include /* p2m_get_pagetable, p2m_get_nestedp2m */ static void nestedsvm_vcpu_clgi(struct vcpu *v) @@ -320,6 +321,18 @@ static int nsvm_vmrun_permissionmap(stru return 0; } +static void nestedsvm_vmcb_set_nestedp2m(struct vcpu *v, + struct vmcb_struct *vvmcb, struct vmcb_struct *n2vmcb) +{ + struct p2m_domain *p2m; + + ASSERT(v != NULL); + ASSERT(vvmcb != NULL); + ASSERT(n2vmcb != NULL); + p2m = p2m_get_nestedp2m(v, vvmcb->_h_cr3); + n2vmcb->_h_cr3 = pagetable_get_paddr(p2m_get_pagetable(p2m)); +} + static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs) { struct nestedvcpu *nv = &vcpu_nestedhvm(v); @@ -475,6 +488,9 @@ static int nsvm_vmcb_prepare4vmrun(struc /* Nested paging mode */ if (nestedhvm_paging_mode_hap(v)) { /* host nested paging + guest nested paging. 
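
nestedhvm_vmcx_flushtlb() above sends the flush IPI only to the physical CPUs recorded in p2m->p2m_dirty_cpumask and then clears the mask, so CPUs that never entered guest mode on this nested p2m are not interrupted. A self-contained toy model of that bookkeeping (a plain 64-bit mask stands in for Xen's cpumask API; all names here are illustrative):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t toy_cpumask_t;          /* stand-in for cpumask_t */

/* A vcpu entering guest mode on this nested p2m records its pcpu. */
static void record_dirty_cpu(toy_cpumask_t *mask, unsigned int cpu)
{
    *mask |= 1ULL << cpu;
}

/* The flush targets only the recorded pcpus and then resets the mask,
 * mirroring on_selected_cpus() followed by cpus_clear() above. */
static void flush_dirty_cpus(toy_cpumask_t *mask)
{
    for (unsigned int cpu = 0; cpu < 64; cpu++)
        if (*mask & (1ULL << cpu))
            printf("cpu %u: flush ASID, drop nv_p2m\n", cpu);
    *mask = 0;
}

int main(void)
{
    toy_cpumask_t dirty = 0;
    record_dirty_cpu(&dirty, 1);
    record_dirty_cpu(&dirty, 3);
    flush_dirty_cpus(&dirty);    /* IPIs cpu 1 and cpu 3 only */
    return 0;
}
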
*/ + n2vmcb->_np_enable = 1; + + nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb); /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */ rc = hvm_set_cr3(ns_vmcb->_cr3); @@ -1318,8 +1334,20 @@ asmlinkage void nsvm_vcpu_switch(struct ret = nsvm_vcpu_vmrun(v, regs); if (ret < 0) goto vmexit; + + ASSERT(nestedhvm_vcpu_in_guestmode(v)); nv->nv_vmentry_pending = 0; - return; + } + + if (nestedhvm_vcpu_in_guestmode(v) + && nestedhvm_paging_mode_hap(v)) + { + /* In case left the l2 guest due to a physical interrupt (e.g. IPI) + * that is not for the l1 guest then we continue running the l2 guest + * but check if the nestedp2m is still valid. + */ + if (nv->nv_p2m == NULL) + nestedsvm_vmcb_set_nestedp2m(v, nv->nv_vvmcx, nv->nv_n2vmcx); } } diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/hvm/svm/svm.c --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -1014,14 +1014,16 @@ struct hvm_function_table * __init start return &svm_function_table; } -static void svm_do_nested_pgfault(paddr_t gpa) +static void svm_do_nested_pgfault(struct vcpu *v, + struct cpu_user_regs *regs, paddr_t gpa) { + int ret; unsigned long gfn = gpa >> PAGE_SHIFT; mfn_t mfn; p2m_type_t p2mt; - struct p2m_domain *p2m; + struct p2m_domain *p2m = NULL; - p2m = p2m_get_hostp2m(current->domain); + ret = hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0); if ( tb_init_done ) { @@ -1032,6 +1034,7 @@ static void svm_do_nested_pgfault(paddr_ uint32_t p2mt; } _d; + p2m = p2m_get_p2m(v); _d.gpa = gpa; _d.qualification = 0; _d.mfn = mfn_x(gfn_to_mfn_query(p2m, gfn, &_d.p2mt)); @@ -1039,14 +1042,26 @@ static void svm_do_nested_pgfault(paddr_ __trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d); } - if ( hvm_hap_nested_page_fault(gpa, 0, ~0ul, 0, 0, 0, 0) ) + switch (ret) { + case 0: + break; + case 1: return; + case -1: + ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v)); + /* inject #VMEXIT(NPF) into guest. */ + nestedsvm_vmexit_defer(v, VMEXIT_NPF, regs->error_code, gpa); + return; + } + if ( p2m == NULL ) + p2m = p2m_get_p2m(v); /* Everything else is an error. */ mfn = gfn_to_mfn_guest(p2m, gfn, &p2mt); - gdprintk(XENLOG_ERR, "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n", - gpa, mfn_x(mfn), p2mt); - domain_crash(current->domain); + gdprintk(XENLOG_ERR, + "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n", + gpa, mfn_x(mfn), p2mt); + domain_crash(v->domain); } static void svm_fpu_dirty_intercept(void) @@ -1659,6 +1674,8 @@ asmlinkage void svm_vmexit_handler(struc struct vmcb_struct *ns_vmcb = nv->nv_vvmcx; uint64_t exitinfo1, exitinfo2; + paging_update_nestedmode(v); + /* Write real exitinfo1 back into virtual vmcb. * nestedsvm_check_intercepts() expects to have the correct * exitinfo1 value there. 
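
svm_do_nested_pgfault() above is the first consumer of the widened int return value of hvm_hap_nested_page_fault(). A standalone restatement of that three-way contract (the enum names are illustrative; the patch itself uses the bare values 1, 0 and -1):

#include <stdio.h>

enum npf_result {
    NPF_ERROR  = 0,    /* not fixable: the caller crashes the domain    */
    NPF_DONE   = 1,    /* nested p2m fixed up: re-run the faulting insn */
    NPF_INJECT = -1,   /* the fault is L1's to handle: defer a          */
                       /* #VMEXIT(NPF) into the L1 guest                */
};

static const char *npf_action(int rv)
{
    switch (rv) {
    case NPF_DONE:   return "resume the L2 guest";
    case NPF_INJECT: return "inject #VMEXIT(NPF) into the L1 guest";
    case NPF_ERROR:  return "crash the domain";
    default:         return "BUG()";
    }
}

int main(void)
{
    for (int rv = -1; rv <= 1; rv++)
        printf("%2d -> %s\n", rv, npf_action(rv));
    return 0;
}
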
@@ -1948,7 +1965,7 @@ asmlinkage void svm_vmexit_handler(struc case VMEXIT_NPF: perfc_incra(svmexits, VMEXIT_NPF_PERFC); regs->error_code = vmcb->exitinfo1; - svm_do_nested_pgfault(vmcb->exitinfo2); + svm_do_nested_pgfault(v, regs, vmcb->exitinfo2); break; case VMEXIT_IRET: { diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/hap/Makefile --- a/xen/arch/x86/mm/hap/Makefile +++ b/xen/arch/x86/mm/hap/Makefile @@ -3,6 +3,7 @@ obj-y += guest_walk_2level.o obj-y += guest_walk_3level.o obj-y += guest_walk_4level.o obj-y += p2m-ept.o +obj-y += nested_hap.o guest_levels = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1))))) guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1)) diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/hap/guest_walk.c --- a/xen/arch/x86/mm/hap/guest_walk.c +++ b/xen/arch/x86/mm/hap/guest_walk.c @@ -29,24 +29,32 @@ #define _hap_gva_to_gfn(levels) hap_gva_to_gfn_##levels##_levels #define hap_gva_to_gfn(levels) _hap_gva_to_gfn(levels) +#define _hap_p2m_ga_to_gfn(levels) hap_p2m_ga_to_gfn_##levels##_levels +#define hap_p2m_ga_to_gfn(levels) _hap_p2m_ga_to_gfn(levels) + #if GUEST_PAGING_LEVELS <= CONFIG_PAGING_LEVELS #include #include unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)( - struct vcpu *v, unsigned long gva, uint32_t *pfec) + struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec) { - unsigned long cr3; + unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3]; + return hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(v, p2m, cr3, gva, pfec); +} + +unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)( + struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3, + paddr_t ga, uint32_t *pfec) +{ uint32_t missing; mfn_t top_mfn; void *top_map; p2m_type_t p2mt; walk_t gw; - struct p2m_domain *p2m = p2m_get_hostp2m(v->domain); /* Get the top-level table's MFN */ - cr3 = v->arch.hvm_vcpu.guest_cr[3]; top_mfn = gfn_to_mfn_unshare(p2m, cr3 >> PAGE_SHIFT, &p2mt, 0); if ( p2m_is_paging(p2mt) ) { @@ -72,7 +80,7 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN #if GUEST_PAGING_LEVELS == 3 top_map += (cr3 & ~(PAGE_MASK | 31)); #endif - missing = guest_walk_tables(v, p2m, gva, &gw, pfec[0], top_mfn, top_map); + missing = guest_walk_tables(v, p2m, ga, &gw, pfec[0], top_mfn, top_map); unmap_domain_page(top_map); /* Interpret the answer */ @@ -122,6 +130,15 @@ unsigned long hap_gva_to_gfn(GUEST_PAGIN return INVALID_GFN; } +unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)( + struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3, + paddr_t ga, uint32_t *pfec) +{ + gdprintk(XENLOG_ERR, + "Guest paging level is greater than host paging level!\n"); + domain_crash(v->domain); + return INVALID_GFN; +} #endif diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/hap/hap.c --- a/xen/arch/x86/mm/hap/hap.c +++ b/xen/arch/x86/mm/hap/hap.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "private.h" @@ -582,6 +583,7 @@ void hap_domain_init(struct domain *d) int hap_enable(struct domain *d, u32 mode) { unsigned int old_pages; + uint8_t i; int rv = 0; domain_pause(d); @@ -620,6 +622,12 @@ int hap_enable(struct domain *d, u32 mod goto out; } + for (i = 0; i < MAX_NESTEDP2M; i++) { + rv = p2m_alloc_table(d->arch.nested_p2m[i]); + if ( rv != 0 ) + goto out; + } + /* Now let other users see the new mode */ d->arch.paging.mode = mode | PG_HAP_enable; @@ -630,6 +638,13 @@ int hap_enable(struct domain *d, u32 mod void hap_final_teardown(struct domain *d) { + uint8_t i; + + /* Destroy nestedp2m's first */ + for (i = 0; i < MAX_NESTEDP2M; i++) { + 
p2m_teardown(d->arch.nested_p2m[i]); + } + if ( d->arch.paging.hap.total_pages != 0 ) hap_teardown(d); @@ -657,7 +672,7 @@ void hap_teardown(struct domain *d) /* release the monitor table held by each vcpu */ for_each_vcpu ( d, v ) { - if ( v->arch.paging.mode && paging_mode_external(d) ) + if ( paging_get_hostmode(v) && paging_mode_external(d) ) { mfn = pagetable_get_mfn(v->arch.monitor_table); if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) ) @@ -725,6 +740,7 @@ static const struct paging_mode hap_pagi void hap_vcpu_init(struct vcpu *v) { v->arch.paging.mode = &hap_paging_real_mode; + v->arch.paging.nestedmode = &hap_paging_real_mode; } /************************************************/ @@ -751,6 +767,15 @@ static int hap_page_fault(struct vcpu *v */ static int hap_invlpg(struct vcpu *v, unsigned long va) { + if (nestedhvm_enabled(v->domain)) { + /* Emulate INVLPGA: + * Must perform the flush right now or an other vcpu may + * use it when we use the next VMRUN emulation, otherwise. + */ + p2m_flush(v, vcpu_nestedhvm(v).nv_p2m); + return 1; + } + HAP_ERROR("Intercepted a guest INVLPG (%u:%u) with HAP enabled.\n", v->domain->domain_id, v->vcpu_id); domain_crash(v->domain); @@ -763,17 +788,22 @@ static void hap_update_cr3(struct vcpu * hvm_update_guest_cr(v, 3); } +const struct paging_mode * +hap_paging_get_mode(struct vcpu *v) +{ + return !hvm_paging_enabled(v) ? &hap_paging_real_mode : + hvm_long_mode_enabled(v) ? &hap_paging_long_mode : + hvm_pae_enabled(v) ? &hap_paging_pae_mode : + &hap_paging_protected_mode; +} + static void hap_update_paging_modes(struct vcpu *v) { struct domain *d = v->domain; hap_lock(d); - v->arch.paging.mode = - !hvm_paging_enabled(v) ? &hap_paging_real_mode : - hvm_long_mode_enabled(v) ? &hap_paging_long_mode : - hvm_pae_enabled(v) ? &hap_paging_pae_mode : - &hap_paging_protected_mode; + v->arch.paging.mode = hap_paging_get_mode(v); if ( pagetable_is_null(v->arch.monitor_table) ) { @@ -834,38 +864,70 @@ static void hap_write_p2m_entry(struct vcpu *v, unsigned long gfn, l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level) { + struct domain *d = v->domain; uint32_t old_flags; + bool_t flush_nestedp2m = 0; - hap_lock(v->domain); + /* We know always use the host p2m here, regardless if the vcpu + * is in host or guest mode. The vcpu can be in guest mode by + * a hypercall which passes a domain and chooses mostly the first + * vcpu. + * XXX This is the reason why this function can not be used re-used + * for updating the nestedp2m. Otherwise, hypercalls would randomly + * operate on host p2m and nested p2m. + */ + hap_lock(d); old_flags = l1e_get_flags(*p); + + if ( nestedhvm_enabled(d) && (old_flags & _PAGE_PRESENT) ) { + /* We are replacing a valid entry so we need to flush nested p2ms, + * unless the only change is an increase in access rights. 
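
That rule (keep the nested p2ms only if the target frame is unchanged and the rights strictly grew) can be written as a small standalone predicate. This is a simplified model, not the Xen code; the real check additionally requires nested HVM to be enabled and the old entry to be present:

#include <stdio.h>

/* Flush the nested p2ms unless the frame is unchanged and the new
 * permissions are a strict superset of the old ones; a stale nested
 * entry then only grants fewer rights than the host p2m allows. */
static int need_nestedp2m_flush(unsigned long old_mfn, unsigned long new_mfn,
                                int perms_strictly_increased)
{
    return !(old_mfn == new_mfn && perms_strictly_increased);
}

int main(void)
{
    printf("%d\n", need_nestedp2m_flush(0x1000, 0x1000, 1)); /* 0: keep  */
    printf("%d\n", need_nestedp2m_flush(0x1000, 0x2000, 1)); /* 1: flush */
    printf("%d\n", need_nestedp2m_flush(0x1000, 0x1000, 0)); /* 1: flush */
    return 0;
}
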
*/ + mfn_t omfn = _mfn(l1e_get_pfn(*p)); + mfn_t nmfn = _mfn(l1e_get_pfn(new)); + flush_nestedp2m = !( mfn_x(omfn) == mfn_x(nmfn) + && perms_strictly_increased(old_flags, l1e_get_flags(new)) ); + } + safe_write_pte(p, new); if ( (old_flags & _PAGE_PRESENT) && (level == 1 || (level == 2 && (old_flags & _PAGE_PSE))) ) - flush_tlb_mask(&v->domain->domain_dirty_cpumask); + flush_tlb_mask(&d->domain_dirty_cpumask); #if CONFIG_PAGING_LEVELS == 3 /* install P2M in monitor table for PAE Xen */ if ( level == 3 ) /* We have written to the p2m l3: need to sync the per-vcpu * copies of it in the monitor tables */ - p2m_install_entry_in_monitors(v->domain, (l3_pgentry_t *)p); + p2m_install_entry_in_monitors(d, (l3_pgentry_t *)p); #endif - hap_unlock(v->domain); + hap_unlock(d); + + if ( flush_nestedp2m ) + p2m_flush_nestedp2m(d); } static unsigned long hap_gva_to_gfn_real_mode( - struct vcpu *v, unsigned long gva, uint32_t *pfec) + struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec) { return ((paddr_t)gva >> PAGE_SHIFT); } +static unsigned long hap_p2m_ga_to_gfn_real_mode( + struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3, + paddr_t ga, uint32_t *pfec) +{ + return (ga >> PAGE_SHIFT); +} + + /* Entry points into this mode of the hap code. */ static const struct paging_mode hap_paging_real_mode = { .page_fault = hap_page_fault, .invlpg = hap_invlpg, .gva_to_gfn = hap_gva_to_gfn_real_mode, + .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_real_mode, .update_cr3 = hap_update_cr3, .update_paging_modes = hap_update_paging_modes, .write_p2m_entry = hap_write_p2m_entry, @@ -876,6 +938,7 @@ static const struct paging_mode hap_pagi .page_fault = hap_page_fault, .invlpg = hap_invlpg, .gva_to_gfn = hap_gva_to_gfn_2_levels, + .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_2_levels, .update_cr3 = hap_update_cr3, .update_paging_modes = hap_update_paging_modes, .write_p2m_entry = hap_write_p2m_entry, @@ -886,6 +949,7 @@ static const struct paging_mode hap_pagi .page_fault = hap_page_fault, .invlpg = hap_invlpg, .gva_to_gfn = hap_gva_to_gfn_3_levels, + .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_3_levels, .update_cr3 = hap_update_cr3, .update_paging_modes = hap_update_paging_modes, .write_p2m_entry = hap_write_p2m_entry, @@ -896,6 +960,7 @@ static const struct paging_mode hap_pagi .page_fault = hap_page_fault, .invlpg = hap_invlpg, .gva_to_gfn = hap_gva_to_gfn_4_levels, + .p2m_ga_to_gfn = hap_p2m_ga_to_gfn_4_levels, .update_cr3 = hap_update_cr3, .update_paging_modes = hap_update_paging_modes, .write_p2m_entry = hap_write_p2m_entry, diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/hap/nested_hap.c --- /dev/null +++ b/xen/arch/x86/mm/hap/nested_hap.c @@ -0,0 +1,236 @@ +/****************************************************************************** + * arch/x86/mm/hap/nested_hap.c + * + * Code for Nested Virtualization + * Copyright (c) 2011 Advanced Micro Devices + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "private.h" + +/* AlGORITHM for NESTED PAGE FAULT + * + * NOTATION + * Levels: L0, L1, L2 + * Guests: L1 guest, L2 guest + * Hypervisor: L0 hypervisor + * Addresses: L2-GVA, L2-GPA, L1-GVA, L1-GPA, MPA + * + * On L0, when #NPF happens, the handler function should do: + * hap_page_fault(GPA) + * { + * 1. If #NPF is from L1 guest, then we crash the guest VM (same as old + * code) + * 2. If #NPF is from L2 guest, then we continue from (3) + * 3. Get h_cr3 from L1 guest. Map h_cr3 into L0 hypervisor address space. + * 4. Walk the h_cr3 page table + * 5. - if not present, then we inject #NPF back to L1 guest and + * re-launch L1 guest (L1 guest will either treat this #NPF as MMIO, + * or fix its p2m table for L2 guest) + * 6. - if present, then we will get the a new translated value L1-GPA + * (points to L1 machine memory) + * 7. * Use L1-GPA to walk L0 P2M table + * 8. - if not present, then crash the guest (should not happen) + * 9. - if present, then we get a new translated value MPA + * (points to real machine memory) + * 10. * Finally, use GPA and MPA to walk nested_p2m + * and fix the bits. + * } + * + */ + + +/********************************************/ +/* NESTED VIRT P2M FUNCTIONS */ +/********************************************/ +/* Override macros from asm/page.h to make them work with mfn_t */ +#undef mfn_valid +#define mfn_valid(_mfn) __mfn_valid(mfn_x(_mfn)) +#undef page_to_mfn +#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg)) + +void +nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, + l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level) +{ + struct domain *d = p2m->domain; + uint32_t old_flags; + + hap_lock(d); + + old_flags = l1e_get_flags(*p); + safe_write_pte(p, new); + if (old_flags & _PAGE_PRESENT) + nestedhvm_vmcx_flushtlb(p2m); + + hap_unlock(d); +} + +/********************************************/ +/* NESTED VIRT FUNCTIONS */ +/********************************************/ +static void +nestedhap_fix_p2m(struct p2m_domain *p2m, paddr_t L2_gpa, paddr_t L0_gpa, + p2m_type_t p2mt, p2m_access_t p2ma) +{ + int rv; + ASSERT(p2m); + ASSERT(p2m->set_entry); + + rv = p2m->set_entry(p2m, L2_gpa >> PAGE_SHIFT, + page_to_mfn(maddr_to_page(L0_gpa)), + 0 /*4K*/, p2mt, p2ma); + if (rv == 0) { + gdprintk(XENLOG_ERR, + "failed to set entry for 0x%"PRIx64" -> 0x%"PRIx64"\n", + L2_gpa, L0_gpa); + BUG(); + } +} + +/* This function uses L1_gpa to walk the P2M table in L0 hypervisor. If the + * walk is successful, the translated value is returned in L0_gpa. The return + * value tells the upper level what to do. + */ +static int +nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa) +{ + mfn_t mfn; + p2m_type_t p2mt; + + /* we use gfn_to_mfn_query() function to walk L0 P2M table */ + mfn = gfn_to_mfn_query(p2m, L1_gpa >> PAGE_SHIFT, &p2mt); + + if ( p2m_is_paging(p2mt) || p2m_is_shared(p2mt) || !p2m_is_ram(p2mt) ) + return NESTEDHVM_PAGEFAULT_ERROR; + + if ( !mfn_valid(mfn) ) + return NESTEDHVM_PAGEFAULT_ERROR; + + *L0_gpa = (mfn_x(mfn) << PAGE_SHIFT) + (L1_gpa & ~PAGE_MASK); + return NESTEDHVM_PAGEFAULT_DONE; +} + +/* This function uses L2_gpa to walk the P2M page table in L1. 
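
These two walk helpers, nestedhap_walk_L0_p2m() above and nestedhap_walk_L1_p2m() below, compose translations the same way: keep the page offset and substitute the frame number returned by the walk. A standalone illustration of that arithmetic (the frame numbers are made up; only the (gfn << PAGE_SHIFT) | offset composition is taken from the code):

#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SHIFT 12
#define TOY_PAGE_MASK  (~((1UL << TOY_PAGE_SHIFT) - 1))

/* L2-GPA -> L1-GPA -> MPA: each step keeps the low 12 bits (the page
 * offset) and replaces the frame, just as the real walks do. */
static uint64_t compose(uint64_t frame, uint64_t addr)
{
    return (frame << TOY_PAGE_SHIFT) | (addr & ~TOY_PAGE_MASK);
}

int main(void)
{
    uint64_t l2_gpa = 0x1234567;  /* faulting L2 guest physical address */
    uint64_t l1_gfn = 0xabc;      /* pretend result of the L1 p2m walk  */
    uint64_t l0_mfn = 0x55aa;     /* pretend result of the L0 p2m walk  */

    uint64_t l1_gpa = compose(l1_gfn, l2_gpa);
    uint64_t mpa    = compose(l0_mfn, l1_gpa);

    printf("L2-GPA %#llx -> L1-GPA %#llx -> MPA %#llx\n",
           (unsigned long long)l2_gpa, (unsigned long long)l1_gpa,
           (unsigned long long)mpa);
    return 0;
}
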
If the + * walk is successful, the translated value is returned in + * L1_gpa. The result value tells what to do next. + */ +static int +nestedhap_walk_L1_p2m(struct vcpu *v, struct p2m_domain *p2m, + paddr_t L2_gpa, paddr_t *L1_gpa) +{ + uint32_t pfec; + unsigned long nested_cr3, gfn; + const struct paging_mode *mode = paging_get_hostmode(v); + + nested_cr3 = nhvm_vcpu_hostcr3(v); + + /* walk the guest table */ + gfn = paging_p2m_ga_to_gfn(v, p2m, mode, nested_cr3, L2_gpa, &pfec); + + if ( gfn == INVALID_GFN ) + return NESTEDHVM_PAGEFAULT_INJECT; + + *L1_gpa = (gfn << PAGE_SHIFT) + (L2_gpa & ~PAGE_MASK); + return NESTEDHVM_PAGEFAULT_DONE; +} + +/* + * The following function, nestedhap_page_fault(), is for steps (3)--(10). + * + * Returns: + */ +int +nestedhvm_hap_nested_page_fault(struct vcpu *v, paddr_t L2_gpa) +{ + int rv; + paddr_t L1_gpa, L0_gpa; + struct domain *d = v->domain; + struct p2m_domain *p2m, *nested_p2m; + + p2m = p2m_get_hostp2m(d); /* L0 p2m */ + nested_p2m = p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v)); + + /* walk the L1 P2M table, note we have to pass p2m + * and not nested_p2m here or we fail the walk forever, + * otherwise. */ + rv = nestedhap_walk_L1_p2m(v, p2m, L2_gpa, &L1_gpa); + + /* let caller to handle these two cases */ + switch (rv) { + case NESTEDHVM_PAGEFAULT_INJECT: + return rv; + case NESTEDHVM_PAGEFAULT_ERROR: + return rv; + case NESTEDHVM_PAGEFAULT_DONE: + break; + default: + BUG(); + break; + } + + /* ==> we have to walk L0 P2M */ + rv = nestedhap_walk_L0_p2m(p2m, L1_gpa, &L0_gpa); + + /* let upper level caller to handle these two cases */ + switch (rv) { + case NESTEDHVM_PAGEFAULT_INJECT: + return rv; + case NESTEDHVM_PAGEFAULT_ERROR: + return rv; + case NESTEDHVM_PAGEFAULT_DONE: + break; + default: + BUG(); + break; + } + + nestedp2m_lock(d); + /* fix p2m_get_pagetable(nested_p2m) */ + nestedhap_fix_p2m(nested_p2m, L2_gpa, L0_gpa, + p2m_ram_rw, + p2m_access_rwx /* FIXME: Should use same permission as l1 guest */); + nestedp2m_unlock(d); + + return NESTEDHVM_PAGEFAULT_DONE; +} + +/********************************************/ +/* NESTED VIRT INITIALIZATION FUNCS */ +/********************************************/ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/hap/private.h --- a/xen/arch/x86/mm/hap/private.h +++ b/xen/arch/x86/mm/hap/private.h @@ -23,11 +23,27 @@ /********************************************/ /* GUEST TRANSLATION FUNCS */ /********************************************/ -unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v, unsigned long gva, +unsigned long hap_gva_to_gfn_2_levels(struct vcpu *v, + struct p2m_domain *p2m, + unsigned long gva, uint32_t *pfec); -unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v, unsigned long gva, +unsigned long hap_gva_to_gfn_3_levels(struct vcpu *v, + struct p2m_domain *p2m, + unsigned long gva, uint32_t *pfec); -unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, unsigned long gva, +unsigned long hap_gva_to_gfn_4_levels(struct vcpu *v, + struct p2m_domain *p2m, + unsigned long gva, uint32_t *pfec); +unsigned long hap_p2m_ga_to_gfn_2_levels(struct vcpu *v, + struct p2m_domain *p2m, unsigned long cr3, + paddr_t ga, uint32_t *pfec); +unsigned long hap_p2m_ga_to_gfn_3_levels(struct vcpu *v, + struct p2m_domain *p2m, unsigned long cr3, + paddr_t ga, uint32_t *pfec); +unsigned long hap_p2m_ga_to_gfn_4_levels(struct vcpu *v, + struct p2m_domain *p2m, unsigned 
long cr3, + paddr_t ga, uint32_t *pfec); + #endif /* __HAP_PRIVATE_H__ */ diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/p2m.c --- a/xen/arch/x86/mm/p2m.c +++ b/xen/arch/x86/mm/p2m.c @@ -34,6 +34,7 @@ #include #include #include +#include /* Debugging and auditing of the P2M code? */ #define P2M_AUDIT 0 @@ -75,7 +76,7 @@ boolean_param("hap_2mb", opt_hap_2mb); #define SUPERPAGE_PAGES (1UL << 9) #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0) -static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) +unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn) { unsigned long flags; #ifdef __x86_64__ @@ -121,9 +122,9 @@ static void audit_p2m(struct p2m_domain // Find the next level's P2M entry, checking for out-of-range gfn's... // Returns NULL on error. // -static l1_pgentry_t * +l1_pgentry_t * p2m_find_entry(void *table, unsigned long *gfn_remainder, - unsigned long gfn, u32 shift, u32 max) + unsigned long gfn, uint32_t shift, uint32_t max) { u32 index; @@ -224,20 +225,17 @@ p2m_next_level(struct p2m_domain *p2m, m switch ( type ) { case PGT_l3_page_table: - paging_write_p2m_entry(p2m->domain, gfn, - p2m_entry, *table_mfn, new_entry, 4); + p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 4); break; case PGT_l2_page_table: #if CONFIG_PAGING_LEVELS == 3 /* for PAE mode, PDPE only has PCD/PWT/P bits available */ new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), _PAGE_PRESENT); #endif - paging_write_p2m_entry(p2m->domain, gfn, - p2m_entry, *table_mfn, new_entry, 3); + p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3); break; case PGT_l1_page_table: - paging_write_p2m_entry(p2m->domain, gfn, - p2m_entry, *table_mfn, new_entry, 2); + p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 2); break; default: BUG(); @@ -264,14 +262,13 @@ p2m_next_level(struct p2m_domain *p2m, m for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ ) { new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags); - paging_write_p2m_entry(p2m->domain, gfn, - l1_entry+i, *table_mfn, new_entry, 2); + p2m->write_p2m_entry(p2m, gfn, + l1_entry+i, *table_mfn, new_entry, 2); } unmap_domain_page(l1_entry); new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), __PAGE_HYPERVISOR|_PAGE_USER); //disable PSE - paging_write_p2m_entry(p2m->domain, gfn, - p2m_entry, *table_mfn, new_entry, 3); + p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3); } @@ -298,15 +295,15 @@ p2m_next_level(struct p2m_domain *p2m, m for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ ) { new_entry = l1e_from_pfn(pfn + i, flags); - paging_write_p2m_entry(p2m->domain, gfn, - l1_entry+i, *table_mfn, new_entry, 1); + p2m->write_p2m_entry(p2m, gfn, + l1_entry+i, *table_mfn, new_entry, 1); } unmap_domain_page(l1_entry); new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)), __PAGE_HYPERVISOR|_PAGE_USER); - paging_write_p2m_entry(p2m->domain, gfn, - p2m_entry, *table_mfn, new_entry, 2); + p2m->write_p2m_entry(p2m, gfn, + p2m_entry, *table_mfn, new_entry, 2); } *table_mfn = _mfn(l1e_get_pfn(*p2m_entry)); @@ -1369,8 +1366,7 @@ p2m_set_entry(struct p2m_domain *p2m, un p2m_type_to_flags(p2mt, mfn) | _PAGE_PSE) : l3e_empty(); entry_content.l1 = l3e_content.l3; - paging_write_p2m_entry(p2m->domain, gfn, p2m_entry, - table_mfn, entry_content, 3); + p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 3); /* NB: paging_write_p2m_entry() handles tlb flushes properly */ /* Free old intermediate tables if necessary */ @@ -1410,8 +1406,7 @@ p2m_set_entry(struct p2m_domain *p2m, un entry_content = 
l1e_empty(); /* level 1 entry */ - paging_write_p2m_entry(p2m->domain, gfn, p2m_entry, - table_mfn, entry_content, 1); + p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 1); /* NB: paging_write_p2m_entry() handles tlb flushes properly */ } else if ( page_order == 9 ) @@ -1440,8 +1435,7 @@ p2m_set_entry(struct p2m_domain *p2m, un l2e_content = l2e_empty(); entry_content.l1 = l2e_content.l2; - paging_write_p2m_entry(p2m->domain, gfn, p2m_entry, - table_mfn, entry_content, 2); + p2m->write_p2m_entry(p2m, gfn, p2m_entry, table_mfn, entry_content, 2); /* NB: paging_write_p2m_entry() handles tlb flushes properly */ /* Free old intermediate tables if necessary */ @@ -1806,10 +1800,13 @@ static void p2m_initialise(struct domain p2m->domain = d; p2m->default_access = p2m_access_rwx; + p2m->cr3 = CR3_EADDR; p2m->set_entry = p2m_set_entry; p2m->get_entry = p2m_gfn_to_mfn; p2m->get_entry_current = p2m_gfn_to_mfn_current; p2m->change_entry_type_global = p2m_change_type_global; + p2m->write_p2m_entry = paging_write_p2m_entry; + cpus_clear(p2m->p2m_dirty_cpumask); if ( hap_enabled(d) && (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) ) ept_p2m_init(d); @@ -1817,6 +1814,25 @@ static void p2m_initialise(struct domain return; } +static int +p2m_init_nestedp2m(struct domain *d) +{ + uint8_t i; + struct p2m_domain *p2m; + + nestedp2m_lock_init(d); + for (i = 0; i < MAX_NESTEDP2M; i++) { + d->arch.nested_p2m[i] = p2m = xmalloc(struct p2m_domain); + if (p2m == NULL) + return -ENOMEM; + p2m_initialise(d, p2m); + p2m->get_entry_current = p2m->get_entry; + p2m->write_p2m_entry = nestedp2m_write_p2m_entry; + } + + return 0; +} + int p2m_init(struct domain *d) { struct p2m_domain *p2m; @@ -1825,8 +1841,12 @@ int p2m_init(struct domain *d) if ( p2m == NULL ) return -ENOMEM; p2m_initialise(d, p2m); - - return 0; + + /* Must initialise nestedp2m unconditionally + * since nestedhvm_enabled(d) returns false here. + * (p2m_init runs too early for HVM_PARAM_* options) + */ + return p2m_init_nestedp2m(d); } void p2m_change_entry_type_global(struct p2m_domain *p2m, @@ -1919,6 +1939,9 @@ int p2m_alloc_table(struct p2m_domain *p p2m_invalid, p2m->default_access) ) goto error; + if (p2m_is_nestedp2m(p2m)) + goto nesteddone; + /* Copy all existing mappings from the page list and m2p */ spin_lock(&p2m->domain->page_alloc_lock); page_list_for_each(page, &p2m->domain->page_list) @@ -1940,6 +1963,7 @@ int p2m_alloc_table(struct p2m_domain *p } spin_unlock(&p2m->domain->page_alloc_lock); + nesteddone: P2M_PRINTK("p2m table initialised (%u pages)\n", page_count); p2m_unlock(p2m); return 0; @@ -1966,6 +1990,9 @@ void p2m_teardown(struct p2m_domain *p2m mfn_t mfn; #endif + if (p2m == NULL) + return; + p2m_lock(p2m); #ifdef __x86_64__ @@ -1984,11 +2011,26 @@ void p2m_teardown(struct p2m_domain *p2m p2m_unlock(p2m); } +static void p2m_teardown_nestedp2m(struct domain *d) +{ + uint8_t i; + + for (i = 0; i < MAX_NESTEDP2M; i++) { + xfree(d->arch.nested_p2m[i]); + d->arch.nested_p2m[i] = NULL; + } +} + void p2m_final_teardown(struct domain *d) { /* Iterate over all p2m tables per domain */ xfree(d->arch.p2m); d->arch.p2m = NULL; + + /* We must teardown unconditionally because + * we initialise them unconditionally. 
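
Besides the nested pool lifecycle handled here, the p2m.c hunks above route every entry write through the new per-p2m write_p2m_entry hook instead of calling paging_write_p2m_entry() directly; that indirection is what lets a nested p2m install nestedp2m_write_p2m_entry(). A self-contained toy of that dispatch (illustrative names and output, not Xen code):

#include <stdio.h>

/* Toy model of the write_p2m_entry hook: the shared p2m_set_entry()
 * path writes through a per-p2m function pointer, so host p2ms keep
 * the old paging_write_p2m_entry() behaviour while nested p2ms get a
 * handler that flushes nested guest ASIDs instead. */
struct toy_p2m {
    void (*write_entry)(struct toy_p2m *p2m, int gfn, int val);
};

static void host_write(struct toy_p2m *p2m, int gfn, int val)
{
    (void)p2m;
    printf("host p2m: gfn %d := %d (may flush host TLBs)\n", gfn, val);
}

static void nested_write(struct toy_p2m *p2m, int gfn, int val)
{
    (void)p2m;
    printf("nested p2m: gfn %d := %d (flush nested guest ASIDs)\n", gfn, val);
}

int main(void)
{
    struct toy_p2m host   = { .write_entry = host_write };
    struct toy_p2m nested = { .write_entry = nested_write };

    host.write_entry(&host, 42, 1);     /* like p2m->write_p2m_entry(...) */
    nested.write_entry(&nested, 42, 1);
    return 0;
}
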
+ */ + p2m_teardown_nestedp2m(d); } #if P2M_AUDIT @@ -2573,9 +2615,9 @@ void p2m_change_type_global(struct p2m_d gfn = get_gpfn_from_mfn(mfn); flags = p2m_type_to_flags(nt, _mfn(mfn)); l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE); - paging_write_p2m_entry(p2m->domain, gfn, - (l1_pgentry_t *)&l3e[i3], - l3mfn, l1e_content, 3); + p2m->write_p2m_entry(p2m, gfn, + (l1_pgentry_t *)&l3e[i3], + l3mfn, l1e_content, 3); continue; } @@ -2604,9 +2646,9 @@ void p2m_change_type_global(struct p2m_d * L2_PAGETABLE_ENTRIES) * L1_PAGETABLE_ENTRIES; flags = p2m_type_to_flags(nt, _mfn(mfn)); l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE); - paging_write_p2m_entry(p2m->domain, gfn, - (l1_pgentry_t *)&l2e[i2], - l2mfn, l1e_content, 2); + p2m->write_p2m_entry(p2m, gfn, + (l1_pgentry_t *)&l2e[i2], + l2mfn, l1e_content, 2); continue; } @@ -2628,8 +2670,8 @@ void p2m_change_type_global(struct p2m_d /* create a new 1le entry with the new type */ flags = p2m_type_to_flags(nt, _mfn(mfn)); l1e_content = l1e_from_pfn(mfn, flags); - paging_write_p2m_entry(p2m->domain, gfn, &l1e[i1], - l1mfn, l1e_content, 1); + p2m->write_p2m_entry(p2m, gfn, &l1e[i1], + l1mfn, l1e_content, 1); } unmap_domain_page(l1e); } @@ -3048,6 +3090,182 @@ void p2m_mem_access_resume(struct p2m_do } #endif /* __x86_64__ */ +static struct p2m_domain * +p2m_getlru_nestedp2m(struct domain *d, struct p2m_domain *p2m) +{ + int i, lru_index = -1; + struct p2m_domain *lrup2m, *tmp; + + if (p2m == NULL) { + lru_index = MAX_NESTEDP2M - 1; + lrup2m = d->arch.nested_p2m[lru_index]; + } else { + lrup2m = p2m; + for (i = 0; i < MAX_NESTEDP2M; i++) { + if (d->arch.nested_p2m[i] == p2m) { + lru_index = i; + break; + } + } + } + + ASSERT(lru_index >= 0); + if (lru_index == 0) { + return lrup2m; + } + + /* move the other's down the array "list" */ + for (i = lru_index - 1; i >= 0; i--) { + tmp = d->arch.nested_p2m[i]; + d->arch.nested_p2m[i+1] = tmp; + } + + /* make the entry the first one */ + d->arch.nested_p2m[0] = lrup2m; + + return lrup2m; +} + +static int +p2m_flush_locked(struct p2m_domain *p2m) +{ + ASSERT(p2m); + if (p2m->cr3 == CR3_EADDR) + /* Microoptimisation: p2m is already empty. + * => about 0.3% speedup of overall system performance. + */ + return 0; + + p2m_teardown(p2m); + p2m_initialise(p2m->domain, p2m); + p2m->get_entry_current = p2m->get_entry; + p2m->write_p2m_entry = nestedp2m_write_p2m_entry; + return p2m_alloc_table(p2m); +} + +void +p2m_flush(struct vcpu *v, struct p2m_domain *p2m) +{ + struct domain *d = p2m->domain; + + ASSERT(v->domain == d); + vcpu_nestedhvm(v).nv_p2m = NULL; + nestedp2m_lock(d); + BUG_ON(p2m_flush_locked(p2m) != 0); + hvm_asid_flush_vcpu(v); + nestedhvm_vmcx_flushtlb(p2m); + nestedp2m_unlock(d); +} + +void +p2m_flush_nestedp2m(struct domain *d) +{ + int i; + + nestedp2m_lock(d); + for (i = 0; i < MAX_NESTEDP2M; i++) { + struct p2m_domain *p2m = d->arch.nested_p2m[i]; + BUG_ON(p2m_flush_locked(p2m) != 0); + cpus_clear(p2m->p2m_dirty_cpumask); + } + nestedhvm_vmcx_flushtlbdomain(d); + nestedp2m_unlock(d); +} + +struct p2m_domain * +p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3) +{ + /* Use volatile to prevent gcc to cache nv->nv_p2m in a cpu register as + * this may change within the loop by an other (v)cpu. 
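
p2m_getlru_nestedp2m() above keeps the nested_p2m[] array ordered by recency: the slot just used moves to index 0, so the last index is always the eviction candidate. A standalone model of that move-to-front step (illustrative names; plain integers stand in for struct p2m_domain pointers):

#include <stdio.h>

#define TOY_MAX_NESTEDP2M 10

/* Shift everything before lru_index down one place and put the slot
 * that was just used at the front, mirroring the loop in the hunk. */
static void move_to_front(int *slots, int lru_index)
{
    int hit = slots[lru_index];

    for (int i = lru_index - 1; i >= 0; i--)
        slots[i + 1] = slots[i];
    slots[0] = hit;
}

int main(void)
{
    int slots[TOY_MAX_NESTEDP2M] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };

    move_to_front(slots, 4);          /* slot 4 was just used */
    for (int i = 0; i < TOY_MAX_NESTEDP2M; i++)
        printf("%d ", slots[i]);      /* 4 0 1 2 3 5 6 7 8 9  */
    printf("\n");
    return 0;
}
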
+ */ + volatile struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct domain *d; + struct p2m_domain *p2m; + int i, rv; + + if (cr3 == 0 || cr3 == CR3_EADDR) + cr3 = v->arch.hvm_vcpu.guest_cr[3]; + + if (nv->nv_flushp2m && nv->nv_p2m) { + nv->nv_p2m = NULL; + } + + d = v->domain; + nestedp2m_lock(d); + for (i = 0; i < MAX_NESTEDP2M; i++) { + p2m = d->arch.nested_p2m[i]; + if ((p2m->cr3 != cr3 && p2m->cr3 != CR3_EADDR) || (p2m != nv->nv_p2m)) + continue; + + nv->nv_flushp2m = 0; + p2m_getlru_nestedp2m(d, p2m); + nv->nv_p2m = p2m; + if (p2m->cr3 == CR3_EADDR) + hvm_asid_flush_vcpu(v); + p2m->cr3 = cr3; + cpu_set(v->processor, p2m->p2m_dirty_cpumask); + nestedp2m_unlock(d); + return p2m; + } + + /* All p2m's are or were in use. Take the least recent used one, + * flush it and reuse. + */ + for (i = 0; i < MAX_NESTEDP2M; i++) { + p2m = p2m_getlru_nestedp2m(d, NULL); + rv = p2m_flush_locked(p2m); + if (rv == 0) + break; + } + nv->nv_p2m = p2m; + p2m->cr3 = cr3; + nv->nv_flushp2m = 0; + hvm_asid_flush_vcpu(v); + nestedhvm_vmcx_flushtlb(nv->nv_p2m); + cpu_set(v->processor, p2m->p2m_dirty_cpumask); + nestedp2m_unlock(d); + + return p2m; +} + +struct p2m_domain * +p2m_get_p2m(struct vcpu *v) +{ + if (!nestedhvm_is_n2(v)) + return p2m_get_hostp2m(v->domain); + + return p2m_get_nestedp2m(v, nhvm_vcpu_hostcr3(v)); +} + +unsigned long paging_gva_to_gfn(struct vcpu *v, + unsigned long va, + uint32_t *pfec) +{ + struct p2m_domain *hostp2m = p2m_get_hostp2m(v->domain); + const struct paging_mode *hostmode = paging_get_hostmode(v); + + if ( is_hvm_domain(v->domain) + && paging_mode_hap(v->domain) + && nestedhvm_is_n2(v) ) + { + unsigned long gfn; + struct p2m_domain *p2m; + const struct paging_mode *mode; + uint64_t ncr3 = nhvm_vcpu_hostcr3(v); + + /* translate l2 guest va into l2 guest gfn */ + p2m = p2m_get_nestedp2m(v, ncr3); + mode = paging_get_nestedmode(v); + gfn = mode->gva_to_gfn(v, p2m, va, pfec); + + /* translate l2 guest gfn into l1 guest gfn */ + return hostmode->p2m_ga_to_gfn(v, hostp2m, ncr3, + gfn << PAGE_SHIFT, pfec); + } + + return hostmode->gva_to_gfn(v, hostp2m, va, pfec); +} + /* * Local variables: * mode: C diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/paging.c --- a/xen/arch/x86/mm/paging.c +++ b/xen/arch/x86/mm/paging.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -851,21 +852,58 @@ void paging_dump_vcpu_info(struct vcpu * printk(" paging assistance: "); if ( paging_mode_shadow(v->domain) ) { - if ( v->arch.paging.mode ) + if ( paging_get_hostmode(v) ) printk("shadowed %u-on-%u\n", - v->arch.paging.mode->guest_levels, - v->arch.paging.mode->shadow.shadow_levels); + paging_get_hostmode(v)->guest_levels, + paging_get_hostmode(v)->shadow.shadow_levels); else printk("not shadowed\n"); } - else if ( paging_mode_hap(v->domain) && v->arch.paging.mode ) + else if ( paging_mode_hap(v->domain) && paging_get_hostmode(v) ) printk("hap, %u levels\n", - v->arch.paging.mode->guest_levels); + paging_get_hostmode(v)->guest_levels); else printk("none\n"); } } +const struct paging_mode *paging_get_mode(struct vcpu *v) +{ + if (!nestedhvm_is_n2(v)) + return paging_get_hostmode(v); + + return paging_get_nestedmode(v); +} + +extern const struct paging_mode *hap_paging_get_mode(struct vcpu *); + +void paging_update_nestedmode(struct vcpu *v) +{ + ASSERT(nestedhvm_enabled(v->domain)); + if (nestedhvm_paging_mode_hap(v)) + /* nested-on-nested */ + v->arch.paging.nestedmode = hap_paging_get_mode(v); + else + /* TODO: shadow-on-shadow */ + v->arch.paging.nestedmode = NULL; 
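
p2m_get_p2m() and paging_get_mode() above both reduce to the nestedhvm_is_n2() test: use the host p2m and host paging mode unless the vcpu is currently running an L2 guest under HAP with no VM switch in progress. A standalone decision-table sketch of that test (struct and field names are illustrative, not the Xen types):

#include <stdio.h>

struct vcpu_state {
    int nested_enabled;         /* nestedhvm_enabled(d)              */
    int in_guestmode;           /* nestedhvm_vcpu_in_guestmode(v)    */
    int paging_mode_hap;        /* nestedhvm_paging_mode_hap(v)      */
    int vmswitch_in_progress;   /* nestedhvm_vmswitch_in_progress(v) */
};

/* Same predicate as nestedhvm_is_n2(): all four conditions must line up. */
static int is_n2(const struct vcpu_state *s)
{
    return s->nested_enabled && s->in_guestmode &&
           s->paging_mode_hap && !s->vmswitch_in_progress;
}

int main(void)
{
    struct vcpu_state l1 = { 1, 0, 1, 0 };   /* running the L1 guest */
    struct vcpu_state l2 = { 1, 1, 1, 0 };   /* running an L2 guest  */

    printf("L1: use %s p2m\n", is_n2(&l1) ? "nested" : "host");
    printf("L2: use %s p2m\n", is_n2(&l2) ? "nested" : "host");
    return 0;
}
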
+} + +void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, + l1_pgentry_t *p, mfn_t table_mfn, + l1_pgentry_t new, unsigned int level) +{ + struct domain *d = p2m->domain; + struct vcpu *v = current; + if ( v->domain != d ) + v = d->vcpu ? d->vcpu[0] : NULL; + if ( likely(v && paging_mode_enabled(d) && paging_get_hostmode(v) != NULL) ) + { + return paging_get_hostmode(v)->write_p2m_entry(v, gfn, p, table_mfn, + new, level); + } + else + safe_write_pte(p, new); +} /* * Local variables: diff -r cfde4384be14 -r 28809c365861 xen/arch/x86/mm/shadow/multi.c --- a/xen/arch/x86/mm/shadow/multi.c +++ b/xen/arch/x86/mm/shadow/multi.c @@ -837,22 +837,6 @@ shadow_write_entries(void *d, void *s, i if ( map != NULL ) sh_unmap_domain_page(map); } -static inline int -perms_strictly_increased(u32 old_flags, u32 new_flags) -/* Given the flags of two entries, are the new flags a strict - * increase in rights over the old ones? */ -{ - u32 of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT); - u32 nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT); - /* Flip the NX bit, since it's the only one that decreases rights; - * we calculate as if it were an "X" bit. */ - of ^= _PAGE_NX_BIT; - nf ^= _PAGE_NX_BIT; - /* If the changed bits are all set in the new flags, then rights strictly - * increased between old and new. */ - return ((of | (of ^ nf)) == nf); -} - /* type is only used to distinguish grant map pages from ordinary RAM * i.e. non-p2m_is_grant() pages are treated as p2m_ram_rw. */ static int inline @@ -3768,7 +3752,8 @@ sh_invlpg(struct vcpu *v, unsigned long static unsigned long -sh_gva_to_gfn(struct vcpu *v, unsigned long va, uint32_t *pfec) +sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m, + unsigned long va, uint32_t *pfec) /* Called to translate a guest virtual address to what the *guest* * pagetables would map it to. */ { @@ -4820,7 +4805,7 @@ static mfn_t emulate_gva_to_mfn(struct v struct p2m_domain *p2m = p2m_get_hostp2m(v->domain); /* Translate the VA to a GFN */ - gfn = sh_gva_to_gfn(v, vaddr, &pfec); + gfn = sh_gva_to_gfn(v, p2m, vaddr, &pfec); if ( gfn == INVALID_GFN ) { if ( is_hvm_vcpu(v) ) diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/domain.h --- a/xen/include/asm-x86/domain.h +++ b/xen/include/asm-x86/domain.h @@ -210,6 +210,8 @@ struct paging_domain { struct paging_vcpu { /* Pointers to mode-specific entry points. */ const struct paging_mode *mode; + /* Nested Virtualization: paging mode of nested guest */ + const struct paging_mode *nestedmode; /* HVM guest: last emulate was to a pagetable */ unsigned int last_write_was_pt:1; /* HVM guest: last write emulation succeeds */ @@ -225,6 +227,7 @@ struct paging_vcpu { #define MAX_CPUID_INPUT 40 typedef xen_domctl_cpuid_t cpuid_input_t; +#define MAX_NESTEDP2M 10 struct p2m_domain; struct time_scale { int shift; @@ -258,6 +261,12 @@ struct arch_domain struct paging_domain paging; struct p2m_domain *p2m; + /* nestedhvm: translate l2 guest physical to host physical */ + struct p2m_domain *nested_p2m[MAX_NESTEDP2M]; + spinlock_t nested_p2m_lock; + int nested_p2m_locker; + const char *nested_p2m_function; + /* NB. 
protected by d->event_lock and by irq_desc[irq].lock */ int *irq_pirq; int *pirq_irq; diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/hvm/hvm.h --- a/xen/include/asm-x86/hvm/hvm.h +++ b/xen/include/asm-x86/hvm/hvm.h @@ -374,12 +374,12 @@ static inline void hvm_set_info_guest(st int hvm_debug_op(struct vcpu *v, int32_t op); -bool_t hvm_hap_nested_page_fault(unsigned long gpa, - bool_t gla_valid, unsigned long gla, - bool_t access_valid, - bool_t access_r, - bool_t access_w, - bool_t access_x); +int hvm_hap_nested_page_fault(unsigned long gpa, + bool_t gla_valid, unsigned long gla, + bool_t access_valid, + bool_t access_r, + bool_t access_w, + bool_t access_x); #define hvm_msr_tsc_aux(v) ({ \ struct domain *__d = (v)->domain; \ diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/hvm/nestedhvm.h --- a/xen/include/asm-x86/hvm/nestedhvm.h +++ b/xen/include/asm-x86/hvm/nestedhvm.h @@ -60,4 +60,9 @@ unsigned long *nestedhvm_vcpu_iomap_get( #define nestedhvm_vmswitch_in_progress(v) \ (!!vcpu_nestedhvm((v)).nv_vmswitch_in_progress) +void nestedhvm_vmcx_flushtlb(struct p2m_domain *p2m); +void nestedhvm_vmcx_flushtlbdomain(struct domain *d); + +bool_t nestedhvm_is_n2(struct vcpu *v); + #endif /* _HVM_NESTEDHVM_H */ diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/p2m.h --- a/xen/include/asm-x86/p2m.h +++ b/xen/include/asm-x86/p2m.h @@ -199,7 +199,15 @@ struct p2m_domain { /* Shadow translated domain: p2m mapping */ pagetable_t phys_table; + /* Same as domain_dirty_cpumask but limited to + * this p2m and those physical cpus whose vcpu's are in + * guestmode. + */ + cpumask_t p2m_dirty_cpumask; + struct domain *domain; /* back pointer to domain */ +#define CR3_EADDR (~0ULL) + uint64_t cr3; /* to identify this p2m for re-use */ /* Pages used to construct the p2m */ struct page_list_head pages; @@ -223,6 +231,11 @@ struct p2m_domain { p2m_type_t ot, p2m_type_t nt); + void (*write_p2m_entry)(struct p2m_domain *p2m, + unsigned long gfn, l1_pgentry_t *p, + mfn_t table_mfn, l1_pgentry_t new, + unsigned int level); + /* Default P2M access type for each page in the the domain: new pages, * swapped in pages, cleared pages, and pages that are ambiquously * retyped get this access type. See definition of p2m_access_t. */ @@ -264,8 +277,26 @@ struct p2m_domain { /* get host p2m table */ #define p2m_get_hostp2m(d) ((d)->arch.p2m) +/* Get p2m table (re)usable for specified cr3. + * Automatically destroys and re-initializes a p2m if none found. + * If cr3 == 0 then v->arch.hvm_vcpu.guest_cr[3] is used. + */ +struct p2m_domain *p2m_get_nestedp2m(struct vcpu *v, uint64_t cr3); + +/* If vcpu is in host mode then behaviour matches p2m_get_hostp2m(). + * If vcpu is in guest mode then behaviour matches p2m_get_nestedp2m(). + */ +struct p2m_domain *p2m_get_p2m(struct vcpu *v); + +#define p2m_is_nestedp2m(p2m) ((p2m) != p2m_get_hostp2m((p2m->domain))) + #define p2m_get_pagetable(p2m) ((p2m)->phys_table) +/* Flushes specified p2m table */ +void p2m_flush(struct vcpu *v, struct p2m_domain *p2m); +/* Flushes all nested p2m tables */ +void p2m_flush_nestedp2m(struct domain *d); + /* * The P2M lock. This protects all updates to the p2m table. 
* Updates are expected to be safe against concurrent reads, @@ -307,6 +338,38 @@ struct p2m_domain { (current->processor == (_p2m)->locker) +#define nestedp2m_lock_init(_domain) \ + do { \ + spin_lock_init(&(_domain)->arch.nested_p2m_lock); \ + (_domain)->arch.nested_p2m_locker = -1; \ + (_domain)->arch.nested_p2m_function = "nobody"; \ + } while (0) + +#define nestedp2m_locked_by_me(_domain) \ + (current->processor == (_domain)->arch.nested_p2m_locker) + +#define nestedp2m_lock(_domain) \ + do { \ + if ( nestedp2m_locked_by_me(_domain) ) \ + { \ + printk("Error: p2m lock held by %s\n", \ + (_domain)->arch.nested_p2m_function); \ + BUG(); \ + } \ + spin_lock(&(_domain)->arch.nested_p2m_lock); \ + ASSERT((_domain)->arch.nested_p2m_locker == -1); \ + (_domain)->arch.nested_p2m_locker = current->processor; \ + (_domain)->arch.nested_p2m_function = __func__; \ + } while (0) + +#define nestedp2m_unlock(_domain) \ + do { \ + ASSERT(nestedp2m_locked_by_me(_domain)); \ + (_domain)->arch.nested_p2m_locker = -1; \ + (_domain)->arch.nested_p2m_function = "nobody"; \ + spin_unlock(&(_domain)->arch.nested_p2m_lock); \ + } while (0) + /* Extract the type from the PTE flags that store it */ static inline p2m_type_t p2m_flags_to_type(unsigned long flags) { @@ -424,11 +487,21 @@ static inline unsigned long mfn_to_gfn(s /* Init the datastructures for later use by the p2m code */ int p2m_init(struct domain *d); +/* PTE flags for various types of p2m entry */ +unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn); + /* Allocate a new p2m table for a domain. * * Returns 0 for success or -errno. */ int p2m_alloc_table(struct p2m_domain *p2m); +/* Find the next level's P2M entry, checking for out-of-range gfn's... + * Returns NULL on error. + */ +l1_pgentry_t * +p2m_find_entry(void *table, unsigned long *gfn_remainder, + unsigned long gfn, uint32_t shift, uint32_t max); + /* Return all the p2m resources to Xen. */ void p2m_teardown(struct p2m_domain *p2m); void p2m_final_teardown(struct domain *d); @@ -502,6 +575,8 @@ p2m_type_t p2m_change_type(struct p2m_do int set_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn); int clear_mmio_p2m_entry(struct p2m_domain *p2m, unsigned long gfn); +void nestedp2m_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, + l1_pgentry_t *p, mfn_t table_mfn, l1_pgentry_t new, unsigned int level); #ifdef __x86_64__ /* Modify p2m table for shared gfn */ diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/page.h --- a/xen/include/asm-x86/page.h +++ b/xen/include/asm-x86/page.h @@ -391,6 +391,23 @@ static inline uint32_t cacheattr_to_pte_ return ((cacheattr & 4) << 5) | ((cacheattr & 3) << 3); } +/* return true if permission increased */ +static inline bool_t +perms_strictly_increased(uint32_t old_flags, uint32_t new_flags) +/* Given the flags of two entries, are the new flags a strict + * increase in rights over the old ones? */ +{ + uint32_t of = old_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT); + uint32_t nf = new_flags & (_PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT); + /* Flip the NX bit, since it's the only one that decreases rights; + * we calculate as if it were an "X" bit. */ + of ^= _PAGE_NX_BIT; + nf ^= _PAGE_NX_BIT; + /* If the changed bits are all set in the new flags, then rights strictly + * increased between old and new. 
*/ + return ((of | (of ^ nf)) == nf); +} + #endif /* !__ASSEMBLY__ */ #define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & PAGE_MASK) diff -r cfde4384be14 -r 28809c365861 xen/include/asm-x86/paging.h --- a/xen/include/asm-x86/paging.h +++ b/xen/include/asm-x86/paging.h @@ -108,8 +108,14 @@ struct paging_mode { int (*page_fault )(struct vcpu *v, unsigned long va, struct cpu_user_regs *regs); int (*invlpg )(struct vcpu *v, unsigned long va); - unsigned long (*gva_to_gfn )(struct vcpu *v, unsigned long va, + unsigned long (*gva_to_gfn )(struct vcpu *v, + struct p2m_domain *p2m, + unsigned long va, uint32_t *pfec); + unsigned long (*p2m_ga_to_gfn )(struct vcpu *v, + struct p2m_domain *p2m, + unsigned long cr3, + paddr_t ga, uint32_t *pfec); void (*update_cr3 )(struct vcpu *v, int do_locking); void (*update_paging_modes )(struct vcpu *v); void (*write_p2m_entry )(struct vcpu *v, unsigned long gfn, @@ -219,6 +225,10 @@ void paging_final_teardown(struct domain * creation. */ int paging_enable(struct domain *d, u32 mode); +#define paging_get_hostmode(v) ((v)->arch.paging.mode) +#define paging_get_nestedmode(v) ((v)->arch.paging.nestedmode) +const struct paging_mode *paging_get_mode(struct vcpu *v); +void paging_update_nestedmode(struct vcpu *v); /* Page fault handler * Called from pagefault handler in Xen, and from the HVM trap handlers @@ -233,7 +243,7 @@ static inline int paging_fault(unsigned long va, struct cpu_user_regs *regs) { struct vcpu *v = current; - return v->arch.paging.mode->page_fault(v, va, regs); + return paging_get_hostmode(v)->page_fault(v, va, regs); } /* Handle invlpg requests on vcpus. @@ -241,7 +251,7 @@ paging_fault(unsigned long va, struct cp * or 0 if it's safe not to do so. */ static inline int paging_invlpg(struct vcpu *v, unsigned long va) { - return v->arch.paging.mode->invlpg(v, va); + return paging_get_hostmode(v)->invlpg(v, va); } /* Translate a guest virtual address to the frame number that the @@ -251,11 +261,30 @@ static inline int paging_invlpg(struct v * walking the tables. The caller should set the PFEC_page_present bit * in pfec[0]; in the failure case, that bit will be cleared if appropriate. */ #define INVALID_GFN (-1UL) -static inline unsigned long paging_gva_to_gfn(struct vcpu *v, - unsigned long va, - uint32_t *pfec) +unsigned long paging_gva_to_gfn(struct vcpu *v, + unsigned long va, + uint32_t *pfec); + +/* Translates a guest virtual address to guest physical address + * where the specified cr3 is translated to host physical address + * using the specified p2m table. + * This allows to do page walks in the guest or even in the nested guest. + * It returns the guest's gfn or the nested guest's gfn. + * Use 'paddr_t' for the guest address so it won't overflow when + * guest or nested guest is in 32bit PAE mode. + */ +static inline unsigned long paging_p2m_ga_to_gfn(struct vcpu *v, + struct p2m_domain *p2m, + const struct paging_mode *mode, + unsigned long cr3, + paddr_t ga, + uint32_t *pfec) { - return v->arch.paging.mode->gva_to_gfn(v, va, pfec); + if ( is_hvm_domain(v->domain) && paging_mode_hap(v->domain) ) + return mode->p2m_ga_to_gfn(v, p2m, cr3, ga, pfec); + + /* shadow paging */ + return paging_gva_to_gfn(v, ga, pfec); } /* Update all the things that are derived from the guest's CR3. 
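
The perms_strictly_increased() helper moved into page.h above decides whether a PTE update only grants additional rights. A quick standalone check of its formula with concrete flag values (the P/RW/USER bits match the x86 PTE encoding; the NX bit is given an arbitrary free bit here since only the formula matters):

#include <stdint.h>
#include <stdio.h>

#define F_P   0x1   /* present            */
#define F_RW  0x2   /* writable           */
#define F_US  0x4   /* user-accessible    */
#define F_NX  0x8   /* no-execute (toy)   */

static int strictly_increased(uint32_t old_flags, uint32_t new_flags)
{
    uint32_t of = old_flags & (F_P | F_RW | F_US | F_NX);
    uint32_t nf = new_flags & (F_P | F_RW | F_US | F_NX);

    /* Flip NX so that every considered bit means "grants a right". */
    of ^= F_NX;
    nf ^= F_NX;

    /* All bits that changed must be set in the new flags. */
    return (of | (of ^ nf)) == nf;
}

int main(void)
{
    /* read-only -> read/write: strict increase */
    printf("%d\n", strictly_increased(F_P, F_P | F_RW));        /* 1 */
    /* gaining RW but losing USER: not a strict increase */
    printf("%d\n", strictly_increased(F_P | F_US, F_P | F_RW)); /* 0 */
    /* setting NX removes execute rights: not a strict increase */
    printf("%d\n", strictly_increased(F_P, F_P | F_NX));        /* 0 */
    return 0;
}
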
@@ -263,7 +292,7 @@ static inline unsigned long paging_gva_t * as the value to load into the host CR3 to schedule this vcpu */ static inline void paging_update_cr3(struct vcpu *v) { - v->arch.paging.mode->update_cr3(v, 1); + paging_get_hostmode(v)->update_cr3(v, 1); } /* Update all the things that are derived from the guest's CR0/CR3/CR4. @@ -271,7 +300,7 @@ static inline void paging_update_cr3(str * has changed, and when bringing up a VCPU for the first time. */ static inline void paging_update_paging_modes(struct vcpu *v) { - v->arch.paging.mode->update_paging_modes(v); + paging_get_hostmode(v)->update_paging_modes(v); } @@ -283,7 +312,7 @@ static inline int paging_write_guest_ent { if ( unlikely(paging_mode_enabled(v->domain) && v->arch.paging.mode != NULL) ) - return v->arch.paging.mode->write_guest_entry(v, p, new, gmfn); + return paging_get_hostmode(v)->write_guest_entry(v, p, new, gmfn); else return (!__copy_to_user(p, &new, sizeof(new))); } @@ -299,7 +328,7 @@ static inline int paging_cmpxchg_guest_e { if ( unlikely(paging_mode_enabled(v->domain) && v->arch.paging.mode != NULL) ) - return v->arch.paging.mode->cmpxchg_guest_entry(v, p, old, new, gmfn); + return paging_get_hostmode(v)->cmpxchg_guest_entry(v, p, old, new, gmfn); else return (!cmpxchg_user(p, *old, new)); } @@ -327,21 +356,11 @@ static inline void safe_write_pte(l1_pge * a pointer to the entry to be written, the MFN in which the entry resides, * the new contents of the entry, and the level in the p2m tree at which * we are writing. */ -static inline void paging_write_p2m_entry(struct domain *d, unsigned long gfn, - l1_pgentry_t *p, mfn_t table_mfn, - l1_pgentry_t new, unsigned int level) -{ - struct vcpu *v = current; - if ( v->domain != d ) - v = d->vcpu ? d->vcpu[0] : NULL; - if ( likely(v && paging_mode_enabled(d) && v->arch.paging.mode != NULL) ) - { - return v->arch.paging.mode->write_p2m_entry(v, gfn, p, table_mfn, - new, level); - } - else - safe_write_pte(p, new); -} +struct p2m_domain; + +void paging_write_p2m_entry(struct p2m_domain *p2m, unsigned long gfn, + l1_pgentry_t *p, mfn_t table_mfn, + l1_pgentry_t new, unsigned int level); /* Called from the guest to indicate that the a process is being * torn down and its pagetables will soon be discarded */ @@ -362,7 +381,7 @@ guest_map_l1e(struct vcpu *v, unsigned l l2_pgentry_t l2e; if ( unlikely(paging_mode_translate(v->domain)) ) - return v->arch.paging.mode->guest_map_l1e(v, addr, gl1mfn); + return paging_get_hostmode(v)->guest_map_l1e(v, addr, gl1mfn); /* Find this l1e and its enclosing l1mfn in the linear map */ if ( __copy_from_user(&l2e, @@ -398,7 +417,7 @@ guest_get_eff_l1e(struct vcpu *v, unsign return; } - v->arch.paging.mode->guest_get_eff_l1e(v, addr, eff_l1e); + paging_get_hostmode(v)->guest_get_eff_l1e(v, addr, eff_l1e); } /* Read the guest's l1e that maps this address, from the kernel-mode