# HG changeset patch # User cegger # Date 1299670565 -3600 Implement SVM specific part for Nested Virtualization Signed-off-by: Christoph Egger diff -r e842b80fcee0 -r 3df1f127bd4f xen/arch/x86/hvm/svm/Makefile --- a/xen/arch/x86/hvm/svm/Makefile +++ b/xen/arch/x86/hvm/svm/Makefile @@ -2,6 +2,8 @@ obj-y += asid.o obj-y += emulate.o obj-y += entry.o obj-y += intr.o +obj-y += nestedsvm.o obj-y += svm.o +obj-y += svmdebug.o obj-y += vmcb.o obj-y += vpmu.o diff -r e842b80fcee0 -r 3df1f127bd4f xen/arch/x86/hvm/svm/emulate.c --- a/xen/arch/x86/hvm/svm/emulate.c +++ b/xen/arch/x86/hvm/svm/emulate.c @@ -102,6 +102,11 @@ MAKE_INSTR(INT3, 1, 0xcc); MAKE_INSTR(RDTSC, 2, 0x0f, 0x31); MAKE_INSTR(PAUSE, 1, 0x90); MAKE_INSTR(XSETBV, 3, 0x0f, 0x01, 0xd1); +MAKE_INSTR(VMRUN, 3, 0x0f, 0x01, 0xd8); +MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda); +MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb); +MAKE_INSTR(STGI, 3, 0x0f, 0x01, 0xdc); +MAKE_INSTR(CLGI, 3, 0x0f, 0x01, 0xdd); static const u8 *opc_bytes[INSTR_MAX_COUNT] = { @@ -116,6 +121,11 @@ static const u8 *opc_bytes[INSTR_MAX_COU [INSTR_RDTSC] = OPCODE_RDTSC, [INSTR_PAUSE] = OPCODE_PAUSE, [INSTR_XSETBV] = OPCODE_XSETBV, + [INSTR_VMRUN] = OPCODE_VMRUN, + [INSTR_VMLOAD] = OPCODE_VMLOAD, + [INSTR_VMSAVE] = OPCODE_VMSAVE, + [INSTR_STGI] = OPCODE_STGI, + [INSTR_CLGI] = OPCODE_CLGI, }; static int fetch(struct vcpu *v, u8 *buf, unsigned long addr, int len) diff -r e842b80fcee0 -r 3df1f127bd4f xen/arch/x86/hvm/svm/entry.S --- a/xen/arch/x86/hvm/svm/entry.S +++ b/xen/arch/x86/hvm/svm/entry.S @@ -54,6 +54,7 @@ ENTRY(svm_asm_do_resume) call svm_intr_assist + call_with_regs(nsvm_vcpu_switch) get_current(bx) CLGI diff -r e842b80fcee0 -r 3df1f127bd4f xen/arch/x86/hvm/svm/nestedsvm.c --- /dev/null +++ b/xen/arch/x86/hvm/svm/nestedsvm.c @@ -0,0 +1,1279 @@ +/* + * nestedsvm.c: Nested Virtualization + * Copyright (c) 2011, Advanced Micro Devices, Inc + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions 
of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include /* paging_mode_hap */ + +static int +nestedsvm_vmcb_isvalid(struct vcpu *v, uint64_t vmcxaddr) +{ + if ( !hvm_svm_enabled(v) || hvm_guest_x86_mode(v) < 2 ) + return 0; + + /* Maximum valid physical address. + * See AMD BKDG for HSAVE_PA MSR. + */ + if ( vmcxaddr > 0xfd00000000ULL ) + return 0; + + return 1; +} + +int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr) +{ + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + + if (nv->nv_vvmcx != NULL && nv->nv_vvmcxaddr != vmcbaddr) { + ASSERT(nv->nv_vvmcx != NULL); + ASSERT(nv->nv_vvmcxaddr != VMCX_EADDR); + hvm_unmap_guest_frame(nv->nv_vvmcx); + nv->nv_vvmcx = NULL; + nv->nv_vvmcxaddr = VMCX_EADDR; + } + + if (nv->nv_vvmcx == NULL) { + nv->nv_vvmcx = hvm_map_guest_frame_rw(vmcbaddr >> PAGE_SHIFT); + if (nv->nv_vvmcx == NULL) + return 0; + nv->nv_vvmcxaddr = vmcbaddr; + } + + return 1; +} + +/* Interface methods */ +int nsvm_vcpu_initialise(struct vcpu *v) +{ + void *msrpm; + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct nestedsvm *svm = &vcpu_nestedsvm(v); + + msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0); + svm->ns_cached_msrpm = msrpm; + if (msrpm == NULL) + goto err; + memset(msrpm, 0x0, MSRPM_SIZE); + + msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0); + svm->ns_merged_msrpm = msrpm; + if (msrpm == NULL) + goto err; + memset(msrpm, 
0x0, MSRPM_SIZE); + + nv->nv_n2vmcx = alloc_vmcb(); + if (nv->nv_n2vmcx == NULL) + goto err; + nv->nv_n2vmcx_pa = virt_to_maddr(nv->nv_n2vmcx); + + return 0; + +err: + nsvm_vcpu_destroy(v); + return -ENOMEM; +} + +int nsvm_vcpu_destroy(struct vcpu *v) +{ + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct nestedsvm *svm = &vcpu_nestedsvm(v); + + if (svm->ns_cached_msrpm) { + free_xenheap_pages(svm->ns_cached_msrpm, + get_order_from_bytes(MSRPM_SIZE)); + svm->ns_cached_msrpm = NULL; + } + if (svm->ns_merged_msrpm) { + free_xenheap_pages(svm->ns_merged_msrpm, + get_order_from_bytes(MSRPM_SIZE)); + svm->ns_merged_msrpm = NULL; + } + if (nv->nv_n2vmcx) { + free_vmcb(nv->nv_n2vmcx); + nv->nv_n2vmcx = NULL; + nv->nv_n2vmcx_pa = VMCX_EADDR; + } + if (svm->ns_iomap) + svm->ns_iomap = NULL; + + return 0; +} + +int nsvm_vcpu_reset(struct vcpu *v) +{ + struct nestedsvm *svm = &vcpu_nestedsvm(v); + + svm->ns_msr_hsavepa = VMCX_EADDR; + svm->ns_ovvmcb_pa = VMCX_EADDR; + + svm->ns_cr_intercepts = 0; + svm->ns_dr_intercepts = 0; + svm->ns_exception_intercepts = 0; + svm->ns_general1_intercepts = 0; + svm->ns_general2_intercepts = 0; + svm->ns_lbr_control.bytes = 0; + + svm->ns_hap_enabled = 0; + svm->ns_vmcb_guestcr3 = 0; + svm->ns_vmcb_hostcr3 = 0; + svm->ns_guest_asid = 0; + svm->ns_hostflags.bytes = 0; + svm->ns_vmexit.exitinfo1 = 0; + svm->ns_vmexit.exitinfo2 = 0; + + if (svm->ns_iomap) + svm->ns_iomap = NULL; + + return 0; +} + +static int nsvm_vcpu_hostsave(struct vcpu *v, unsigned int inst_len) +{ + struct nestedsvm *svm = &vcpu_nestedsvm(v); + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct vmcb_struct *n1vmcb; + + n1vmcb = nv->nv_n1vmcx; + ASSERT(n1vmcb != NULL); + + n1vmcb->rip += inst_len; + + /* Remember the host interrupt flag */ + svm->ns_hostflags.fields.rflagsif = + (n1vmcb->rflags & X86_EFLAGS_IF) ? 
1 : 0; + + return 0; +} + +int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs) +{ + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct vmcb_struct *n1vmcb, *n2vmcb; + int rc; + + n1vmcb = nv->nv_n1vmcx; + n2vmcb = nv->nv_n2vmcx; + ASSERT(n1vmcb != NULL); + ASSERT(n2vmcb != NULL); + + /* nsvm_vmcb_prepare4vmexit() already saved register values + * handled by VMSAVE/VMLOAD into n1vmcb directly. + */ + + /* switch vmcb to l1 guest's vmcb */ + v->arch.hvm_svm.vmcb = n1vmcb; + v->arch.hvm_svm.vmcb_pa = nv->nv_n1vmcx_pa; + + /* EFER */ + v->arch.hvm_vcpu.guest_efer = n1vmcb->_efer; + rc = hvm_set_efer(n1vmcb->_efer); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc); + + /* CR4 */ + v->arch.hvm_vcpu.guest_cr[4] = n1vmcb->_cr4; + rc = hvm_set_cr4(n1vmcb->_cr4); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc); + + /* CR0 */ + v->arch.hvm_vcpu.guest_cr[0] = n1vmcb->_cr0 | X86_CR0_PE; + n1vmcb->rflags &= ~X86_EFLAGS_VM; + rc = hvm_set_cr0(n1vmcb->_cr0 | X86_CR0_PE); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc); + + /* CR2 */ + v->arch.hvm_vcpu.guest_cr[2] = n1vmcb->_cr2; + hvm_update_guest_cr(v, 2); + + /* CR3 */ + /* Nested paging mode */ + if (nestedhvm_paging_mode_hap(v)) { + /* host nested paging + guest nested paging. */ + /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */ + } else if (paging_mode_hap(v->domain)) { + /* host nested paging + guest shadow paging. */ + /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */ + } else { + /* host shadow paging + guest shadow paging. */ + + /* Reset MMU context -- XXX (hostrestore) not yet working*/ + if (!pagetable_is_null(v->arch.guest_table)) + put_page(pagetable_get_page(v->arch.guest_table)); + v->arch.guest_table = pagetable_null(); + /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. 
*/ + } + rc = hvm_set_cr3(n1vmcb->_cr3); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc); + + regs->eax = n1vmcb->rax; + regs->esp = n1vmcb->rsp; + regs->eip = n1vmcb->rip; + regs->eflags = n1vmcb->rflags; + n1vmcb->_dr7 = 0; /* disable all breakpoints */ + n1vmcb->_cpl = 0; + + /* Clear exitintinfo to prevent a fault loop of re-injecting + * exceptions forever. + */ + n1vmcb->exitintinfo.bytes = 0; + + /* Cleanbits */ + n1vmcb->cleanbits.bytes = 0; + + hvm_asid_flush_vcpu(v); + + return 0; +} + +static int nsvm_vmrun_permissionmap(struct vcpu *v, bool_t viopm) +{ + struct arch_svm_struct *arch_svm = &v->arch.hvm_svm; + struct nestedsvm *svm = &vcpu_nestedsvm(v); + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct vmcb_struct *ns_vmcb = nv->nv_vvmcx; + struct vmcb_struct *host_vmcb = arch_svm->vmcb; + unsigned long *ns_msrpm_ptr; + unsigned int i; + enum hvm_copy_result ret; + unsigned long *ns_viomap; + bool_t ioport_80, ioport_ed; + + ns_msrpm_ptr = (unsigned long *)svm->ns_cached_msrpm; + + ret = hvm_copy_from_guest_phys(svm->ns_cached_msrpm, + ns_vmcb->_msrpm_base_pa, MSRPM_SIZE); + if (ret != HVMCOPY_okay) { + gdprintk(XENLOG_ERR, "hvm_copy_from_guest_phys msrpm %u\n", ret); + return 1; + } + + /* Check l1 guest io permission map and get a shadow one based on + * if l1 guest intercepts io ports 0x80 and/or 0xED. + */ + svm->ns_oiomap_pa = svm->ns_iomap_pa; + svm->ns_iomap_pa = ns_vmcb->_iopm_base_pa; + + ns_viomap = hvm_map_guest_frame_ro(svm->ns_iomap_pa >> PAGE_SHIFT); + ASSERT(ns_viomap != NULL); + ioport_80 = test_bit(0x80, ns_viomap); + ioport_ed = test_bit(0xed, ns_viomap); + hvm_unmap_guest_frame(ns_viomap); + + svm->ns_iomap = nestedhvm_vcpu_iomap_get(ioport_80, ioport_ed); + + nv->nv_ioport80 = ioport_80; + nv->nv_ioportED = ioport_ed; + + /* v->arch.hvm_svm.msrpm has type unsigned long, thus + * BYTES_PER_LONG. 
+ */ + for (i = 0; i < MSRPM_SIZE / BYTES_PER_LONG; i++) + svm->ns_merged_msrpm[i] = arch_svm->msrpm[i] | ns_msrpm_ptr[i]; + + host_vmcb->_iopm_base_pa = + (uint64_t)virt_to_maddr(svm->ns_iomap); + host_vmcb->_msrpm_base_pa = + (uint64_t)virt_to_maddr(svm->ns_merged_msrpm); + + return 0; +} + +static int nsvm_vmcb_prepare4vmrun(struct vcpu *v, struct cpu_user_regs *regs) +{ + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct nestedsvm *svm = &vcpu_nestedsvm(v); + struct vmcb_struct *ns_vmcb, *n1vmcb, *n2vmcb; + bool_t vcleanbits_valid; + int rc; + + ns_vmcb = nv->nv_vvmcx; + n1vmcb = nv->nv_n1vmcx; + n2vmcb = nv->nv_n2vmcx; + ASSERT(ns_vmcb != NULL); + ASSERT(n1vmcb != NULL); + ASSERT(n2vmcb != NULL); + + /* Check if virtual VMCB cleanbits are valid */ + vcleanbits_valid = 1; + if (svm->ns_ovvmcb_pa == VMCX_EADDR) + vcleanbits_valid = 0; + if (svm->ns_ovvmcb_pa != nv->nv_vvmcxaddr) + vcleanbits_valid = 0; + +#define vcleanbit_set(_name) \ + (vcleanbits_valid && ns_vmcb->cleanbits.fields._name) + + /* Enable l2 guest intercepts */ + if (!vcleanbit_set(intercepts)) { + svm->ns_cr_intercepts = ns_vmcb->_cr_intercepts; + svm->ns_dr_intercepts = ns_vmcb->_dr_intercepts; + svm->ns_exception_intercepts = ns_vmcb->_exception_intercepts; + svm->ns_general1_intercepts = ns_vmcb->_general1_intercepts; + svm->ns_general2_intercepts = ns_vmcb->_general2_intercepts; + } + + /* We could track the cleanbits of the n1vmcb from + * last emulated #VMEXIT to this emulated VMRUN to save the merges + * below. Those cleanbits would be tracked in an integer field + * in struct nestedsvm. + * But this effort is not worth doing because: + * - Only the intercepts bit of the n1vmcb can effectively be used here + * - The CPU runs more instructions for the tracking than can be + * saved here. 
+ * The overhead comes from (ordered from highest to lowest): + * - svm_ctxt_switch_to (CPU context switching) + * - svm_fpu_enter, svm_fpu_leave (lazy FPU switching) + * - emulated CLGI (clears VINTR intercept) + * - host clears VINTR intercept + * Test results show that the overhead is high enough that the + * tracked intercepts bit of the n1vmcb is practically *always* cleared. + */ + + n2vmcb->_cr_intercepts = + n1vmcb->_cr_intercepts | ns_vmcb->_cr_intercepts; + n2vmcb->_dr_intercepts = + n1vmcb->_dr_intercepts | ns_vmcb->_dr_intercepts; + n2vmcb->_exception_intercepts = + n1vmcb->_exception_intercepts | ns_vmcb->_exception_intercepts; + n2vmcb->_general1_intercepts = + n1vmcb->_general1_intercepts | ns_vmcb->_general1_intercepts; + n2vmcb->_general2_intercepts = + n1vmcb->_general2_intercepts | ns_vmcb->_general2_intercepts; + + /* Nested Pause Filter */ + if (ns_vmcb->_general1_intercepts & GENERAL1_INTERCEPT_PAUSE) + n2vmcb->_pause_filter_count = + min(n1vmcb->_pause_filter_count, ns_vmcb->_pause_filter_count); + else + n2vmcb->_pause_filter_count = n1vmcb->_pause_filter_count; + + /* TSC offset */ + n2vmcb->_tsc_offset = n1vmcb->_tsc_offset + ns_vmcb->_tsc_offset; + + /* Nested IO permission bitmaps */ + rc = nsvm_vmrun_permissionmap(v, vcleanbit_set(iopm)); + if (rc) + return rc; + + /* ASID */ + hvm_asid_flush_vcpu(v); + /* n2vmcb->_guest_asid = ns_vmcb->_guest_asid; */ + + /* TLB control */ + n2vmcb->tlb_control = n1vmcb->tlb_control | ns_vmcb->tlb_control; + + /* Virtual Interrupts */ + if (!vcleanbit_set(tpr)) { + n2vmcb->_vintr = ns_vmcb->_vintr; + n2vmcb->_vintr.fields.intr_masking = 1; + } + + /* Shadow Mode */ + n2vmcb->interrupt_shadow = ns_vmcb->interrupt_shadow; + + /* Exit codes */ + n2vmcb->exitcode = ns_vmcb->exitcode; + n2vmcb->exitinfo1 = ns_vmcb->exitinfo1; + n2vmcb->exitinfo2 = ns_vmcb->exitinfo2; + n2vmcb->exitintinfo = ns_vmcb->exitintinfo; + + /* Pending Interrupts */ + n2vmcb->eventinj = ns_vmcb->eventinj; + + /* LBR virtualization 
*/ + if (!vcleanbit_set(lbr)) { + svm->ns_lbr_control = ns_vmcb->lbr_control; + } + n2vmcb->lbr_control.bytes = + n1vmcb->lbr_control.bytes | ns_vmcb->lbr_control.bytes; + + /* NextRIP */ + n2vmcb->nextrip = ns_vmcb->nextrip; + + /* + * VMCB Save State Area + */ + + /* Segments */ + if (!vcleanbit_set(seg)) { + n2vmcb->es = ns_vmcb->es; + n2vmcb->cs = ns_vmcb->cs; + n2vmcb->ss = ns_vmcb->ss; + n2vmcb->ds = ns_vmcb->ds; + /* CPL */ + n2vmcb->_cpl = ns_vmcb->_cpl; + } + if (!vcleanbit_set(dt)) { + n2vmcb->gdtr = ns_vmcb->gdtr; + n2vmcb->idtr = ns_vmcb->idtr; + } + + /* EFER */ + v->arch.hvm_vcpu.guest_efer = ns_vmcb->_efer; + rc = hvm_set_efer(ns_vmcb->_efer); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc); + + /* CR4 */ + v->arch.hvm_vcpu.guest_cr[4] = ns_vmcb->_cr4; + rc = hvm_set_cr4(ns_vmcb->_cr4); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc); + + /* CR0 */ + v->arch.hvm_vcpu.guest_cr[0] = ns_vmcb->_cr0; + rc = hvm_set_cr0(ns_vmcb->_cr0); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc); + + /* CR2 */ + v->arch.hvm_vcpu.guest_cr[2] = ns_vmcb->_cr2; + hvm_update_guest_cr(v, 2); + + /* Nested paging mode */ + if (nestedhvm_paging_mode_hap(v)) { + /* host nested paging + guest nested paging. */ + + /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */ + rc = hvm_set_cr3(ns_vmcb->_cr3); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc); + } else if (paging_mode_hap(v->domain)) { + /* host nested paging + guest shadow paging. */ + n2vmcb->_np_enable = 1; + /* Keep h_cr3 as it is. */ + /* When l1 guest does shadow paging + * we assume it intercepts page faults. + */ + /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. 
*/ + rc = hvm_set_cr3(ns_vmcb->_cr3); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc); + } else { + /* host shadow paging + guest shadow paging. */ + n2vmcb->_np_enable = 0; + n2vmcb->_h_cr3 = 0x0; + + /* TODO: Once shadow-shadow paging is in place come back to here + * and set host_vmcb->_cr3 to the shadowed shadow table. + */ + } + + /* DRn */ + if (!vcleanbit_set(dr)) { + n2vmcb->_dr7 = ns_vmcb->_dr7; + n2vmcb->_dr6 = ns_vmcb->_dr6; + } + + /* RFLAGS */ + n2vmcb->rflags = ns_vmcb->rflags; + + /* RIP */ + n2vmcb->rip = ns_vmcb->rip; + + /* RSP */ + n2vmcb->rsp = ns_vmcb->rsp; + + /* RAX */ + n2vmcb->rax = ns_vmcb->rax; + + /* Keep the host values of the fs, gs, ldtr, tr, kerngsbase, + * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp, + * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation. + */ + + /* Page tables */ + n2vmcb->pdpe0 = ns_vmcb->pdpe0; + n2vmcb->pdpe1 = ns_vmcb->pdpe1; + n2vmcb->pdpe2 = ns_vmcb->pdpe2; + n2vmcb->pdpe3 = ns_vmcb->pdpe3; + + /* PAT */ + if (!vcleanbit_set(np)) { + n2vmcb->_g_pat = ns_vmcb->_g_pat; + } + + if (!vcleanbit_set(lbr)) { + /* Debug Control MSR */ + n2vmcb->_debugctlmsr = ns_vmcb->_debugctlmsr; + + /* LBR MSRs */ + n2vmcb->_lastbranchfromip = ns_vmcb->_lastbranchfromip; + n2vmcb->_lastbranchtoip = ns_vmcb->_lastbranchtoip; + n2vmcb->_lastintfromip = ns_vmcb->_lastintfromip; + n2vmcb->_lastinttoip = ns_vmcb->_lastinttoip; + } + + /* Cleanbits */ + n2vmcb->cleanbits.bytes = 0; + + rc = svm_vmcb_isvalid(__func__, ns_vmcb, 1); + if (rc) { + gdprintk(XENLOG_ERR, "virtual vmcb invalid\n"); + return rc; + } + + rc = svm_vmcb_isvalid(__func__, n2vmcb, 1); + if (rc) { + gdprintk(XENLOG_ERR, "n2vmcb invalid\n"); + return rc; + } + + /* Switch guest registers to l2 guest */ + regs->eax = ns_vmcb->rax; + regs->eip = ns_vmcb->rip; + regs->esp = ns_vmcb->rsp; + regs->eflags = ns_vmcb->rflags; + +#undef vcleanbit_set + return 0; +} + +static int +nsvm_vcpu_vmentry(struct vcpu *v, struct 
cpu_user_regs *regs, + unsigned int inst_len) +{ + int ret; + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct nestedsvm *svm = &vcpu_nestedsvm(v); + struct vmcb_struct *ns_vmcb; + + ns_vmcb = nv->nv_vvmcx; + ASSERT(ns_vmcb != NULL); + ASSERT(nv->nv_n2vmcx != NULL); + ASSERT(nv->nv_n2vmcx_pa != VMCX_EADDR); + + /* Save values for later use. Needed for Nested-on-Nested and + * Shadow-on-Shadow paging. + */ + svm->ns_vmcb_guestcr3 = ns_vmcb->_cr3; + svm->ns_vmcb_hostcr3 = ns_vmcb->_h_cr3; + + nv->nv_flushp2m = (ns_vmcb->tlb_control + || (svm->ns_guest_asid != ns_vmcb->_guest_asid)); + svm->ns_guest_asid = ns_vmcb->_guest_asid; + + /* nested paging for the guest */ + svm->ns_hap_enabled = (ns_vmcb->_np_enable) ? 1 : 0; + + /* Remember the V_INTR_MASK in hostflags */ + svm->ns_hostflags.fields.vintrmask = + (ns_vmcb->_vintr.fields.intr_masking) ? 1 : 0; + + /* Save l1 guest state (= host state) */ + ret = nsvm_vcpu_hostsave(v, inst_len); + if (ret) { + gdprintk(XENLOG_ERR, "hostsave failed, ret = %i\n", ret); + return ret; + } + + /* switch vmcb to shadow vmcb */ + v->arch.hvm_svm.vmcb = nv->nv_n2vmcx; + v->arch.hvm_svm.vmcb_pa = nv->nv_n2vmcx_pa; + + ret = nsvm_vmcb_prepare4vmrun(v, regs); + if (ret) { + gdprintk(XENLOG_ERR, "prepare4vmrun failed, ret = %i\n", ret); + return ret; + } + + return 0; +} + +int +nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs) +{ + int ret; + unsigned int inst_len; + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct nestedsvm *svm = &vcpu_nestedsvm(v); + + inst_len = __get_instruction_length(v, INSTR_VMRUN); + if (inst_len == 0) { + svm->ns_vmexit.exitcode = VMEXIT_SHUTDOWN; + return -1; + } + + nv->nv_vmswitch_in_progress = 1; + ASSERT(nv->nv_vvmcx != NULL); + + /* save host state */ + ret = nsvm_vcpu_vmentry(v, regs, inst_len); + if (ret) { + gdprintk(XENLOG_ERR, + "nsvm_vcpu_vmentry failed, injecting #UD\n"); + hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0); + nv->nv_vmswitch_in_progress = 0; + 
return 1; + } + + /* Switch vcpu to guest mode + */ + nestedhvm_vcpu_enter_guestmode(v); + nv->nv_vmswitch_in_progress = 0; + return 0; +} + +int +nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs, + uint64_t exitcode) +{ + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct nestedsvm *svm = &vcpu_nestedsvm(v); + struct vmcb_struct *ns_vmcb; + + ns_vmcb = nv->nv_vvmcx; + + if (nv->nv_vmexit_pending) { + + switch (exitcode) { + case VMEXIT_INTR: + if ( unlikely(ns_vmcb->eventinj.fields.v) + && nv->nv_vmentry_pending + && hvm_event_needs_reinjection(ns_vmcb->eventinj.fields.type, + ns_vmcb->eventinj.fields.vector) ) + { + ns_vmcb->exitintinfo.bytes = ns_vmcb->eventinj.bytes; + } + break; + case VMEXIT_EXCEPTION_PF: + ns_vmcb->_cr2 = ns_vmcb->exitinfo2; + /* fall through */ + case VMEXIT_NPF: + /* PF error code */ + ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1; + /* fault address */ + ns_vmcb->exitinfo2 = svm->ns_vmexit.exitinfo2; + break; + case VMEXIT_EXCEPTION_NP: + case VMEXIT_EXCEPTION_SS: + case VMEXIT_EXCEPTION_GP: + case VMEXIT_EXCEPTION_15: + case VMEXIT_EXCEPTION_MF: + case VMEXIT_EXCEPTION_AC: + ns_vmcb->exitinfo1 = svm->ns_vmexit.exitinfo1; + break; + default: + break; + } + } + + ns_vmcb->exitcode = exitcode; + ns_vmcb->eventinj.bytes = 0; + return 0; +} + +int +nsvm_vcpu_vmexit_trap(struct vcpu *v, unsigned int trapnr, + int errcode, unsigned long cr2) +{ + ASSERT(vcpu_nestedhvm(v).nv_vvmcx != NULL); + + nestedsvm_vmexit_defer(v, VMEXIT_EXCEPTION_DE + trapnr, errcode, cr2); + return NESTEDHVM_VMEXIT_DONE; +} + +uint64_t nsvm_vcpu_guestcr3(struct vcpu *v) +{ + return vcpu_nestedsvm(v).ns_vmcb_guestcr3; +} + +uint64_t nsvm_vcpu_hostcr3(struct vcpu *v) +{ + return vcpu_nestedsvm(v).ns_vmcb_hostcr3; +} + +uint32_t nsvm_vcpu_asid(struct vcpu *v) +{ + return vcpu_nestedsvm(v).ns_guest_asid; +} + +static int +nsvm_vmcb_guest_intercepts_msr(unsigned long *msr_bitmap, + uint32_t msr, bool_t write) +{ + bool_t enabled; + unsigned long *msr_bit; 
+ + msr_bit = svm_msrbit(msr_bitmap, msr); + + if (msr_bit == NULL) + /* MSR not in the permission map: Let the guest handle it. */ + return NESTEDHVM_VMEXIT_INJECT; + + BUG_ON(msr_bit == NULL); + msr &= 0x1fff; + + if (write) + /* write access */ + enabled = test_bit(msr * 2 + 1, msr_bit); + else + /* read access */ + enabled = test_bit(msr * 2, msr_bit); + + if (!enabled) + return NESTEDHVM_VMEXIT_HOST; + + return NESTEDHVM_VMEXIT_INJECT; +} + +static int +nsvm_vmcb_guest_intercepts_ioio(paddr_t iopm_pa, uint64_t exitinfo1) +{ + unsigned long iopm_gfn = iopm_pa >> PAGE_SHIFT; + unsigned long *io_bitmap = NULL; + ioio_info_t ioinfo; + uint16_t port; + bool_t enabled; + + ioinfo.bytes = exitinfo1; + port = ioinfo.fields.port; + + switch (port) { + case 0 ... 32767: /* first 4KB page */ + io_bitmap = hvm_map_guest_frame_ro(iopm_gfn); + break; + case 32768 ... 65535: /* second 4KB page */ + port -= 32768; + io_bitmap = hvm_map_guest_frame_ro(iopm_gfn+1); + break; + default: + BUG(); + break; + } + + if (io_bitmap == NULL) { + gdprintk(XENLOG_ERR, + "IOIO intercept: mapping of permission map failed\n"); + return NESTEDHVM_VMEXIT_ERROR; + } + + enabled = test_bit(port, io_bitmap); + hvm_unmap_guest_frame(io_bitmap); + if (!enabled) + return NESTEDHVM_VMEXIT_HOST; + + return NESTEDHVM_VMEXIT_INJECT; +} + +int +nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v, + struct cpu_user_regs *regs, uint64_t exitcode) +{ + uint64_t exit_bits; + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct nestedsvm *svm = &vcpu_nestedsvm(v); + struct vmcb_struct *ns_vmcb = nv->nv_vvmcx; + enum nestedhvm_vmexits vmexits; + + switch (exitcode) { + case VMEXIT_CR0_READ ... VMEXIT_CR15_READ: + case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE: + exit_bits = 1ULL << (exitcode - VMEXIT_CR0_READ); + if (svm->ns_cr_intercepts & exit_bits) + break; + return 0; + + case VMEXIT_DR0_READ ... VMEXIT_DR7_READ: + case VMEXIT_DR0_WRITE ... 
VMEXIT_DR7_WRITE: + exit_bits = 1ULL << (exitcode - VMEXIT_DR0_READ); + if (svm->ns_dr_intercepts & exit_bits) + break; + return 0; + + case VMEXIT_EXCEPTION_DE ... VMEXIT_EXCEPTION_XF: + exit_bits = 1ULL << (exitcode - VMEXIT_EXCEPTION_DE); + if (svm->ns_exception_intercepts & exit_bits) + break; + return 0; + + case VMEXIT_INTR ... VMEXIT_SHUTDOWN: + exit_bits = 1ULL << (exitcode - VMEXIT_INTR); + if (svm->ns_general1_intercepts & exit_bits) + break; + return 0; + + case VMEXIT_VMRUN ... VMEXIT_XSETBV: + exit_bits = 1ULL << (exitcode - VMEXIT_VMRUN); + if (svm->ns_general2_intercepts & exit_bits) + break; + return 0; + + case VMEXIT_NPF: + case VMEXIT_INVALID: + /* Always intercepted */ + break; + + default: + gdprintk(XENLOG_ERR, "Illegal exitcode 0x%"PRIx64"\n", exitcode); + BUG(); + break; + } + + /* Special cases: Do more detailed checks */ + switch (exitcode) { + case VMEXIT_MSR: + ASSERT(regs != NULL); + nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr); + ASSERT(nv->nv_vvmcx != NULL); + ns_vmcb = nv->nv_vvmcx; + vmexits = nsvm_vmcb_guest_intercepts_msr(svm->ns_cached_msrpm, + regs->ecx, ns_vmcb->exitinfo1 != 0); + if (vmexits == NESTEDHVM_VMEXIT_HOST) + return 0; + break; + case VMEXIT_IOIO: + nestedsvm_vmcb_map(v, nv->nv_vvmcxaddr); + ASSERT(nv->nv_vvmcx != NULL); + ns_vmcb = nv->nv_vvmcx; + vmexits = nsvm_vmcb_guest_intercepts_ioio(ns_vmcb->_iopm_base_pa, + ns_vmcb->exitinfo1); + if (vmexits == NESTEDHVM_VMEXIT_HOST) + return 0; + break; + } + + return 1; +} + +int +nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr) +{ + return nsvm_vmcb_guest_intercepts_exitcode(v, + guest_cpu_user_regs(), VMEXIT_EXCEPTION_DE + trapnr); +} + +static int +nsvm_vmcb_prepare4vmexit(struct vcpu *v) +{ + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct nestedsvm *svm = &vcpu_nestedsvm(v); + struct vmcb_struct *ns_vmcb = nv->nv_vvmcx; + struct vmcb_struct *n2vmcb = nv->nv_n2vmcx; + + svm_vmsave(nv->nv_n1vmcx); + + /* Cache guest physical address of virtual vmcb + 
* for VMCB Cleanbit emulation. + */ + svm->ns_ovvmcb_pa = nv->nv_vvmcxaddr; + + /* Intercepts - keep them as they are */ + + /* Pausefilter - keep it as is */ + + /* Nested IO permission bitmap */ + /* Just keep the iopm_base_pa and msrpm_base_pa values. + * The guest must not see the virtualized values. + */ + + /* TSC offset */ + /* Keep it. It's maintained by the l1 guest. */ + + /* ASID */ + /* ns_vmcb->_guest_asid = n2vmcb->_guest_asid; */ + + /* TLB control */ + ns_vmcb->tlb_control = 0; + + /* Virtual Interrupts */ + ns_vmcb->_vintr = n2vmcb->_vintr; + if (!(svm->ns_hostflags.fields.vintrmask)) + ns_vmcb->_vintr.fields.intr_masking = 0; + + /* Shadow mode */ + ns_vmcb->interrupt_shadow = n2vmcb->interrupt_shadow; + + /* Exit codes */ + ns_vmcb->exitcode = n2vmcb->exitcode; + ns_vmcb->exitinfo1 = n2vmcb->exitinfo1; + ns_vmcb->exitinfo2 = n2vmcb->exitinfo2; + ns_vmcb->exitintinfo = n2vmcb->exitintinfo; + + /* Interrupts */ + /* If we emulate a VMRUN/#VMEXIT in the same host #VMEXIT cycle we have + * to make sure that we do not lose injected events. So check eventinj + * here and copy it to exitintinfo if it is valid. + * exitintinfo and eventinj can't be both valid because the case below + * only happens on a VMRUN instruction intercept which has no valid + * exitintinfo set. + */ + if ( unlikely(n2vmcb->eventinj.fields.v) && + hvm_event_needs_reinjection(n2vmcb->eventinj.fields.type, + n2vmcb->eventinj.fields.vector) ) + { + ns_vmcb->exitintinfo = n2vmcb->eventinj; + } + + ns_vmcb->eventinj.bytes = 0; + + /* Nested paging mode */ + if (nestedhvm_paging_mode_hap(v)) { + /* host nested paging + guest nested paging. */ + ns_vmcb->_np_enable = n2vmcb->_np_enable; + ns_vmcb->_cr3 = n2vmcb->_cr3; + /* The vmcb->h_cr3 is the shadowed h_cr3. The original + * unshadowed guest h_cr3 is kept in ns_vmcb->h_cr3, + * hence we keep the ns_vmcb->h_cr3 value. */ + } else if (paging_mode_hap(v->domain)) { + /* host nested paging + guest shadow paging. 
*/ + ns_vmcb->_np_enable = 0; + /* Throw h_cr3 away. Guest is not allowed to set it or + * it can break out, otherwise (security hole!) */ + ns_vmcb->_h_cr3 = 0x0; + /* Stop intercepting #PF (already done above + * by restoring cached intercepts). */ + ns_vmcb->_cr3 = n2vmcb->_cr3; + } else { + /* host shadow paging + guest shadow paging. */ + ns_vmcb->_np_enable = 0; + ns_vmcb->_h_cr3 = 0x0; + /* The vmcb->_cr3 is the shadowed cr3. The original + * unshadowed guest cr3 is kept in ns_vmcb->_cr3, + * hence we keep the ns_vmcb->_cr3 value. */ + } + + /* LBR virtualization - keep lbr control as is */ + + /* NextRIP */ + ns_vmcb->nextrip = n2vmcb->nextrip; + + /* + * VMCB Save State Area + */ + + /* Segments */ + ns_vmcb->es = n2vmcb->es; + ns_vmcb->cs = n2vmcb->cs; + ns_vmcb->ss = n2vmcb->ss; + ns_vmcb->ds = n2vmcb->ds; + ns_vmcb->gdtr = n2vmcb->gdtr; + ns_vmcb->idtr = n2vmcb->idtr; + + /* CPL */ + ns_vmcb->_cpl = n2vmcb->_cpl; + + /* EFER */ + ns_vmcb->_efer = n2vmcb->_efer; + + /* CRn */ + ns_vmcb->_cr4 = n2vmcb->_cr4; + ns_vmcb->_cr0 = n2vmcb->_cr0; + + /* DRn */ + ns_vmcb->_dr7 = n2vmcb->_dr7; + ns_vmcb->_dr6 = n2vmcb->_dr6; + + /* RFLAGS */ + ns_vmcb->rflags = n2vmcb->rflags; + + /* RIP */ + ns_vmcb->rip = n2vmcb->rip; + + /* RSP */ + ns_vmcb->rsp = n2vmcb->rsp; + + /* RAX */ + ns_vmcb->rax = n2vmcb->rax; + + /* Keep the l2 guest values of the fs, gs, ldtr, tr, kerngsbase, + * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp, + * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation. 
+ */ + + /* CR2 */ + ns_vmcb->_cr2 = n2vmcb->_cr2; + + /* Page tables */ + ns_vmcb->pdpe0 = n2vmcb->pdpe0; + ns_vmcb->pdpe1 = n2vmcb->pdpe1; + ns_vmcb->pdpe2 = n2vmcb->pdpe2; + ns_vmcb->pdpe3 = n2vmcb->pdpe3; + + /* PAT */ + ns_vmcb->_g_pat = n2vmcb->_g_pat; + + /* Debug Control MSR */ + ns_vmcb->_debugctlmsr = n2vmcb->_debugctlmsr; + + /* LBR MSRs */ + ns_vmcb->_lastbranchfromip = n2vmcb->_lastbranchfromip; + ns_vmcb->_lastbranchtoip = n2vmcb->_lastbranchtoip; + ns_vmcb->_lastintfromip = n2vmcb->_lastintfromip; + ns_vmcb->_lastinttoip = n2vmcb->_lastinttoip; + + return 0; +} + +bool_t +nsvm_vmcb_hap_enabled(struct vcpu *v) +{ + return vcpu_nestedsvm(v).ns_hap_enabled; +} + +/* MSR handling */ +int nsvm_rdmsr(struct vcpu *v, unsigned int msr, uint64_t *msr_content) +{ + struct nestedsvm *svm = &vcpu_nestedsvm(v); + int ret = 1; + + *msr_content = 0; + + switch (msr) { + case MSR_K8_VM_CR: + break; + case MSR_K8_VM_HSAVE_PA: + *msr_content = svm->ns_msr_hsavepa; + break; + default: + ret = 0; + break; + } + + return ret; +} + +int nsvm_wrmsr(struct vcpu *v, unsigned int msr, uint64_t msr_content) +{ + int ret = 1; + struct nestedsvm *svm = &vcpu_nestedsvm(v); + + switch (msr) { + case MSR_K8_VM_CR: + /* ignore write. handle all bits as read-only. 
*/ + break; + case MSR_K8_VM_HSAVE_PA: + if (!nestedsvm_vmcb_isvalid(v, msr_content)) { + gdprintk(XENLOG_ERR, + "MSR_K8_VM_HSAVE_PA value invalid 0x%"PRIx64"\n", msr_content); + ret = -1; /* inject #GP */ + break; + } + svm->ns_msr_hsavepa = msr_content; + break; + default: + ret = 0; + break; + } + + return ret; +} + +/* VMEXIT emulation */ +void +nestedsvm_vmexit_defer(struct vcpu *v, + uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2) +{ + struct nestedsvm *svm = &vcpu_nestedsvm(v); + + svm->ns_vmexit.exitcode = exitcode; + svm->ns_vmexit.exitinfo1 = exitinfo1; + svm->ns_vmexit.exitinfo2 = exitinfo2; + vcpu_nestedhvm(v).nv_vmexit_pending = 1; +} + +enum nestedhvm_vmexits +nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, + uint64_t exitcode) +{ + bool_t is_intercepted; + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + + ASSERT(nv->nv_vmexit_pending == 0); + is_intercepted = nsvm_vmcb_guest_intercepts_exitcode(v, regs, exitcode); + + switch (exitcode) { + case VMEXIT_INVALID: + if (is_intercepted) + return NESTEDHVM_VMEXIT_INJECT; + return NESTEDHVM_VMEXIT_HOST; + + case VMEXIT_INTR: + case VMEXIT_NMI: + return NESTEDHVM_VMEXIT_HOST; + case VMEXIT_EXCEPTION_NM: + /* Host must handle lazy fpu context switching first. + * Then inject the VMEXIT if L1 guest intercepts this. + */ + return NESTEDHVM_VMEXIT_HOST; + + case VMEXIT_NPF: + if (nestedhvm_paging_mode_hap(v)) { + if (!is_intercepted) + return NESTEDHVM_VMEXIT_FATALERROR; + /* host nested paging + guest nested paging */ + return NESTEDHVM_VMEXIT_HOST; + } + if (paging_mode_hap(v->domain)) { + if (is_intercepted) + return NESTEDHVM_VMEXIT_FATALERROR; + /* host nested paging + guest shadow paging */ + return NESTEDHVM_VMEXIT_HOST; + } + /* host shadow paging + guest shadow paging */ + /* Can this happen? 
*/ + BUG(); + return NESTEDHVM_VMEXIT_FATALERROR; + case VMEXIT_EXCEPTION_PF: + if (nestedhvm_paging_mode_hap(v)) { + /* host nested paging + guest nested paging */ + if (!is_intercepted) + /* l1 guest intercepts #PF unnecessarily */ + return NESTEDHVM_VMEXIT_HOST; + /* l2 guest intercepts #PF unnecessarily */ + return NESTEDHVM_VMEXIT_INJECT; + } + if (!paging_mode_hap(v->domain)) { + /* host shadow paging + guest shadow paging */ + return NESTEDHVM_VMEXIT_HOST; + } + /* host nested paging + guest shadow paging */ + return NESTEDHVM_VMEXIT_INJECT; + case VMEXIT_VMMCALL: + /* Always let the guest handle VMMCALL/VMCALL */ + return NESTEDHVM_VMEXIT_INJECT; + default: + break; + } + + if (is_intercepted) + return NESTEDHVM_VMEXIT_INJECT; + return NESTEDHVM_VMEXIT_HOST; +} + +enum nestedhvm_vmexits +nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs) +{ + int rc; + enum nestedhvm_vmexits ret = NESTEDHVM_VMEXIT_DONE; + + ASSERT(vcpu_nestedhvm(v).nv_vmswitch_in_progress); + ASSERT(nestedhvm_vcpu_in_guestmode(v)); + + rc = nsvm_vmcb_prepare4vmexit(v); + if (rc) + ret = NESTEDHVM_VMEXIT_ERROR; + + rc = nhvm_vcpu_hostrestore(v, regs); + if (rc) + ret = NESTEDHVM_VMEXIT_FATALERROR; + + nestedhvm_vcpu_exit_guestmode(v); + return ret; +} + +/* The exitcode is in native SVM/VMX format. The forced exitcode + * is in generic format. + */ +static enum nestedhvm_vmexits +nestedsvm_vcpu_vmexit(struct vcpu *v, struct cpu_user_regs *regs, + uint64_t exitcode) +{ + int rc; + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + + nv->nv_vmswitch_in_progress = 1; + + ASSERT(nv->nv_vvmcx != NULL); + + /* On special intercepts the host has to handle + * the vcpu is still in guest mode here. 
+ */ + if (nestedhvm_vcpu_in_guestmode(v)) { + enum nestedhvm_vmexits ret; + + ret = nestedsvm_vmexit_n2n1(v, regs); + switch (ret) { + case NESTEDHVM_VMEXIT_FATALERROR: + gdprintk(XENLOG_ERR, "VMEXIT: fatal error\n"); + return ret; + case NESTEDHVM_VMEXIT_HOST: + BUG(); + return ret; + case NESTEDHVM_VMEXIT_ERROR: + exitcode = VMEXIT_INVALID; + break; + default: + ASSERT(!nestedhvm_vcpu_in_guestmode(v)); + break; + } + + /* host state has been restored */ + } + + ASSERT(!nestedhvm_vcpu_in_guestmode(v)); + + /* Prepare for running the l1 guest. Make the actual + * modifications to the virtual VMCB/VMCS. + */ + rc = nhvm_vcpu_vmexit(v, regs, exitcode); + + nv->nv_vmswitch_in_progress = 0; + + if (rc) + return NESTEDHVM_VMEXIT_FATALERROR; + + return NESTEDHVM_VMEXIT_DONE; +} + +/* VCPU switch */ +asmlinkage void nsvm_vcpu_switch(struct cpu_user_regs *regs) +{ + struct vcpu *v = current; + struct nestedvcpu *nv; + struct nestedsvm *svm; + + if (!nestedhvm_enabled(v->domain)) + return; + + nv = &vcpu_nestedhvm(v); + svm = &vcpu_nestedsvm(v); + ASSERT(v->arch.hvm_svm.vmcb != NULL); + ASSERT(nv->nv_n1vmcx != NULL); + ASSERT(nv->nv_n2vmcx != NULL); + ASSERT(nv->nv_n1vmcx_pa != VMCX_EADDR); + ASSERT(nv->nv_n2vmcx_pa != VMCX_EADDR); + + if (nv->nv_vmexit_pending) { + vmexit: + nestedsvm_vcpu_vmexit(v, regs, svm->ns_vmexit.exitcode); + nv->nv_vmexit_pending = 0; + nv->nv_vmentry_pending = 0; + return; + } + if (nv->nv_vmentry_pending) { + int ret; + ASSERT(!nv->nv_vmexit_pending); + ret = nsvm_vcpu_vmrun(v, regs); + if (ret < 0) + goto vmexit; + nv->nv_vmentry_pending = 0; + return; + } +} + + diff -r e842b80fcee0 -r 3df1f127bd4f xen/arch/x86/hvm/svm/svm.c --- a/xen/arch/x86/hvm/svm/svm.c +++ b/xen/arch/x86/hvm/svm/svm.c @@ -49,6 +49,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -106,6 +109,44 @@ static void svm_cpu_down(void) write_efer(read_efer() & ~EFER_SVME); } +unsigned long * +svm_msrbit(unsigned long *msr_bitmap, uint32_t 
msr) +{ + unsigned long *msr_bit = NULL; + + /* + * See AMD64 Programmers Manual, Vol 2, Section 15.10 (MSR-Bitmap Address). + */ + if ( msr <= 0x1fff ) + msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG; + else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) ) + msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG; + else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) ) + msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG; + + return msr_bit; +} + +void svm_intercept_msr(struct vcpu *v, uint32_t msr, int enable) +{ + unsigned long *msr_bit; + + msr_bit = svm_msrbit(v->arch.hvm_svm.msrpm, msr); + BUG_ON(msr_bit == NULL); + msr &= 0x1fff; + + if ( enable ) + { + __set_bit(msr * 2, msr_bit); + __set_bit(msr * 2 + 1, msr_bit); + } + else + { + __clear_bit(msr * 2, msr_bit); + __clear_bit(msr * 2 + 1, msr_bit); + } +} + static void svm_save_dr(struct vcpu *v) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; @@ -296,7 +337,7 @@ static int svm_load_vmcb_ctxt(struct vcp { svm_load_cpu_state(v, ctxt); if (svm_vmcb_restore(v, ctxt)) { - printk("svm_vmcb restore failed!\n"); + gdprintk(XENLOG_ERR, "svm_vmcb restore failed!\n"); domain_crash(v->domain); return -EINVAL; } @@ -588,7 +629,24 @@ static void svm_set_segment_register(str static void svm_set_tsc_offset(struct vcpu *v, u64 offset) { struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - vmcb_set_tsc_offset(vmcb, offset); + struct vmcb_struct *n1vmcb, *n2vmcb; + uint64_t n2_tsc_offset = 0; + + if ( !nestedhvm_enabled(v->domain) ) { + vmcb_set_tsc_offset(vmcb, offset); + return; + } + + n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx; + n2vmcb = vcpu_nestedhvm(v).nv_n2vmcx; + + if ( nestedhvm_vcpu_in_guestmode(v) ) { + n2_tsc_offset = vmcb_get_tsc_offset(n2vmcb) - + vmcb_get_tsc_offset(n1vmcb); + vmcb_set_tsc_offset(n1vmcb, offset); + } + + vmcb_set_tsc_offset(vmcb, offset + n2_tsc_offset); } static void svm_set_rdtsc_exiting(struct vcpu *v, bool_t enable) @@ -683,9 +741,13 @@ static void svm_do_resume(struct vcpu *v { struct vmcb_struct 
*vmcb = v->arch.hvm_svm.vmcb; bool_t debug_state = v->domain->debugger_attached; - vintr_t intr; + bool_t vcpu_guestmode = 0; - if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) ) + if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) ) + vcpu_guestmode = 1; + + if ( !vcpu_guestmode && + unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) ) { uint32_t intercepts = vmcb_get_exception_intercepts(vmcb); uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3); @@ -703,13 +765,19 @@ static void svm_do_resume(struct vcpu *v hvm_asid_flush_vcpu(v); } - /* Reflect the vlapic's TPR in the hardware vtpr */ - intr = vmcb_get_vintr(vmcb); - intr.fields.tpr = - (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4; - vmcb_set_vintr(vmcb, intr); + if ( !vcpu_guestmode ) + { + vintr_t intr; + + /* Reflect the vlapic's TPR in the hardware vtpr */ + intr = vmcb_get_vintr(vmcb); + intr.fields.tpr = + (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4; + vmcb_set_vintr(vmcb, intr); + } hvm_do_resume(v); + reset_stack_and_jump(svm_asm_do_resume); } @@ -961,8 +1029,8 @@ static void svm_do_nested_pgfault(paddr_ struct { uint64_t gpa; uint64_t mfn; - u32 qualification; - u32 p2mt; + uint32_t qualification; + uint32_t p2mt; } _d; _d.gpa = gpa; @@ -984,12 +1052,21 @@ static void svm_do_nested_pgfault(paddr_ static void svm_fpu_dirty_intercept(void) { - struct vcpu *curr = current; - struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb; + struct vcpu *v = current; + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; - svm_fpu_enter(curr); + svm_fpu_enter(v); - if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) + if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) ) { + /* Check if guest must make FPU ready for the nested guest */ + if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS ) + hvm_inject_exception(TRAP_no_device, HVM_DELIVER_NO_ERROR_CODE, 0); + else + vmcb_set_cr0(vmcb, vmcb_get_cr0(vmcb) & ~X86_CR0_TS); + return; + } + 
+ if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) ) vmcb_set_cr0(vmcb, vmcb_get_cr0(vmcb) & ~X86_CR0_TS); } @@ -1003,11 +1080,14 @@ static void svm_cpuid_intercept( hvm_cpuid(input, eax, ebx, ecx, edx); - if ( input == 0x80000001 ) - { + switch (input) { + case 0x80000001: /* Fix up VLAPIC details. */ if ( vlapic_hw_disabled(vcpu_vlapic(v)) ) __clear_bit(X86_FEATURE_APIC & 31, edx); + break; + default: + break; } HVMTRACE_5D (CPUID, input, *eax, *ebx, *ecx, *edx); @@ -1043,6 +1123,7 @@ static void svm_dr_access(struct vcpu *v static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) { + int ret; struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; @@ -1076,9 +1157,6 @@ static int svm_msr_read_intercept(unsign *msr_content = 0; break; - case MSR_K8_VM_HSAVE_PA: - goto gpf; - case MSR_IA32_DEBUGCTLMSR: *msr_content = vmcb_get_debugctlmsr(vmcb); break; @@ -1111,6 +1189,11 @@ static int svm_msr_read_intercept(unsign break; default: + ret = nsvm_rdmsr(v, msr, msr_content); + if ( ret < 0 ) + goto gpf; + else if ( ret ) + break; if ( rdmsr_viridian_regs(msr, msr_content) || rdmsr_hypervisor_regs(msr, msr_content) ) @@ -1133,6 +1216,7 @@ static int svm_msr_read_intercept(unsign static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content) { + int ret; struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; int sync = 0; @@ -1153,9 +1237,6 @@ static int svm_msr_write_intercept(unsig switch ( msr ) { - case MSR_K8_VM_HSAVE_PA: - goto gpf; - case MSR_IA32_SYSENTER_CS: vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content; break; @@ -1215,6 +1296,12 @@ static int svm_msr_write_intercept(unsig break; default: + ret = nsvm_wrmsr(v, msr, msr_content); + if ( ret < 0 ) + goto gpf; + else if ( ret ) + break; + if ( wrmsr_viridian_regs(msr, msr_content) ) break; @@ -1298,6 +1385,96 @@ static void svm_vmexit_do_pause(struct c do_sched_op_compat(SCHEDOP_yield, 0); } +static void 
+svm_vmexit_do_vmrun(struct cpu_user_regs *regs, + struct vcpu *v, uint64_t vmcbaddr) +{ + if (!nestedhvm_enabled(v->domain)) { + gdprintk(XENLOG_ERR, "VMRUN: nestedhvm disabled, injecting #UD\n"); + hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0); + return; + } + + if (!nestedsvm_vmcb_map(v, vmcbaddr)) { + gdprintk(XENLOG_ERR, "VMRUN: mapping vmcb failed, injecting #UD\n"); + hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0); + return; + } + + vcpu_nestedhvm(v).nv_vmentry_pending = 1; + return; +} + +static void +svm_vmexit_do_vmload(struct vmcb_struct *vmcb, + struct cpu_user_regs *regs, + struct vcpu *v, uint64_t vmcbaddr) +{ + int ret; + unsigned int inst_len; + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + + if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 ) + return; + + if (!nestedhvm_enabled(v->domain)) { + gdprintk(XENLOG_ERR, "VMLOAD: nestedhvm disabled, injecting #UD\n"); + ret = TRAP_invalid_op; + goto inject; + } + + if (!nestedsvm_vmcb_map(v, vmcbaddr)) { + gdprintk(XENLOG_ERR, "VMLOAD: mapping vmcb failed, injecting #UD\n"); + ret = TRAP_invalid_op; + goto inject; + } + + svm_vmload(nv->nv_vvmcx); + /* State in L1 VMCB is stale now */ + v->arch.hvm_svm.vmcb_in_sync = 0; + + __update_guest_eip(regs, inst_len); + return; + + inject: + hvm_inject_exception(ret, HVM_DELIVER_NO_ERROR_CODE, 0); + return; +} + +static void +svm_vmexit_do_vmsave(struct vmcb_struct *vmcb, + struct cpu_user_regs *regs, + struct vcpu *v, uint64_t vmcbaddr) +{ + int ret; + unsigned int inst_len; + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + + if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 ) + return; + + if (!nestedhvm_enabled(v->domain)) { + gdprintk(XENLOG_ERR, "VMSAVE: nestedhvm disabled, injecting #UD\n"); + ret = TRAP_invalid_op; + goto inject; + } + + if (!nestedsvm_vmcb_map(v, vmcbaddr)) { + gdprintk(XENLOG_ERR, "VMSAVE: mapping vmcb failed, injecting #UD\n"); + ret = TRAP_invalid_op; + goto inject; 
+ } + + svm_vmsave(nv->nv_vvmcx); + + __update_guest_eip(regs, inst_len); + return; + + inject: + hvm_inject_exception(ret, HVM_DELIVER_NO_ERROR_CODE, 0); + return; +} + static void svm_vmexit_ud_intercept(struct cpu_user_regs *regs) { struct hvm_emulate_ctxt ctxt; @@ -1428,22 +1605,38 @@ static struct hvm_function_table __read_ .msr_read_intercept = svm_msr_read_intercept, .msr_write_intercept = svm_msr_write_intercept, .invlpg_intercept = svm_invlpg_intercept, - .set_rdtsc_exiting = svm_set_rdtsc_exiting + .set_rdtsc_exiting = svm_set_rdtsc_exiting, + + .nhvm_vcpu_initialise = nsvm_vcpu_initialise, + .nhvm_vcpu_destroy = nsvm_vcpu_destroy, + .nhvm_vcpu_reset = nsvm_vcpu_reset, + .nhvm_vcpu_hostrestore = nsvm_vcpu_hostrestore, + .nhvm_vcpu_vmexit = nsvm_vcpu_vmexit_inject, + .nhvm_vcpu_vmexit_trap = nsvm_vcpu_vmexit_trap, + .nhvm_vcpu_guestcr3 = nsvm_vcpu_guestcr3, + .nhvm_vcpu_hostcr3 = nsvm_vcpu_hostcr3, + .nhvm_vcpu_asid = nsvm_vcpu_asid, + .nhvm_vmcx_guest_intercepts_trap = nsvm_vmcb_guest_intercepts_trap, + .nhvm_vmcx_hap_enabled = nsvm_vmcb_hap_enabled, }; asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs) { - unsigned int exit_reason; + uint64_t exit_reason; struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; eventinj_t eventinj; int inst_len, rc; vintr_t intr; + bool_t vcpu_guestmode = 0; if ( paging_mode_hap(v->domain) ) v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] = vmcb_get_cr3(vmcb); + if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) ) + vcpu_guestmode = 1; + /* * Before doing anything else, we need to sync up the VLAPIC's TPR with * SVM's vTPR. It's OK if the guest doesn't touch CR8 (e.g. 32-bit Windows) @@ -1451,13 +1644,73 @@ asmlinkage void svm_vmexit_handler(struc * NB. We need to preserve the low bits of the TPR to make checked builds * of Windows work, even though they don't actually do anything. 
*/ - intr = vmcb_get_vintr(vmcb); - vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI, + if ( !vcpu_guestmode ) { + intr = vmcb_get_vintr(vmcb); + vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI, ((intr.fields.tpr & 0x0F) << 4) | (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0x0F)); + } exit_reason = vmcb->exitcode; + if ( vcpu_guestmode ) { + enum nestedhvm_vmexits nsret; + struct nestedvcpu *nv = &vcpu_nestedhvm(v); + struct vmcb_struct *ns_vmcb = nv->nv_vvmcx; + uint64_t exitinfo1, exitinfo2; + + /* Write real exitinfo1 back into virtual vmcb. + * nestedsvm_check_intercepts() expects to have the correct + * exitinfo1 value there. + */ + exitinfo1 = ns_vmcb->exitinfo1; + ns_vmcb->exitinfo1 = vmcb->exitinfo1; + nsret = nestedsvm_check_intercepts(v, regs, exit_reason); + switch (nsret) { + case NESTEDHVM_VMEXIT_CONTINUE: + BUG(); + break; + case NESTEDHVM_VMEXIT_HOST: + break; + case NESTEDHVM_VMEXIT_INJECT: + /* Switch vcpu from l2 to l1 guest. We must perform + * the switch here to have svm_do_resume() working + * as intended. 
+ */ + exitinfo1 = vmcb->exitinfo1; + exitinfo2 = vmcb->exitinfo2; + nv->nv_vmswitch_in_progress = 1; + nsret = nestedsvm_vmexit_n2n1(v, regs); + nv->nv_vmswitch_in_progress = 0; + switch (nsret) { + case NESTEDHVM_VMEXIT_DONE: + /* defer VMEXIT injection */ + nestedsvm_vmexit_defer(v, exit_reason, exitinfo1, exitinfo2); + goto out; + case NESTEDHVM_VMEXIT_FATALERROR: + gdprintk(XENLOG_ERR, "unexpected nestedsvm_vmexit() error\n"); + goto exit_and_crash; + + default: + BUG(); + case NESTEDHVM_VMEXIT_ERROR: + break; + } + case NESTEDHVM_VMEXIT_ERROR: + gdprintk(XENLOG_ERR, + "nestedsvm_check_intercepts() returned NESTEDHVM_VMEXIT_ERROR\n"); + goto out; + case NESTEDHVM_VMEXIT_FATALERROR: + gdprintk(XENLOG_ERR, + "unexpected nestedsvm_check_intercepts() error\n"); + goto exit_and_crash; + default: + gdprintk(XENLOG_INFO, "nestedsvm_check_intercepts() returned %i\n", + nsret); + goto exit_and_crash; + } + } + if ( hvm_long_mode_enabled(v) ) HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason, (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32), @@ -1469,7 +1722,7 @@ asmlinkage void svm_vmexit_handler(struc if ( unlikely(exit_reason == VMEXIT_INVALID) ) { - svm_dump_vmcb(__func__, vmcb); + svm_vmcb_dump(__func__, vmcb); goto exit_and_crash; } @@ -1630,6 +1883,7 @@ asmlinkage void svm_vmexit_handler(struc case VMEXIT_VMMCALL: if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 ) break; + BUG_ON(vcpu_guestmode); HVMTRACE_1D(VMMCALL, regs->eax); rc = hvm_do_hypercall(regs); if ( rc != HVM_HCALL_preempted ) @@ -1662,9 +1916,18 @@ asmlinkage void svm_vmexit_handler(struc case VMEXIT_MONITOR: case VMEXIT_MWAIT: + hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0); + break; + case VMEXIT_VMRUN: + svm_vmexit_do_vmrun(regs, v, regs->eax); + break; case VMEXIT_VMLOAD: + svm_vmexit_do_vmload(vmcb, regs, v, regs->eax); + break; case VMEXIT_VMSAVE: + svm_vmexit_do_vmsave(vmcb, regs, v, regs->eax); + break; case VMEXIT_STGI: case VMEXIT_CLGI: case 
VMEXIT_SKINIT: @@ -1708,7 +1971,7 @@ asmlinkage void svm_vmexit_handler(struc default: exit_and_crash: - gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, " + gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%"PRIx64", " "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n", exit_reason, (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2); @@ -1716,6 +1979,11 @@ asmlinkage void svm_vmexit_handler(struc break; } + out: + if ( vcpu_guestmode ) + /* Don't clobber TPR of the nested guest. */ + return; + /* The exit may have updated the TPR: reflect this in the hardware vtpr */ intr = vmcb_get_vintr(vmcb); intr.fields.tpr = diff -r e842b80fcee0 -r 3df1f127bd4f xen/arch/x86/hvm/svm/svmdebug.c --- /dev/null +++ b/xen/arch/x86/hvm/svm/svmdebug.c @@ -0,0 +1,191 @@ +/* + * svmdebug.c: debug functions + * Copyright (c) 2011, Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include +#include +#include + +static void svm_dump_sel(const char *name, svm_segment_register_t *s) +{ + printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", + name, s->sel, s->attr.bytes, s->limit, + (unsigned long long)s->base); +} + +/* This function can directly access fields which are covered by clean bits. 
*/ +void svm_vmcb_dump(const char *from, struct vmcb_struct *vmcb) +{ + printk("Dumping guest's current state at %s...\n", from); + printk("Size of VMCB = %d, paddr = 0x%016lx, vaddr = %p\n", + (int) sizeof(struct vmcb_struct), virt_to_maddr(vmcb), vmcb); + + printk("cr_intercepts = 0x%08x dr_intercepts = 0x%08x " + "exception_intercepts = 0x%08x\n", + vmcb->_cr_intercepts, vmcb->_dr_intercepts, + vmcb->_exception_intercepts); + printk("general1_intercepts = 0x%08x general2_intercepts = 0x%08x\n", + vmcb->_general1_intercepts, vmcb->_general2_intercepts); + printk("iopm_base_pa = 0x%016llx msrpm_base_pa = 0x%016llx tsc_offset = " + "0x%016llx\n", + (unsigned long long)vmcb->_iopm_base_pa, + (unsigned long long)vmcb->_msrpm_base_pa, + (unsigned long long)vmcb->_tsc_offset); + printk("tlb_control = 0x%08x vintr = 0x%016llx interrupt_shadow = " + "0x%016llx\n", vmcb->tlb_control, + (unsigned long long)vmcb->_vintr.bytes, + (unsigned long long)vmcb->interrupt_shadow); + printk("exitcode = 0x%016llx exitintinfo = 0x%016llx\n", + (unsigned long long)vmcb->exitcode, + (unsigned long long)vmcb->exitintinfo.bytes); + printk("exitinfo1 = 0x%016llx exitinfo2 = 0x%016llx \n", + (unsigned long long)vmcb->exitinfo1, + (unsigned long long)vmcb->exitinfo2); + printk("np_enable = 0x%016llx guest_asid = 0x%03x\n", + (unsigned long long)vmcb->_np_enable, vmcb->_guest_asid); + printk("cpl = %d efer = 0x%016llx star = 0x%016llx lstar = 0x%016llx\n", + vmcb->_cpl, (unsigned long long)vmcb->_efer, + (unsigned long long)vmcb->star, (unsigned long long)vmcb->lstar); + printk("CR0 = 0x%016llx CR2 = 0x%016llx\n", + (unsigned long long)vmcb->_cr0, (unsigned long long)vmcb->_cr2); + printk("CR3 = 0x%016llx CR4 = 0x%016llx\n", + (unsigned long long)vmcb->_cr3, (unsigned long long)vmcb->_cr4); + printk("RSP = 0x%016llx RIP = 0x%016llx\n", + (unsigned long long)vmcb->rsp, (unsigned long long)vmcb->rip); + printk("RAX = 0x%016llx RFLAGS=0x%016llx\n", + (unsigned long long)vmcb->rax, (unsigned long 
long)vmcb->rflags); + printk("DR6 = 0x%016llx, DR7 = 0x%016llx\n", + (unsigned long long)vmcb->_dr6, (unsigned long long)vmcb->_dr7); + printk("CSTAR = 0x%016llx SFMask = 0x%016llx\n", + (unsigned long long)vmcb->cstar, + (unsigned long long)vmcb->sfmask); + printk("KernGSBase = 0x%016llx PAT = 0x%016llx \n", + (unsigned long long)vmcb->kerngsbase, + (unsigned long long)vmcb->_g_pat); + printk("H_CR3 = 0x%016llx CleanBits = 0x%08x\n", + (unsigned long long)vmcb->_h_cr3, vmcb->cleanbits.bytes); + + /* print out all the selectors */ + svm_dump_sel("CS", &vmcb->cs); + svm_dump_sel("DS", &vmcb->ds); + svm_dump_sel("SS", &vmcb->ss); + svm_dump_sel("ES", &vmcb->es); + svm_dump_sel("FS", &vmcb->fs); + svm_dump_sel("GS", &vmcb->gs); + svm_dump_sel("GDTR", &vmcb->gdtr); + svm_dump_sel("LDTR", &vmcb->ldtr); + svm_dump_sel("IDTR", &vmcb->idtr); + svm_dump_sel("TR", &vmcb->tr); +} + +bool_t +svm_vmcb_isvalid(const char *from, struct vmcb_struct *vmcb, + bool_t verbose) +{ + bool_t ret = 0; /* ok */ + +#define PRINTF(...) 
\ + if (verbose) { ret = 1; printk("%s: ", from); printk(__VA_ARGS__); \ + } else return 1; + + if ((vmcb->_efer & EFER_SVME) == 0) { + PRINTF("EFER: SVME bit not set (0x%"PRIx64")\n", vmcb->_efer); + } + + if ((vmcb->_cr0 & X86_CR0_CD) == 0 && (vmcb->_cr0 & X86_CR0_NW) != 0) { + PRINTF("CR0: CD bit is zero and NW bit set (0x%"PRIx64")\n", + vmcb->_cr0); + } + + if ((vmcb->_cr0 >> 32U) != 0) { + PRINTF("CR0: bits [63:32] are not zero (0x%"PRIx64")\n", + vmcb->_cr0); + } + + if ((vmcb->_cr3 & 0x7) != 0) { + PRINTF("CR3: MBZ bits are set (0x%"PRIx64")\n", vmcb->_cr3); + } + if ((vmcb->_efer & EFER_LMA) && (vmcb->_cr3 & 0xfe) != 0) { + PRINTF("CR3: MBZ bits are set (0x%"PRIx64")\n", vmcb->_cr3); + } + + if ((vmcb->_cr4 >> 11U) != 0) { + PRINTF("CR4: bits [63:11] are not zero (0x%"PRIx64")\n", + vmcb->_cr4); + } + + if ((vmcb->_dr6 >> 32U) != 0) { + PRINTF("DR6: bits [63:32] are not zero (0x%"PRIx64")\n", + vmcb->_dr6); + } + + if ((vmcb->_dr7 >> 32U) != 0) { + PRINTF("DR7: bits [63:32] are not zero (0x%"PRIx64")\n", + vmcb->_dr7); + } + + if ((vmcb->_efer >> 15U) != 0) { + PRINTF("EFER: bits [63:15] are not zero (0x%"PRIx64")\n", + vmcb->_efer); + } + + if ((vmcb->_efer & EFER_LME) != 0 && ((vmcb->_cr0 & X86_CR0_PG) != 0)) { + if ((vmcb->_cr4 & X86_CR4_PAE) == 0) { + PRINTF("EFER_LME and CR0.PG are both set and CR4.PAE is zero.\n"); + } + if ((vmcb->_cr0 & X86_CR0_PE) == 0) { + PRINTF("EFER_LME and CR0.PG are both set and CR0.PE is zero.\n"); + } + } + + if ((vmcb->_efer & EFER_LME) != 0 + && (vmcb->_cr0 & X86_CR0_PG) != 0 + && (vmcb->_cr4 & X86_CR4_PAE) != 0 + && (vmcb->cs.attr.fields.l != 0) + && (vmcb->cs.attr.fields.db != 0)) + { + PRINTF("EFER_LME, CR0.PG, CR4.PAE, CS.L and CS.D are all non-zero.\n"); + } + + if ((vmcb->_general2_intercepts & GENERAL2_INTERCEPT_VMRUN) == 0) { + PRINTF("GENERAL2_INTERCEPT: VMRUN intercept bit is clear (0x%"PRIx32")\n", + vmcb->_general2_intercepts); + } + + if (vmcb->eventinj.fields.resvd1 != 0) { + PRINTF("eventinj: MBZ bits are 
set (0x%"PRIx64")\n", + vmcb->eventinj.bytes); + } + + if (vmcb->_np_enable && vmcb->_h_cr3 == 0) { + PRINTF("nested paging enabled but host cr3 is 0\n"); + } + +#undef PRINTF + return ret; +} + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r e842b80fcee0 -r 3df1f127bd4f xen/arch/x86/hvm/svm/vmcb.c --- a/xen/arch/x86/hvm/svm/vmcb.c +++ b/xen/arch/x86/hvm/svm/vmcb.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -40,9 +41,6 @@ extern int svm_dbg_on; -#define IOPM_SIZE (12 * 1024) -#define MSRPM_SIZE (8 * 1024) - struct vmcb_struct *alloc_vmcb(void) { struct vmcb_struct *vmcb; @@ -78,37 +76,6 @@ struct host_save_area *alloc_host_save_a return hsa; } -void svm_intercept_msr(struct vcpu *v, uint32_t msr, int enable) -{ - unsigned long *msr_bitmap = v->arch.hvm_svm.msrpm; - unsigned long *msr_bit = NULL; - - /* - * See AMD64 Programmers Manual, Vol 2, Section 15.10 (MSR-Bitmap Address). - */ - if ( msr <= 0x1fff ) - msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG; - else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) ) - msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG; - else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) ) - msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG; - - BUG_ON(msr_bit == NULL); - - msr &= 0x1fff; - - if ( enable ) - { - __set_bit(msr * 2, msr_bit); - __set_bit(msr * 2 + 1, msr_bit); - } - else - { - __clear_bit(msr * 2, msr_bit); - __clear_bit(msr * 2 + 1, msr_bit); - } -} - /* This function can directly access fields which are covered by clean bits. 
*/ static int construct_vmcb(struct vcpu *v) { @@ -257,7 +224,7 @@ static int construct_vmcb(struct vcpu *v if ( cpu_has_pause_filter ) { - vmcb->_pause_filter_count = 3000; + vmcb->_pause_filter_count = SVM_PAUSEFILTER_INIT; vmcb->_general1_intercepts |= GENERAL1_INTERCEPT_PAUSE; } @@ -268,34 +235,38 @@ static int construct_vmcb(struct vcpu *v int svm_create_vmcb(struct vcpu *v) { + struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct arch_svm_struct *arch_svm = &v->arch.hvm_svm; int rc; - if ( (arch_svm->vmcb == NULL) && - (arch_svm->vmcb = alloc_vmcb()) == NULL ) + if ( (nv->nv_n1vmcx == NULL) && + (nv->nv_n1vmcx = alloc_vmcb()) == NULL ) { printk("Failed to create a new VMCB\n"); return -ENOMEM; } - if ( (rc = construct_vmcb(v)) != 0 ) + arch_svm->vmcb = nv->nv_n1vmcx; + rc = construct_vmcb(v); + if ( rc != 0 ) { - free_vmcb(arch_svm->vmcb); + free_vmcb(nv->nv_n1vmcx); + nv->nv_n1vmcx = NULL; arch_svm->vmcb = NULL; return rc; } - arch_svm->vmcb_pa = virt_to_maddr(arch_svm->vmcb); - + arch_svm->vmcb_pa = nv->nv_n1vmcx_pa = virt_to_maddr(arch_svm->vmcb); return 0; } void svm_destroy_vmcb(struct vcpu *v) { + struct nestedvcpu *nv = &vcpu_nestedhvm(v); struct arch_svm_struct *arch_svm = &v->arch.hvm_svm; - if ( arch_svm->vmcb != NULL ) - free_vmcb(arch_svm->vmcb); + if ( nv->nv_n1vmcx != NULL ) + free_vmcb(nv->nv_n1vmcx); if ( arch_svm->msrpm != NULL ) { @@ -304,81 +275,11 @@ void svm_destroy_vmcb(struct vcpu *v) arch_svm->msrpm = NULL; } + nv->nv_n1vmcx = NULL; + nv->nv_n1vmcx_pa = VMCX_EADDR; arch_svm->vmcb = NULL; } -static void svm_dump_sel(char *name, svm_segment_register_t *s) -{ - printk("%s: sel=0x%04x, attr=0x%04x, limit=0x%08x, base=0x%016llx\n", - name, s->sel, s->attr.bytes, s->limit, - (unsigned long long)s->base); -} - -/* This function can directly access fields which are covered by clean bits. 
*/ -void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb) -{ - printk("Dumping guest's current state at %s...\n", from); - printk("Size of VMCB = %d, paddr = 0x%016lx, vaddr = %p\n", - (int) sizeof(struct vmcb_struct), virt_to_maddr(vmcb), vmcb); - - printk("cr_intercepts = 0x%08x dr_intercepts = 0x%08x " - "exception_intercepts = 0x%08x\n", - vmcb->_cr_intercepts, vmcb->_dr_intercepts, - vmcb->_exception_intercepts); - printk("general1_intercepts = 0x%08x general2_intercepts = 0x%08x\n", - vmcb->_general1_intercepts, vmcb->_general2_intercepts); - printk("iopm_base_pa = 0x%016llx msrpm_base_pa = 0x%016llx tsc_offset = " - "0x%016llx\n", - (unsigned long long)vmcb->_iopm_base_pa, - (unsigned long long)vmcb->_msrpm_base_pa, - (unsigned long long)vmcb->_tsc_offset); - printk("tlb_control = 0x%08x vintr = 0x%016llx interrupt_shadow = " - "0x%016llx\n", vmcb->tlb_control, - (unsigned long long)vmcb->_vintr.bytes, - (unsigned long long)vmcb->interrupt_shadow); - printk("exitcode = 0x%016llx exitintinfo = 0x%016llx\n", - (unsigned long long)vmcb->exitcode, - (unsigned long long)vmcb->exitintinfo.bytes); - printk("exitinfo1 = 0x%016llx exitinfo2 = 0x%016llx \n", - (unsigned long long)vmcb->exitinfo1, - (unsigned long long)vmcb->exitinfo2); - printk("np_enable = 0x%016llx guest_asid = 0x%03x\n", - (unsigned long long)vmcb->_np_enable, vmcb->_guest_asid); - printk("cpl = %d efer = 0x%016llx star = 0x%016llx lstar = 0x%016llx\n", - vmcb->_cpl, (unsigned long long)vmcb->_efer, - (unsigned long long)vmcb->star, (unsigned long long)vmcb->lstar); - printk("CR0 = 0x%016llx CR2 = 0x%016llx\n", - (unsigned long long)vmcb->_cr0, (unsigned long long)vmcb->_cr2); - printk("CR3 = 0x%016llx CR4 = 0x%016llx\n", - (unsigned long long)vmcb->_cr3, (unsigned long long)vmcb->_cr4); - printk("RSP = 0x%016llx RIP = 0x%016llx\n", - (unsigned long long)vmcb->rsp, (unsigned long long)vmcb->rip); - printk("RAX = 0x%016llx RFLAGS=0x%016llx\n", - (unsigned long long)vmcb->rax, (unsigned long 
long)vmcb->rflags); - printk("DR6 = 0x%016llx, DR7 = 0x%016llx\n", - (unsigned long long)vmcb->_dr6, (unsigned long long)vmcb->_dr7); - printk("CSTAR = 0x%016llx SFMask = 0x%016llx\n", - (unsigned long long)vmcb->cstar, - (unsigned long long)vmcb->sfmask); - printk("KernGSBase = 0x%016llx PAT = 0x%016llx \n", - (unsigned long long)vmcb->kerngsbase, - (unsigned long long)vmcb->_g_pat); - printk("H_CR3 = 0x%016llx CleanBits = 0x%08x\n", - (unsigned long long)vmcb->_h_cr3, vmcb->cleanbits.bytes); - - /* print out all the selectors */ - svm_dump_sel("CS", &vmcb->cs); - svm_dump_sel("DS", &vmcb->ds); - svm_dump_sel("SS", &vmcb->ss); - svm_dump_sel("ES", &vmcb->es); - svm_dump_sel("FS", &vmcb->fs); - svm_dump_sel("GS", &vmcb->gs); - svm_dump_sel("GDTR", &vmcb->gdtr); - svm_dump_sel("LDTR", &vmcb->ldtr); - svm_dump_sel("IDTR", &vmcb->idtr); - svm_dump_sel("TR", &vmcb->tr); -} - static void vmcb_dump(unsigned char ch) { struct domain *d; @@ -396,7 +297,7 @@ static void vmcb_dump(unsigned char ch) for_each_vcpu ( d, v ) { printk("\tVCPU %d\n", v->vcpu_id); - svm_dump_vmcb("key_handler", v->arch.hvm_svm.vmcb); + svm_vmcb_dump("key_handler", v->arch.hvm_svm.vmcb); } } diff -r e842b80fcee0 -r 3df1f127bd4f xen/include/asm-x86/hvm/svm/emulate.h --- a/xen/include/asm-x86/hvm/svm/emulate.h +++ b/xen/include/asm-x86/hvm/svm/emulate.h @@ -33,6 +33,11 @@ enum instruction_index { INSTR_RDTSC, INSTR_PAUSE, INSTR_XSETBV, + INSTR_VMRUN, + INSTR_VMLOAD, + INSTR_VMSAVE, + INSTR_STGI, + INSTR_CLGI, INSTR_MAX_COUNT /* Must be last - Number of instructions supported */ }; diff -r e842b80fcee0 -r 3df1f127bd4f xen/include/asm-x86/hvm/svm/nestedsvm.h --- /dev/null +++ b/xen/include/asm-x86/hvm/svm/nestedsvm.h @@ -0,0 +1,129 @@ +/* + * nestedsvm.h: Nested Virtualization + * Copyright (c) 2011, Advanced Micro Devices, Inc + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published 
by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ +#ifndef __ASM_X86_HVM_SVM_NESTEDSVM_H__ +#define __ASM_X86_HVM_SVM_NESTEDSVM_H__ + +#include +#include +#include + +struct nestedsvm { + uint64_t ns_msr_hsavepa; /* MSR HSAVE_PA value */ + + /* l1 guest physical address of virtual vmcb used by prior VMRUN. + * Needed for VMCB Cleanbit emulation. + */ + uint64_t ns_ovvmcb_pa; + + /* Cached real intercepts of the l2 guest */ + uint32_t ns_cr_intercepts; + uint32_t ns_dr_intercepts; + uint32_t ns_exception_intercepts; + uint32_t ns_general1_intercepts; + uint32_t ns_general2_intercepts; + + /* Cached real lbr of the l2 guest */ + lbrctrl_t ns_lbr_control; + + /* Cached real MSR permission bitmaps of the l2 guest */ + unsigned long *ns_cached_msrpm; + /* Merged MSR permission bitmap */ + unsigned long *ns_merged_msrpm; + + /* guest physical address of virtual io permission map */ + paddr_t ns_iomap_pa, ns_oiomap_pa; + /* Shadow io permission map */ + unsigned long *ns_iomap; + + /* Cache guest cr3/host cr3 the guest sets up for the l2 guest. + * Used by Shadow-on-Shadow and Nested-on-Nested. 
+ * ns_vmcb_guestcr3: in l2 guest physical address space and points to + * the l2 guest page table + * ns_vmcb_hostcr3: in l1 guest physical address space and points to + * the l1 guest nested page table + */ + uint64_t ns_vmcb_guestcr3, ns_vmcb_hostcr3; + uint32_t ns_guest_asid; + + bool_t ns_hap_enabled; + + /* Only meaningful when vmexit_pending flag is set */ + struct { + uint64_t exitcode; /* native exitcode to inject into l1 guest */ + uint64_t exitinfo1; /* additional information to the exitcode */ + uint64_t exitinfo2; /* additional information to the exitcode */ + } ns_vmexit; + union { + uint32_t bytes; + struct { + uint32_t rflagsif: 1; + uint32_t vintrmask: 1; + uint32_t reserved: 30; + } fields; + } ns_hostflags; +}; + +#define vcpu_nestedsvm(v) (vcpu_nestedhvm(v).u.nsvm) + +/* True when l1 guest enabled SVM in EFER */ +#define hvm_svm_enabled(v) \ + (!!((v)->arch.hvm_vcpu.guest_efer & EFER_SVME)) + +int nestedsvm_vmcb_map(struct vcpu *v, uint64_t vmcbaddr); +void nestedsvm_vmexit_defer(struct vcpu *v, + uint64_t exitcode, uint64_t exitinfo1, uint64_t exitinfo2); +enum nestedhvm_vmexits +nestedsvm_vmexit_n2n1(struct vcpu *v, struct cpu_user_regs *regs); +enum nestedhvm_vmexits +nestedsvm_check_intercepts(struct vcpu *v, struct cpu_user_regs *regs, + uint64_t exitcode); + +/* Interface methods */ +int nsvm_vcpu_destroy(struct vcpu *v); +int nsvm_vcpu_initialise(struct vcpu *v); +int nsvm_vcpu_reset(struct vcpu *v); +int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs); +int nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs); +int nsvm_vcpu_vmexit_inject(struct vcpu *v, struct cpu_user_regs *regs, + uint64_t exitcode); +int nsvm_vcpu_vmexit_trap(struct vcpu *v, unsigned int trapnr, + int errcode, unsigned long cr2); +uint64_t nsvm_vcpu_guestcr3(struct vcpu *v); +uint64_t nsvm_vcpu_hostcr3(struct vcpu *v); +uint32_t nsvm_vcpu_asid(struct vcpu *v); +int nsvm_vmcb_guest_intercepts_exitcode(struct vcpu *v, + struct cpu_user_regs 
*regs, uint64_t exitcode); +int nsvm_vmcb_guest_intercepts_trap(struct vcpu *v, unsigned int trapnr); +bool_t nsvm_vmcb_hap_enabled(struct vcpu *v); + +/* MSRs */ +int nsvm_rdmsr(struct vcpu *v, unsigned int msr, uint64_t *msr_content); +int nsvm_wrmsr(struct vcpu *v, unsigned int msr, uint64_t msr_content); + +#endif /* __ASM_X86_HVM_SVM_NESTEDSVM_H__ */ + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: nil + * End: + */ diff -r e842b80fcee0 -r 3df1f127bd4f xen/include/asm-x86/hvm/svm/svm.h --- a/xen/include/asm-x86/hvm/svm/svm.h +++ b/xen/include/asm-x86/hvm/svm/svm.h @@ -29,8 +29,6 @@ #include #include -void svm_dump_vmcb(const char *from, struct vmcb_struct *vmcb); - #define SVM_REG_EAX (0) #define SVM_REG_ECX (1) #define SVM_REG_EDX (2) @@ -62,6 +60,8 @@ static inline void svm_vmsave(void *vmcb : : "a" (__pa(vmcb)) : "memory" ); } +unsigned long *svm_msrbit(unsigned long *msr_bitmap, uint32_t msr); + extern u32 svm_feature_flags; #define SVM_FEATURE_NPT 0 /* Nested page table support */ @@ -82,4 +82,6 @@ extern u32 svm_feature_flags; #define cpu_has_svm_cleanbits cpu_has_svm_feature(SVM_FEATURE_VMCBCLEAN) #define cpu_has_pause_filter cpu_has_svm_feature(SVM_FEATURE_PAUSEFILTER) +#define SVM_PAUSEFILTER_INIT 3000 + #endif /* __ASM_X86_HVM_SVM_H__ */ diff -r e842b80fcee0 -r 3df1f127bd4f xen/include/asm-x86/hvm/svm/svmdebug.h --- /dev/null +++ b/xen/include/asm-x86/hvm/svm/svmdebug.h @@ -0,0 +1,30 @@ +/* + * svmdebug.h: SVM related debug definitions + * Copyright (c) 2011, AMD Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#ifndef __ASM_X86_HVM_SVM_SVMDEBUG_H__ +#define __ASM_X86_HVM_SVM_SVMDEBUG_H__ + +#include +#include + +void svm_vmcb_dump(const char *from, struct vmcb_struct *vmcb); +bool_t svm_vmcb_isvalid(const char *from, struct vmcb_struct *vmcb, + bool_t verbose); + +#endif /* __ASM_X86_HVM_SVM_SVMDEBUG_H__ */ diff -r e842b80fcee0 -r 3df1f127bd4f xen/include/asm-x86/hvm/svm/vmcb.h --- a/xen/include/asm-x86/hvm/svm/vmcb.h +++ b/xen/include/asm-x86/hvm/svm/vmcb.h @@ -398,6 +398,9 @@ typedef union } fields; } __attribute__ ((packed)) vmcbcleanbits_t; +#define IOPM_SIZE (12 * 1024) +#define MSRPM_SIZE (8 * 1024) + struct vmcb_struct { u32 _cr_intercepts; /* offset 0x00 - cleanbit 0 */ u32 _dr_intercepts; /* offset 0x04 - cleanbit 0 */ diff -r e842b80fcee0 -r 3df1f127bd4f xen/include/asm-x86/hvm/vcpu.h --- a/xen/include/asm-x86/hvm/vcpu.h +++ b/xen/include/asm-x86/hvm/vcpu.h @@ -25,6 +25,7 @@ #include #include #include +#include #include enum hvm_io_state { @@ -50,6 +51,7 @@ struct nestedvcpu { /* SVM/VMX arch specific */ union { + struct nestedsvm nsvm; } u; bool_t nv_flushp2m; /* True, when p2m table must be flushed */