# HG changeset patch
# User cegger
# Date 1275580729 -7200
Implement SVM specific part for Nested Virtualization.

This adds decoding support for the SVM instructions (VMRUN, VMLOAD, VMSAVE,
STGI, CLGI), the nsvm_* vCPU and VMCB handling used by the generic nested HVM
code, VMEXIT handling for the new intercepts in svm_vmexit_handler(), and the
nestedhvm_* hooks in the SVM hvm_function_table.

diff -r 9665f1bbdc20 -r f4ba5c5bc13d xen/arch/x86/hvm/svm/emulate.c
--- a/xen/arch/x86/hvm/svm/emulate.c
+++ b/xen/arch/x86/hvm/svm/emulate.c
@@ -100,6 +100,11 @@ MAKE_INSTR(VMCALL, 3, 0x0f, 0x01, 0xd9);
 MAKE_INSTR(HLT, 1, 0xf4);
 MAKE_INSTR(INT3, 1, 0xcc);
 MAKE_INSTR(RDTSC, 2, 0x0f, 0x31);
+MAKE_INSTR(VMRUN, 3, 0x0f, 0x01, 0xd8);
+MAKE_INSTR(VMLOAD, 3, 0x0f, 0x01, 0xda);
+MAKE_INSTR(VMSAVE, 3, 0x0f, 0x01, 0xdb);
+MAKE_INSTR(STGI, 3, 0x0f, 0x01, 0xdc);
+MAKE_INSTR(CLGI, 3, 0x0f, 0x01, 0xdd);
 
 static const u8 *opc_bytes[INSTR_MAX_COUNT] =
 {
@@ -111,7 +116,12 @@ static const u8 *opc_bytes[INSTR_MAX_COU
     [INSTR_VMCALL] = OPCODE_VMCALL,
     [INSTR_HLT]    = OPCODE_HLT,
     [INSTR_INT3]   = OPCODE_INT3,
-    [INSTR_RDTSC]  = OPCODE_RDTSC
+    [INSTR_RDTSC]  = OPCODE_RDTSC,
+    [INSTR_VMRUN]  = OPCODE_VMRUN,
+    [INSTR_VMLOAD] = OPCODE_VMLOAD,
+    [INSTR_VMSAVE] = OPCODE_VMSAVE,
+    [INSTR_STGI]   = OPCODE_STGI,
+    [INSTR_CLGI]   = OPCODE_CLGI,
 };
 
 static int fetch(struct vcpu *v, u8 *buf, unsigned long addr, int len)
diff -r 9665f1bbdc20 -r f4ba5c5bc13d xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -48,6 +48,7 @@
 #include
 #include
 #include
+#include <asm/hvm/nestedhvm.h>
 #include
 #include
 #include
@@ -325,7 +326,7 @@ static int svm_load_vmcb_ctxt(struct vcp
 {
     svm_load_cpu_state(v, ctxt);
     if (svm_vmcb_restore(v, ctxt)) {
-        printk("svm_vmcb restore failed!\n");
+        gdprintk(XENLOG_ERR, "svm_vmcb restore failed!\n");
         domain_crash(v->domain);
         return -EINVAL;
     }
@@ -692,8 +693,10 @@ static void svm_ctxt_switch_to(struct vc
 static void svm_do_resume(struct vcpu *v)
 {
     bool_t debug_state = v->domain->debugger_attached;
-
-    if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
+    bool_t guestmode = nestedhvm_vcpu_in_guestmode(v);
+
+    if ( !guestmode &&
+         unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
     {
         uint32_t mask = (1U << TRAP_debug) | (1U << TRAP_int3);
         v->arch.hvm_vcpu.debug_state_latch = debug_state;
@@ -712,11 +715,14 @@ static void svm_do_resume(struct vcpu *v
         hvm_asid_flush_vcpu(v);
     }
 
-    /* Reflect the vlapic's TPR in the hardware vtpr */
-    v->arch.hvm_svm.vmcb->vintr.fields.tpr =
-        (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
-
-    hvm_do_resume(v);
+    if ( !guestmode )
+    {
+        /* Reflect the vlapic's TPR in the hardware vtpr */
+        v->arch.hvm_svm.vmcb->vintr.fields.tpr =
+            (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
+
+        hvm_do_resume(v);
+    }
 
     reset_stack_and_jump(svm_asm_do_resume);
 }
@@ -857,6 +863,633 @@ static void svm_init_erratum_383(struct
     amd_erratum383_found = 1;
 }
 
+/*
+ * Nested SVM
+ */
+static int nsvm_vcpu_initialise(struct vcpu *v)
+{
+    void *msrpm;
+
+    ASSERT(VCPU_NESTEDHVM(v).nh_hostsave == NULL);
+    VCPU_NESTEDHVM(v).nh_hostsave = alloc_vmcb();
+    if (VCPU_NESTEDHVM(v).nh_hostsave == NULL)
+        goto err0;
+
+    msrpm = alloc_xenheap_pages(get_order_from_bytes(MSRPM_SIZE), 0);
+    VCPU_NESTEDHVM(v).nh_msrpm = msrpm;
+    if ( msrpm == NULL )
+        goto err1;
+    memset(msrpm, 0x0, MSRPM_SIZE);
+
+    return 0;
+
+err1:
+    free_vmcb(VCPU_NESTEDHVM(v).nh_hostsave);
+    VCPU_NESTEDHVM(v).nh_hostsave = NULL;
+err0:
+    return -ENOMEM;
+}
+
+static int nsvm_vcpu_destroy(struct vcpu *v)
+{
+    if (VCPU_NESTEDHVM(v).nh_hostsave) {
+        free_vmcb(VCPU_NESTEDHVM(v).nh_hostsave);
+        VCPU_NESTEDHVM(v).nh_hostsave = NULL;
+    }
+    if (VCPU_NESTEDHVM(v).nh_msrpm) {
+        free_xenheap_pages(VCPU_NESTEDHVM(v).nh_msrpm,
+                           get_order_from_bytes(MSRPM_SIZE));
+        VCPU_NESTEDHVM(v).nh_msrpm = NULL;
+    }
+
+    return 0;
+}
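nsvm_vcpu_initialise() above sizes the nested MSR permission map allocation
with get_order_from_bytes(MSRPM_SIZE), i.e. the smallest page order whose span
covers MSRPM_SIZE bytes. As a minimal standalone sketch of that order
computation (illustrative only, assuming 4 KiB pages; order_from_bytes() is a
hypothetical stand-in, not the Xen helper):

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)

    /* Smallest order such that (PAGE_SIZE << order) >= size. */
    unsigned int order_from_bytes(unsigned long size)
    {
        unsigned int order = 0;

        while ( (PAGE_SIZE << order) < size )
            order++;
        return order;
    }

    int main(void)
    {
        /* e.g. an 8 KiB bitmap needs order 1, i.e. two contiguous pages */
        printf("order(8192) = %u\n", order_from_bytes(8192));
        return 0;
    }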
+
+static int nsvm_vcpu_features(struct vcpu *v,
+    uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+    uint32_t dummy;
+
+    cpuid(0x8000000a, eax, ebx, ecx, &dummy);
+
+    *edx = 0;
+
+    if ( cpu_has_svm_lbrv )
+        *edx |= (1U << SVM_FEATURE_LBRV);
+#if 0 /* not yet implemented */
+    if ( cpu_has_svm_svml )
+        *edx |= (1U << SVM_FEATURE_SVML);
+#endif
+    if ( cpu_has_svm_nrips )
+        *edx |= (1U << SVM_FEATURE_NRIPS);
+    if ( cpu_has_pause_filter )
+        *edx |= (1U << SVM_FEATURE_PAUSEF);
+
+    return 0;
+}
+
+static void nsvm_vmcb_loadsave(struct vmcb_struct *from,
+    struct vmcb_struct *to)
+{
+    to->fs = from->fs;
+    to->gs = from->gs;
+    to->tr = from->tr;
+    to->ldtr = from->ldtr;
+    to->kerngsbase = from->kerngsbase;
+    to->star = from->star;
+    to->lstar = from->lstar;
+    to->cstar = from->cstar;
+    to->sfmask = from->sfmask;
+    to->sysenter_cs = from->sysenter_cs;
+    to->sysenter_esp = from->sysenter_esp;
+    to->sysenter_eip = from->sysenter_eip;
+}
+
+static int nsvm_vcpu_hostsave(struct vcpu *v, unsigned int inst_len)
+{
+    struct vmcb_struct *hsave, *vmcb;
+
+    hsave = VCPU_NESTEDHVM(v).nh_hostsave;
+    vmcb = v->arch.hvm_svm.vmcb;
+
+    memcpy(hsave, vmcb, sizeof(struct vmcb_struct));
+    hsave->rip += inst_len;
+
+    /* Remember the host interrupt flag */
+    if (hsave->rflags & X86_EFLAGS_IF)
+        VCPU_NESTEDHVM(v).nh_hostflags.fields.rflagsif = 1;
+    else
+        VCPU_NESTEDHVM(v).nh_hostflags.fields.rflagsif = 0;
+
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v))
+        hsave->cr3 = vmcb->cr3;
+    hsave->h_cr3 = vmcb->h_cr3;
+    if (paging_mode_hap(v->domain))
+        hsave->cr3 = vmcb->cr3;
+    else
+        hsave->cr3 = v->arch.hvm_vcpu.guest_cr[3];
+
+    hsave->efer = v->arch.hvm_vcpu.guest_efer;
+    hsave->cr0 = v->arch.hvm_vcpu.guest_cr[0];
+    hsave->cr2 = v->arch.hvm_vcpu.guest_cr[2];
+    hsave->cr4 = v->arch.hvm_vcpu.guest_cr[4];
+
+    return 0;
+}
+
+static int nsvm_vcpu_hostrestore(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    struct vmcb_struct *hsave, *vmcb;
+    int rc;
+
+    hsave = VCPU_NESTEDHVM(v).nh_hostsave;
+    vmcb = v->arch.hvm_svm.vmcb;
+
+    /* Must keep register values handled by VMSAVE/VMLOAD */
+    nsvm_vmcb_loadsave(vmcb, hsave);
+    memcpy(vmcb, hsave, sizeof(struct vmcb_struct));
+
+    /* EFER */
+    v->arch.hvm_vcpu.guest_efer = vmcb->efer;
+    rc = hvm_set_efer(vmcb->efer);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc);
+
+    /* CR4 */
+    v->arch.hvm_vcpu.guest_cr[4] = vmcb->cr4;
+    rc = hvm_set_cr4(vmcb->cr4);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc);
+
+    /* CR0 */
+    v->arch.hvm_vcpu.guest_cr[0] = vmcb->cr0 | X86_CR0_PE;
+    vmcb->rflags &= ~X86_EFLAGS_VM;
+    rc = hvm_set_cr0(vmcb->cr0 | X86_CR0_PE);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc);
+
+    /* CR2 */
+    v->arch.hvm_vcpu.guest_cr[2] = vmcb->cr2;
+    hvm_update_guest_cr(v, 2);
+
+    /* CR3 */
+    /* Nested paging mode */
+    if (nestedhvm_paging_mode_hap(v)) {
+        /* host nested paging + guest nested paging. */
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    } else {
+        /* host shadow paging + guest shadow paging. */
+
+        /* Reset MMU context -- XXX (hostrestore) not yet working */
+        if (!pagetable_is_null(v->arch.guest_table))
+            put_page(pagetable_get_page(v->arch.guest_table));
+        v->arch.guest_table = pagetable_null();
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+    }
+    rc = hvm_set_cr3(vmcb->cr3);
+    if (rc != X86EMUL_OKAY)
+        gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+
+    regs->eax = vmcb->rax;
+    regs->esp = vmcb->rsp;
+    regs->eip = vmcb->rip;
+    regs->eflags = vmcb->rflags;
+    vmcb->dr7 = 0; /* disable all breakpoints */
+    vmcb->cpl = 0;
+
+    /* Clear exitintinfo to prevent a fault loop of re-injecting
+     * exceptions forever.
+     */
+    vmcb->exitintinfo.bytes = 0;
+
+    hvm_asid_flush_vcpu(v);
+
+    return 0;
+}
+
+static int nsvm_vcpu_vmload(struct vcpu *v, uint64_t vmcbaddr)
+{
+    int ret = 0;
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    struct vmcb_struct tmp_vmcb;
+
+    ret = nestedhvm_vmcb_fromguest(&tmp_vmcb, vmcbaddr);
+    if (ret)
+        return ret;
+
+    nsvm_vmcb_loadsave(&tmp_vmcb, vmcb);
+    svm_vmload(vmcb);
+
+    return 0;
+}
+
+static int nsvm_vcpu_vmsave(struct vcpu *v, uint64_t vmcbaddr)
+{
+    int ret = 0;
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    struct vmcb_struct tmp_vmcb;
+
+    ret = nestedhvm_vmcb_fromguest(&tmp_vmcb, vmcbaddr);
+    if (ret)
+        return ret;
+
+    svm_vmsave(vmcb);
+    nsvm_vmcb_loadsave(vmcb, &tmp_vmcb);
+
+    ret = nestedhvm_vmcb_toguest(&tmp_vmcb, vmcbaddr);
+
+    return ret;
+}
+
+static int nsvm_vcpu_vmrun(struct vcpu *v, struct cpu_user_regs *regs,
+    unsigned int flags)
+{
+    struct vmcb_struct *ns_vmcb, *vmcb;
+
+    ns_vmcb = VCPU_NESTEDHVM(v).nh_vmcb;
+    vmcb = v->arch.hvm_svm.vmcb;
+
+    if (flags & NESTEDHVM_VMRUN_INTERCEPT) {
+        /* We are in a VMRUN intercept. */
+        regs->eax = ns_vmcb->rax;
+        regs->eip = ns_vmcb->rip;
+        regs->esp = ns_vmcb->rsp;
+        regs->eflags = ns_vmcb->rflags;
+        return 0;
+    }
+
+    if (flags & NESTEDHVM_VMRUN_VMEXIT) {
+        return 0;
+    }
+
+    return 1;
+}
+
+static int nsvm_vmrun_permissionmap(struct vcpu *v)
+{
+    struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
+    struct vmcb_struct *ns_vmcb = VCPU_NESTEDHVM(v).nh_vmcb;
+    struct vmcb_struct *host_vmcb = arch_svm->vmcb;
+    unsigned long *ns_msrpm_ptr;
+    unsigned int i;
+    enum hvm_copy_result ret;
+    /* Must be static, or hvm_copy_from_guest_phys() fails. */
+    static uint8_t ns_msrpm[MSRPM_SIZE] __attribute__((__aligned__));
+    static DEFINE_SPINLOCK(ns_pmlock);
+
+    spin_lock(&ns_pmlock);
+    ns_msrpm_ptr = (unsigned long *)ns_msrpm;
+
+    ret = hvm_copy_from_guest_phys(ns_msrpm,
+        ns_vmcb->msrpm_base_pa, MSRPM_SIZE);
+    if (ret != HVMCOPY_okay) {
+        spin_unlock(&ns_pmlock);
+        gdprintk(XENLOG_ERR, "hvm_copy_from_guest_phys msrpm %u\n", ret);
+        return 1;
+    }
+
+    /* Skip the io bitmap merge since hvm_io_bitmap has all bits set
+     * except for ports 0x80 and 0xed.
+     */
+
+    /* v->arch.hvm_svm.msrpm has type unsigned long, thus
+     * BYTES_PER_LONG.
+     */
+    for (i = 0; i < MSRPM_SIZE / BYTES_PER_LONG; i++)
+        VCPU_NESTEDHVM(v).nh_msrpm[i] = arch_svm->msrpm[i] | ns_msrpm_ptr[i];
+    spin_unlock(&ns_pmlock);
+
+    host_vmcb->iopm_base_pa =
+        (uint64_t)virt_to_maddr(hvm_io_bitmap);
+    host_vmcb->msrpm_base_pa =
+        (uint64_t)virt_to_maddr(VCPU_NESTEDHVM(v).nh_msrpm);
+
+    return 0;
+}
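nsvm_vmrun_permissionmap() above merges Xen's own MSR intercept bitmap with
the one provided by the L1 guest, so an MSR access is intercepted whenever
either Xen or the guest hypervisor asks for it. A minimal, self-contained
sketch of just that merge step (hypothetical buffer names and an assumed
8 KiB map size; the real code operates on the VMCB-referenced bitmaps as
shown above):

    #include <stddef.h>

    #define MSRPM_BYTES 8192   /* assumed size of an SVM MSR permission map */

    /* A set bit requests an intercept; OR-ing keeps the stricter policy. */
    void merge_msrpm(const unsigned long *host, const unsigned long *guest,
                     unsigned long *merged)
    {
        size_t i;

        for ( i = 0; i < MSRPM_BYTES / sizeof(unsigned long); i++ )
            merged[i] = host[i] | guest[i];
    }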
+ */ + for (i = 0; i < MSRPM_SIZE / BYTES_PER_LONG; i++) + VCPU_NESTEDHVM(v).nh_msrpm[i] = arch_svm->msrpm[i] | ns_msrpm_ptr[i]; + spin_unlock(&ns_pmlock); + + host_vmcb->iopm_base_pa = + (uint64_t)virt_to_maddr(hvm_io_bitmap); + host_vmcb->msrpm_base_pa = + (uint64_t)virt_to_maddr(VCPU_NESTEDHVM(v).nh_msrpm); + + return 0; +} + +static int nsvm_vmcb_prepare4vmrun(struct vcpu *v) +{ + struct vmcb_struct *ns_vmcb = VCPU_NESTEDHVM(v).nh_vmcb; + struct vmcb_struct *host_vmcb = v->arch.hvm_svm.vmcb; + int rc; + + /* Enable nested guest intercepts */ + VCPU_NESTEDHVM(v).nh_cr_intercepts = ns_vmcb->cr_intercepts; + VCPU_NESTEDHVM(v).nh_dr_intercepts = ns_vmcb->dr_intercepts; + VCPU_NESTEDHVM(v).nh_exception_intercepts = ns_vmcb->exception_intercepts; + VCPU_NESTEDHVM(v).nh_general1_intercepts = ns_vmcb->general1_intercepts; + VCPU_NESTEDHVM(v).nh_general2_intercepts = ns_vmcb->general2_intercepts; + + host_vmcb->cr_intercepts |= ns_vmcb->cr_intercepts; + host_vmcb->dr_intercepts |= ns_vmcb->dr_intercepts; + host_vmcb->exception_intercepts |= ns_vmcb->exception_intercepts; + host_vmcb->general1_intercepts |= ns_vmcb->general1_intercepts; + host_vmcb->general2_intercepts |= ns_vmcb->general2_intercepts; + + /* Nested Pause Filter */ + host_vmcb->pause_filter_count = ns_vmcb->pause_filter_count; + + /* Nested IO permission bitmaps */ + rc = nsvm_vmrun_permissionmap(v); + if (rc) + return rc; + + /* TSC offset */ + hvm_set_guest_tsc(v, host_vmcb->tsc_offset + ns_vmcb->tsc_offset); + + /* ASID */ + hvm_asid_flush_vcpu(v); + /* host_vmcb->guest_asid = ns_vmcb->guest_asid; */ + + /* TLB control */ + host_vmcb->tlb_control |= ns_vmcb->tlb_control; + + /* Virtual Interrupts */ + host_vmcb->vintr = ns_vmcb->vintr; + host_vmcb->vintr.fields.intr_masking = 1; + + /* Shadow Mode */ + host_vmcb->interrupt_shadow = ns_vmcb->interrupt_shadow; + + /* Exit codes */ + host_vmcb->exitcode = ns_vmcb->exitcode; + host_vmcb->exitinfo1 = ns_vmcb->exitinfo1; + host_vmcb->exitinfo2 = ns_vmcb->exitinfo2; + host_vmcb->exitintinfo = ns_vmcb->exitintinfo; + + /* Pending Interrupts */ + host_vmcb->eventinj = ns_vmcb->eventinj; + + /* LBR virtualization */ + VCPU_NESTEDHVM(v).nh_lbr_control = ns_vmcb->lbr_control; + host_vmcb->lbr_control.bytes |= ns_vmcb->lbr_control.bytes; + + /* NextRIP */ + host_vmcb->nextrip = ns_vmcb->nextrip; + + /* + * VMCB Save State Area + */ + + /* Segments */ + host_vmcb->es = ns_vmcb->es; + host_vmcb->cs = ns_vmcb->cs; + host_vmcb->ss = ns_vmcb->ss; + host_vmcb->ds = ns_vmcb->ds; + host_vmcb->gdtr = ns_vmcb->gdtr; + host_vmcb->idtr = ns_vmcb->idtr; + + /* CPL */ + host_vmcb->cpl = ns_vmcb->cpl; + + /* EFER */ + v->arch.hvm_vcpu.guest_efer = ns_vmcb->efer; + rc = hvm_set_efer(ns_vmcb->efer); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_efer failed, rc: %u\n", rc); + + /* CR4 */ + v->arch.hvm_vcpu.guest_cr[4] = ns_vmcb->cr4; + rc = hvm_set_cr4(ns_vmcb->cr4); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_cr4 failed, rc: %u\n", rc); + + /* CR0 */ + v->arch.hvm_vcpu.guest_cr[0] = ns_vmcb->cr0; + rc = hvm_set_cr0(ns_vmcb->cr0); + if (rc != X86EMUL_OKAY) + gdprintk(XENLOG_ERR, "hvm_set_cr0 failed, rc: %u\n", rc); + + /* CR2 */ + v->arch.hvm_vcpu.guest_cr[2] = ns_vmcb->cr2; + hvm_update_guest_cr(v, 2); + + /* Nested paging mode */ + if (nestedhvm_paging_mode_hap(v)) { + /* host nested paging + guest nested paging. */ + + /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. 
+        rc = hvm_set_cr3(ns_vmcb->cr3);
+        if (rc != X86EMUL_OKAY)
+            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+    } else if (paging_mode_hap(v->domain)) {
+        /* host nested paging + guest shadow paging. */
+        host_vmcb->np_enable = 1;
+        /* Keep h_cr3 as it is. */
+        /* Guest shadow paging: Must intercept pagefaults. */
+        host_vmcb->exception_intercepts |= (1U << TRAP_page_fault);
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+        rc = hvm_set_cr3(ns_vmcb->cr3);
+        if (rc != X86EMUL_OKAY)
+            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+    } else {
+        /* host shadow paging + guest shadow paging. */
+        host_vmcb->np_enable = 0;
+        host_vmcb->h_cr3 = 0x0;
+
+#if 0
+        host_vmcb->cr3 = v->shadow_shadow_table;
+
+        /* hvm_set_cr3() below sets v->arch.hvm_vcpu.guest_cr[3] for us. */
+        rc = hvm_set_cr3(ns_vmcb->cr3);
+        if (rc != X86EMUL_OKAY)
+            gdprintk(XENLOG_ERR, "hvm_set_cr3 failed, rc: %u\n", rc);
+#endif
+    }
+
+    /* DRn */
+    host_vmcb->dr7 = ns_vmcb->dr7;
+    host_vmcb->dr6 = ns_vmcb->dr6;
+
+    /* RFLAGS */
+    host_vmcb->rflags = ns_vmcb->rflags;
+
+    /* RIP */
+    host_vmcb->rip = ns_vmcb->rip;
+
+    /* RSP */
+    host_vmcb->rsp = ns_vmcb->rsp;
+
+    /* RAX */
+    host_vmcb->rax = ns_vmcb->rax;
+
+    /* Keep the host values of the fs, gs, ldtr, tr, kerngsbase,
+     * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp,
+     * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation.
+     */
+
+    /* Page tables */
+    host_vmcb->pdpe0 = ns_vmcb->pdpe0;
+    host_vmcb->pdpe1 = ns_vmcb->pdpe1;
+    host_vmcb->pdpe2 = ns_vmcb->pdpe2;
+    host_vmcb->pdpe3 = ns_vmcb->pdpe3;
+
+    /* PAT */
+    host_vmcb->g_pat = ns_vmcb->g_pat;
+
+    /* Debug Control MSR */
+    host_vmcb->debugctlmsr = ns_vmcb->debugctlmsr;
+
+    /* LBR MSRs */
+    host_vmcb->lastbranchfromip = ns_vmcb->lastbranchfromip;
+    host_vmcb->lastbranchtoip = ns_vmcb->lastbranchtoip;
+    host_vmcb->lastintfromip = ns_vmcb->lastintfromip;
+    host_vmcb->lastinttoip = ns_vmcb->lastinttoip;
+
+    return 0;
+}
+
+static int nsvm_vmcb_prepare4vmexit(struct vcpu *v)
+{
+    struct vmcb_struct *vmcb, *ns_vmcb;
+
+    vmcb = v->arch.hvm_svm.vmcb;
+    ns_vmcb = VCPU_NESTEDHVM(v).nh_vmcb;
+
+    svm_vmsave(vmcb);
+
+    /* Intercepts */
+    /* Copy cached intercepts since they are the guest's original
+     * intercepts.
+     */
+    ns_vmcb->cr_intercepts = VCPU_NESTEDHVM(v).nh_cr_intercepts;
+    ns_vmcb->dr_intercepts = VCPU_NESTEDHVM(v).nh_dr_intercepts;
+    ns_vmcb->exception_intercepts = VCPU_NESTEDHVM(v).nh_exception_intercepts;
+    ns_vmcb->general1_intercepts = VCPU_NESTEDHVM(v).nh_general1_intercepts;
+    ns_vmcb->general2_intercepts = VCPU_NESTEDHVM(v).nh_general2_intercepts;
+
+    /* Nested Pause Filter */
+    ns_vmcb->pause_filter_count = vmcb->pause_filter_count;
+
+    /* Nested IO permission bitmap */
+    /* Just keep the iopm_base_pa and msrpm_base_pa values.
+     * The guest must not see the virtualized values.
+ */ + + /* TSC offset */ + ns_vmcb->tsc_offset = vmcb->tsc_offset; + + /* ASID */ + /* ns_vmcb->guest_asid = vmcb->guest_asid; */ + + /* TLB control */ + ns_vmcb->tlb_control = 0; + + /* Virtual Interrupts */ + ns_vmcb->vintr = vmcb->vintr; + if (!(VCPU_NESTEDHVM(v).nh_hostflags.fields.vintrmask)) + ns_vmcb->vintr.fields.intr_masking = 0; + + /* Shadow mode */ + ns_vmcb->interrupt_shadow = vmcb->interrupt_shadow; + + /* Exit codes */ + ns_vmcb->exitcode = vmcb->exitcode; + ns_vmcb->exitinfo1 = vmcb->exitinfo1; + ns_vmcb->exitinfo2 = vmcb->exitinfo2; + ns_vmcb->exitintinfo = vmcb->exitintinfo; + + /* Interrupts */ + /* If we emulate a VMRUN/#VMEXIT in the same host #VMEXIT cycle we have + * to make sure that we do not lose injected events. So check eventinj + * here and copy it to exitintinfo if it is valid. + * exitintinfo and eventinj can't be both valid because the case below + * only happens on a VMRUN instruction intercept which has no valid + * exitintinfo set. + */ + if ( unlikely(vmcb->eventinj.fields.v) && + hvm_event_needs_reinjection(vmcb->eventinj.fields.type, + vmcb->eventinj.fields.vector) ) + { + ns_vmcb->exitintinfo = vmcb->eventinj; + } + + ns_vmcb->eventinj.bytes = 0; + + /* Nested paging mode */ + if (nestedhvm_paging_mode_hap(v)) { + /* host nested paging + guest nested paging. */ + ns_vmcb->np_enable = vmcb->np_enable; + ns_vmcb->cr3 = vmcb->cr3; + /* The vmcb->h_cr3 is the shadowed h_cr3. The original + * unshadowed guest h_cr3 is kept in ns_vmcb->h_cr3, + * hence we keep the ns_vmcb->h_cr3 value. */ + } else if (paging_mode_hap(v->domain)) { + /* host nested paging + guest shadow paging. */ + ns_vmcb->np_enable = 0; + /* Throw h_cr3 away. Guest is not allowed to set it or + * it can break out, otherwise (security hole!) */ + ns_vmcb->h_cr3 = 0x0; + /* Stop intercepting #PF (already done above + * by restoring cached intercepts). */ + ns_vmcb->cr3 = vmcb->cr3; + } else { + /* host shadow paging + guest shadow paging. */ + ns_vmcb->np_enable = 0; + ns_vmcb->h_cr3 = 0x0; + /* The vmcb->cr3 is the shadowed cr3. The original + * unshadowed guest cr3 is kept in ns_vmcb->cr3, + * hence we keep the ns_vmcb->cr3 value. */ + } + + /* LBR virtualization */ + ns_vmcb->lbr_control = VCPU_NESTEDHVM(v).nh_lbr_control; + + /* NextRIP */ + ns_vmcb->nextrip = vmcb->nextrip; + + /* + * VMCB Save State Area + */ + + /* Segments */ + ns_vmcb->es = vmcb->es; + ns_vmcb->cs = vmcb->cs; + ns_vmcb->ss = vmcb->ss; + ns_vmcb->ds = vmcb->ds; + ns_vmcb->gdtr = vmcb->gdtr; + ns_vmcb->idtr = vmcb->idtr; + + /* CPL */ + ns_vmcb->cpl = vmcb->cpl; + + /* EFER */ + ns_vmcb->efer = vmcb->efer; + + /* CRn */ + ns_vmcb->cr4 = vmcb->cr4; + ns_vmcb->cr0 = vmcb->cr0; + + /* DRn */ + ns_vmcb->dr7 = vmcb->dr7; + ns_vmcb->dr6 = vmcb->dr6; + + /* RFLAGS */ + ns_vmcb->rflags = vmcb->rflags; + + /* RIP */ + ns_vmcb->rip = vmcb->rip; + + /* RSP */ + ns_vmcb->rsp = vmcb->rsp; + + /* RAX */ + ns_vmcb->rax = vmcb->rax; + + /* Keep the nested guest values of the fs, gs, ldtr, tr, kerngsbase, + * star, lstar, cstar, sfmask, sysenter_cs, sysenter_esp, + * sysenter_eip. These are handled via VMSAVE/VMLOAD emulation. 
+ */ + + /* CR2 */ + ns_vmcb->cr2 = vmcb->cr2; + + /* Page tables */ + ns_vmcb->pdpe0 = vmcb->pdpe0; + ns_vmcb->pdpe1 = vmcb->pdpe1; + ns_vmcb->pdpe2 = vmcb->pdpe2; + ns_vmcb->pdpe3 = vmcb->pdpe3; + + /* PAT */ + ns_vmcb->g_pat = vmcb->g_pat; + + /* Debug Control MSR */ + ns_vmcb->debugctlmsr = vmcb->debugctlmsr; + + /* LBR MSRs */ + ns_vmcb->lastbranchfromip = vmcb->lastbranchfromip; + ns_vmcb->lastbranchtoip = vmcb->lastbranchtoip; + ns_vmcb->lastintfromip = vmcb->lastintfromip; + ns_vmcb->lastinttoip = vmcb->lastinttoip; + + return 0; +} + + static int svm_cpu_up(void) { u32 eax, edx, phys_hsa_lo, phys_hsa_hi; @@ -952,8 +1585,8 @@ static void svm_do_nested_pgfault(paddr_ struct { uint64_t gpa; uint64_t mfn; - u32 qualification; - u32 p2mt; + uint32_t qualification; + uint32_t p2mt; } _d; _d.gpa = gpa; @@ -1271,6 +1904,102 @@ static void svm_vmexit_do_rdtsc(struct c hvm_rdtsc_intercept(regs); } +static void svm_vmexit_do_vmrun(struct cpu_user_regs *regs, + struct vcpu *v, uint64_t vmcbaddr) +{ + int ret; + unsigned int inst_len; + + if ( (inst_len = __get_instruction_length(current, INSTR_VMRUN)) == 0 ) + return; + + ret = nestedhvm_vcpu_vmrun(v, regs, vmcbaddr, inst_len); + if (ret) + /* On failure, nestedhvm_vcpu_vmrun injected an exception, + * almost a #GP or #UD. + */ + return; +} + +static void svm_vmexit_do_vmload(struct cpu_user_regs *regs, + struct vcpu *v, uint64_t vmcbaddr) +{ + int ret; + unsigned int inst_len; + + if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 ) + return; + + ret = nestedhvm_vcpu_vmload(v, vmcbaddr); + if (ret) + /* On failure, nestedhvm_vcpu_vmload injected an exception, + * almost a #GP or #UD. + */ + return; + + __update_guest_eip(regs, inst_len); +} + +static void svm_vmexit_do_vmsave(struct cpu_user_regs *regs, + struct vcpu *v, uint64_t vmcbaddr) +{ + int ret; + unsigned int inst_len; + + if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 ) + return; + + ret = nestedhvm_vcpu_vmsave(v, vmcbaddr); + if (ret) + /* On failure, nestedhvm_vcpu_vmsave injected an exception, + * almost a #GP or #UD. + */ + return; + + __update_guest_eip(regs, inst_len); +} + +static void svm_vmexit_do_clgi(struct cpu_user_regs *regs, struct vcpu *v) +{ + int ret; + struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; + unsigned int inst_len; + + if ( (inst_len = __get_instruction_length(v, INSTR_CLGI)) == 0 ) + return; + + ret = nestedhvm_vcpu_clgi(v); + if (ret) + /* On failure, nestedhvm_vcpu_clgi injected an exception, + * almost a #GP or #UD. + */ + return; + + /* After a CLGI no interrupts should come */ + vmcb->vintr.fields.irq = 0; + vmcb->general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR; + + __update_guest_eip(regs, inst_len); +} + +static void svm_vmexit_do_stgi(struct cpu_user_regs *regs, struct vcpu *v) +{ + int ret; + unsigned int inst_len; + + if ( (inst_len = __get_instruction_length(v, INSTR_STGI)) == 0 ) + return; + + ret = nestedhvm_vcpu_stgi(v); + if (ret) + /* On failure, nestedhvm_vcpu_stgi injected an exception, + * almost a #GP or #UD. 
+ */ + return; + + __update_guest_eip(regs, inst_len); +} + static void svm_vmexit_ud_intercept(struct cpu_user_regs *regs) { struct hvm_emulate_ctxt ctxt; @@ -1418,20 +2147,35 @@ static struct hvm_function_table __read_ .msr_read_intercept = svm_msr_read_intercept, .msr_write_intercept = svm_msr_write_intercept, .invlpg_intercept = svm_invlpg_intercept, - .set_rdtsc_exiting = svm_set_rdtsc_exiting + .set_rdtsc_exiting = svm_set_rdtsc_exiting, + + .nestedhvm_vcpu_initialise = nsvm_vcpu_initialise, + .nestedhvm_vcpu_destroy = nsvm_vcpu_destroy, + .nestedhvm_vcpu_features = nsvm_vcpu_features, + .nestedhvm_vcpu_hostsave = nsvm_vcpu_hostsave, + .nestedhvm_vcpu_hostrestore = nsvm_vcpu_hostrestore, + .nestedhvm_vcpu_vmsave = nsvm_vcpu_vmsave, + .nestedhvm_vcpu_vmload = nsvm_vcpu_vmload, + .nestedhvm_vcpu_vmrun = nsvm_vcpu_vmrun, + .nestedhvm_vmcb_prepare4vmrun = nsvm_vmcb_prepare4vmrun, + .nestedhvm_vmcb_prepare4vmexit = nsvm_vmcb_prepare4vmexit, }; asmlinkage void svm_vmexit_handler(struct cpu_user_regs *regs) { - unsigned int exit_reason; + uint64_t exit_reason; struct vcpu *v = current; struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb; eventinj_t eventinj; int inst_len, rc; + bool_t vcpu_guestmode = 0; if ( paging_mode_hap(v->domain) ) v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] = vmcb->cr3; + if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) ) + vcpu_guestmode = 1; + /* * Before doing anything else, we need to sync up the VLAPIC's TPR with * SVM's vTPR. It's OK if the guest doesn't touch CR8 (e.g. 32-bit Windows) @@ -1439,12 +2183,45 @@ asmlinkage void svm_vmexit_handler(struc * NB. We need to preserve the low bits of the TPR to make checked builds * of Windows work, even though they don't actually do anything. */ - vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI, - ((vmcb->vintr.fields.tpr & 0x0F) << 4) | - (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0x0F)); + if ( !vcpu_guestmode ) { + vlapic_set_reg(vcpu_vlapic(v), APIC_TASKPRI, + ((vmcb->vintr.fields.tpr & 0x0F) << 4) | + (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0x0F)); + } exit_reason = vmcb->exitcode; + if ( vcpu_guestmode ) { + enum nestedhvm_vmexits nsret; + uint64_t exitcode; + + if (VCPU_NESTEDHVM(v).nh_hostflags.fields.forcevmexit) + exitcode = VCPU_NESTEDHVM(v).nh_forcevmexit_exitcode; + else + exitcode = vmcb->exitcode; + + nsret = nestedhvm_vcpu_vmexit(v, regs, exitcode); + VCPU_NESTEDHVM(v).nh_hostflags.fields.forcevmexit = 0; + switch (nsret) { + case NESTEDHVM_VMEXIT_DONE: + goto out; + case NESTEDHVM_VMEXIT_ERROR: + gdprintk(XENLOG_ERR, + "nestedhvm_vcpu_vmexit() returned NESTEDHVM_VMEXIT_ERROR\n"); + goto out; + case NESTEDHVM_VMEXIT_HOST: + case NESTEDHVM_VMEXIT_CONTINUE: + break; + case NESTEDHVM_VMEXIT_FATALERROR: + gdprintk(XENLOG_ERR, "unexpected nestedhvm error\n"); + goto exit_and_crash; + default: + gdprintk(XENLOG_INFO, "nestedhvm_vcpu_vmexit returned %i\n", + nsret); + goto exit_and_crash; + } + } + if ( hvm_long_mode_enabled(v) ) HVMTRACE_ND(VMEXIT64, 1/*cycles*/, 3, exit_reason, (uint32_t)regs->eip, (uint32_t)((uint64_t)regs->eip >> 32), @@ -1532,6 +2309,27 @@ asmlinkage void svm_vmexit_handler(struc break; } + if ( vcpu_guestmode && !nestedhvm_paging_mode_hap(v) ) + { + enum nestedhvm_vmexits nsret; + + VCPU_NESTEDHVM(v).nh_forcevmexit_exitcode = VMEXIT_EXCEPTION_PF; + VCPU_NESTEDHVM(v).nh_hostflags.fields.forcevmexit = 1; + nsret = nestedhvm_vcpu_vmexit(v, regs, VMEXIT_EXCEPTION_PF); + VCPU_NESTEDHVM(v).nh_hostflags.fields.forcevmexit = 0; + switch (nsret) { + case 
+            case NESTEDHVM_VMEXIT_DONE:
+            case NESTEDHVM_VMEXIT_ERROR:
+                goto out;
+            case NESTEDHVM_VMEXIT_HOST:
+            case NESTEDHVM_VMEXIT_CONTINUE:
+            case NESTEDHVM_VMEXIT_FATALERROR:
+            default:
+                gdprintk(XENLOG_ERR, "unexpected nestedhvm error %i\n", nsret);
+                goto exit_and_crash;
+            }
+        }
+
         hvm_inject_exception(TRAP_page_fault, regs->error_code, va);
         break;
     }
@@ -1606,6 +2404,12 @@ asmlinkage void svm_vmexit_handler(struc
     case VMEXIT_VMMCALL:
        if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
            break;
+        if ( vcpu_guestmode ) {
+            VCPU_NESTEDHVM(v).nh_vmcbaddr = vmcb->rax;
+            nestedhvm_vcpu_vmexit(v, regs, VMEXIT_VMMCALL);
+            __update_guest_eip(regs, inst_len);
+            break;
+        }
         HVMTRACE_1D(VMMCALL, regs->eax);
         rc = hvm_do_hypercall(regs);
         if ( rc != HVM_HCALL_preempted )
@@ -1638,11 +2442,27 @@ asmlinkage void svm_vmexit_handler(struc
 
     case VMEXIT_MONITOR:
     case VMEXIT_MWAIT:
+        hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
+        break;
+
     case VMEXIT_VMRUN:
+        svm_vmexit_do_vmrun(regs, v,
+            regs->eax);
+        break;
     case VMEXIT_VMLOAD:
+        svm_vmexit_do_vmload(regs, v,
+            regs->eax);
+        break;
     case VMEXIT_VMSAVE:
+        svm_vmexit_do_vmsave(regs, v,
+            regs->eax);
+        break;
     case VMEXIT_STGI:
+        svm_vmexit_do_stgi(regs, v);
+        break;
     case VMEXIT_CLGI:
+        svm_vmexit_do_clgi(regs, v);
+        break;
     case VMEXIT_SKINIT:
         hvm_inject_exception(TRAP_invalid_op, HVM_DELIVER_NO_ERROR_CODE, 0);
         break;
@@ -1677,7 +2497,7 @@ asmlinkage void svm_vmexit_handler(struc
 
     default:
     exit_and_crash:
-        gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%x, "
+        gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = 0x%"PRIx64", "
                  "exitinfo1 = %"PRIx64", exitinfo2 = %"PRIx64"\n",
                  exit_reason, (u64)vmcb->exitinfo1, (u64)vmcb->exitinfo2);
 
@@ -1685,6 +2505,11 @@ asmlinkage void svm_vmexit_handler(struc
         break;
     }
 
+ out:
+    if ( vcpu_guestmode )
+        /* Don't clobber TPR of the nested guest. */
+        return;
+
     /* The exit may have updated the TPR: reflect this in the hardware vtpr */
     vmcb->vintr.fields.tpr =
         (vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xFF) >> 4;
diff -r 9665f1bbdc20 -r f4ba5c5bc13d xen/include/asm-x86/hvm/svm/emulate.h
--- a/xen/include/asm-x86/hvm/svm/emulate.h
+++ b/xen/include/asm-x86/hvm/svm/emulate.h
@@ -31,6 +31,11 @@ enum instruction_index {
     INSTR_HLT,
     INSTR_INT3,
     INSTR_RDTSC,
+    INSTR_VMRUN,
+    INSTR_VMLOAD,
+    INSTR_VMSAVE,
+    INSTR_STGI,
+    INSTR_CLGI,
     INSTR_MAX_COUNT /* Must be last - Number of instructions supported */
 };
 
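For reference, the five instructions wired into the decoder above all share
the two-byte 0x0f 0x01 opcode and differ only in the final byte (VMRUN 0xd8,
VMLOAD 0xda, VMSAVE 0xdb, STGI 0xdc, CLGI 0xdd), exactly as listed in the
MAKE_INSTR() additions to emulate.c. A minimal sketch of matching such a
pattern against fetched instruction bytes (hypothetical helper, not the Xen
decoder):

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    /* Byte patterns as listed in the MAKE_INSTR() additions above. */
    static const uint8_t opc_vmrun[]  = { 0x0f, 0x01, 0xd8 };
    static const uint8_t opc_vmload[] = { 0x0f, 0x01, 0xda };

    /* Return true if the fetched bytes begin with the given opcode pattern. */
    bool match_opcode(const uint8_t *fetched, size_t fetched_len,
                      const uint8_t *opc, size_t opc_len)
    {
        return fetched_len >= opc_len && memcmp(fetched, opc, opc_len) == 0;
    }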