diff -r 5a60eb7fad79 xen/arch/x86/hvm/emulate.c
--- a/xen/arch/x86/hvm/emulate.c Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/arch/x86/hvm/emulate.c Mon Apr 06 11:01:42 2009 +0200
@@ -942,10 +942,10 @@ static struct x86_emulate_ops hvm_emulat
 };
 
 int hvm_emulate_one(
+    struct vcpu *curr,
     struct hvm_emulate_ctxt *hvmemul_ctxt)
 {
     struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
-    struct vcpu *curr = current;
     uint32_t new_intr_shadow, pfec = PFEC_page_present;
     unsigned long addr;
     int rc;
@@ -1018,10 +1018,11 @@ int hvm_emulate_one(
 }
 
 void hvm_emulate_prepare(
+    struct vcpu *v,
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs)
 {
-    hvmemul_ctxt->intr_shadow = hvm_funcs.get_interrupt_shadow(current);
+    hvmemul_ctxt->intr_shadow = hvm_funcs.get_interrupt_shadow(v);
     hvmemul_ctxt->ctxt.regs = regs;
     hvmemul_ctxt->ctxt.force_writeback = 1;
     hvmemul_ctxt->seg_reg_accessed = 0;
@@ -1030,17 +1031,19 @@ void hvm_emulate_prepare(
     hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt);
 }
 
-void hvm_emulate_writeback(
+void hvm_emulate_writeback(struct vcpu *v,
     struct hvm_emulate_ctxt *hvmemul_ctxt)
 {
     enum x86_segment seg;
 
+    ASSERT((v == current) || !vcpu_runnable(v));
+
     seg = find_first_bit(&hvmemul_ctxt->seg_reg_dirty,
                          ARRAY_SIZE(hvmemul_ctxt->seg_reg));
 
     while ( seg < ARRAY_SIZE(hvmemul_ctxt->seg_reg) )
     {
-        hvm_set_segment_register(current, seg, &hvmemul_ctxt->seg_reg[seg]);
+        hvm_set_segment_register(v, seg, &hvmemul_ctxt->seg_reg[seg]);
         seg = find_next_bit(&hvmemul_ctxt->seg_reg_dirty,
                             ARRAY_SIZE(hvmemul_ctxt->seg_reg),
                             seg+1);
diff -r 5a60eb7fad79 xen/arch/x86/hvm/io.c
--- a/xen/arch/x86/hvm/io.c Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/arch/x86/hvm/io.c Mon Apr 06 11:01:42 2009 +0200
@@ -177,9 +177,9 @@ int handle_mmio(void)
     struct vcpu *curr = current;
     int rc;
 
-    hvm_emulate_prepare(&ctxt, guest_cpu_user_regs());
+    hvm_emulate_prepare(curr, &ctxt, guest_cpu_user_regs());
 
-    rc = hvm_emulate_one(&ctxt);
+    rc = hvm_emulate_one(curr, &ctxt);
 
     if ( curr->arch.hvm_vcpu.io_state == HVMIO_awaiting_completion )
         curr->arch.hvm_vcpu.io_state = HVMIO_handle_mmio_awaiting_completion;
@@ -206,7 +206,7 @@ int handle_mmio(void)
         break;
     }
 
-    hvm_emulate_writeback(&ctxt);
+    hvm_emulate_writeback(curr, &ctxt);
 
     return 1;
 }
diff -r 5a60eb7fad79 xen/arch/x86/hvm/svm/svm.c
--- a/xen/arch/x86/hvm/svm/svm.c Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/arch/x86/hvm/svm/svm.c Mon Apr 06 11:01:42 2009 +0200
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -199,9 +200,9 @@ static int svm_vmcb_save(struct vcpu *v,
     c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
     c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
 
-    c->sysenter_cs = vmcb->sysenter_cs;
-    c->sysenter_esp = vmcb->sysenter_esp;
-    c->sysenter_eip = vmcb->sysenter_eip;
+    c->sysenter_cs = v->arch.hvm_vcpu.guest_sysenter_cs;
+    c->sysenter_esp = v->arch.hvm_vcpu.guest_sysenter_esp;
+    c->sysenter_eip = v->arch.hvm_vcpu.guest_sysenter_eip;
 
     c->pending_event = 0;
     c->error_code = 0;
@@ -258,9 +259,9 @@ static int svm_vmcb_restore(struct vcpu
     svm_update_guest_cr(v, 2);
     svm_update_guest_cr(v, 4);
 
-    vmcb->sysenter_cs = c->sysenter_cs;
-    vmcb->sysenter_esp = c->sysenter_esp;
-    vmcb->sysenter_eip = c->sysenter_eip;
+    v->arch.hvm_vcpu.guest_sysenter_cs = c->sysenter_cs;
+    v->arch.hvm_vcpu.guest_sysenter_esp = c->sysenter_esp;
+    v->arch.hvm_vcpu.guest_sysenter_eip = c->sysenter_eip;
 
     if ( paging_mode_hap(v->domain) )
     {
@@ -286,7 +287,7 @@ static int svm_vmcb_restore(struct vcpu
 
     return 0;
 }
- 
+
 static void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
 {
     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
@@ -985,6 +986,16 @@ static int svm_msr_read_intercept(struct
         msr_content = v->arch.hvm_vcpu.guest_efer;
         break;
 
+    case MSR_IA32_SYSENTER_CS:
+        msr_content = v->arch.hvm_vcpu.guest_sysenter_cs;
+        break;
+    case MSR_IA32_SYSENTER_ESP:
+        msr_content = v->arch.hvm_vcpu.guest_sysenter_esp;
+        break;
+    case MSR_IA32_SYSENTER_EIP:
+        msr_content = v->arch.hvm_vcpu.guest_sysenter_eip;
+        break;
+
     case MSR_IA32_MC4_MISC: /* Threshold register */
     case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
         /*
@@ -1067,6 +1078,16 @@ static int svm_msr_write_intercept(struc
     case MSR_K8_VM_HSAVE_PA:
         goto gpf;
 
+    case MSR_IA32_SYSENTER_CS:
+        v->arch.hvm_vcpu.guest_sysenter_cs = msr_content;
+        break;
+    case MSR_IA32_SYSENTER_ESP:
+        v->arch.hvm_vcpu.guest_sysenter_esp = msr_content;
+        break;
+    case MSR_IA32_SYSENTER_EIP:
+        v->arch.hvm_vcpu.guest_sysenter_eip = msr_content;
+        break;
+
     case MSR_IA32_DEBUGCTLMSR:
         vmcb->debugctlmsr = msr_content;
         if ( !msr_content || !cpu_has_svm_lbrv )
@@ -1165,6 +1186,65 @@ static void svm_vmexit_do_rdtsc(struct c
     hvm_rdtsc_intercept(regs);
 }
 
+static void svm_dump_regs(const char *from, struct cpu_user_regs *regs)
+{
+    printk("Dumping guest's current registers at %s...\n", from);
+    printk("Size of regs = 0x%lx, address = %p\n",
+           sizeof(struct cpu_user_regs), regs);
+
+    printk("r15 = 0x%016"PRIx64", r14 = 0x%016"PRIx64"\n",
+           regs->r15, regs->r14);
+    printk("r13 = 0x%016"PRIx64", r12 = 0x%016"PRIx64"\n",
+           regs->r13, regs->r12);
+    printk("rbp = 0x%016"PRIx64", rbx = 0x%016"PRIx64"\n",
+           regs->rbp, regs->rbx);
+    printk("r11 = 0x%016"PRIx64", r10 = 0x%016"PRIx64"\n",
+           regs->r11, regs->r10);
+    printk("r9 = 0x%016"PRIx64", r8 = 0x%016"PRIx64"\n",
+           regs->r9, regs->r8);
+    printk("rax = 0x%016"PRIx64", rcx = 0x%016"PRIx64"\n",
+           regs->rax, regs->rcx);
+    printk("rdx = 0x%016"PRIx64", rsi = 0x%016"PRIx64"\n",
+           regs->rdx, regs->rsi);
+    printk("rdi = 0x%016"PRIx64", rsp = 0x%016"PRIx64"\n",
+           regs->rdi, regs->rsp);
+    printk("error code = 0x%08"PRIx32", entry_vector = 0x%08"PRIx32"\n",
+           regs->error_code, regs->entry_vector);
+    printk("rip = 0x%016"PRIx64", rflags = 0x%016"PRIx64"\n",
+           regs->rip, regs->rflags);
+}
+
+static void svm_vmexit_ud_intercept(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    struct hvm_emulate_ctxt ctxt;
+    int rc;
+
+    hvm_emulate_prepare(v, &ctxt, regs);
+
+    rc = hvm_emulate_one(v, &ctxt);
+
+    switch (rc) {
+    case X86EMUL_UNHANDLEABLE:
+        gdprintk(XENLOG_WARNING,
+                 "instruction emulation failed @ %04x:%lx: "
+                 "%02x %02x %02x %02x %02x %02x\n",
+                 hvmemul_get_seg_reg(x86_seg_cs, &ctxt)->sel,
+                 ctxt.insn_buf_eip,
+                 ctxt.insn_buf[0], ctxt.insn_buf[1],
+                 ctxt.insn_buf[2], ctxt.insn_buf[3],
+                 ctxt.insn_buf[4], ctxt.insn_buf[5]);
+        return;
+    case X86EMUL_EXCEPTION:
+        if ( ctxt.exn_pending )
+            hvm_inject_exception(ctxt.exn_vector, ctxt.exn_error_code, 0);
+        break;
+    default:
+        break;
+    }
+
+    hvm_emulate_writeback(v, &ctxt);
+}
+
 static void wbinvd_ipi(void *info)
 {
     wbinvd();
@@ -1224,6 +1304,7 @@ asmlinkage void svm_vmexit_handler(struc
     if ( unlikely(exit_reason == VMEXIT_INVALID) )
     {
         svm_dump_vmcb(__func__, vmcb);
+        svm_dump_regs(__func__, regs);
         goto exit_and_crash;
     }
 
@@ -1300,6 +1381,10 @@ asmlinkage void svm_vmexit_handler(struc
         break;
     }
 
+    case VMEXIT_EXCEPTION_UD:
+        svm_vmexit_ud_intercept(v, regs);
+        break;
+
     /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
     case VMEXIT_EXCEPTION_MC:
         HVMTRACE_0D(MCE);
diff -r 5a60eb7fad79 xen/arch/x86/hvm/svm/vmcb.c
--- a/xen/arch/x86/hvm/svm/vmcb.c Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/arch/x86/hvm/svm/vmcb.c Mon Apr 06 11:01:42 2009 +0200
@@ -150,9 +150,6 @@ static int construct_vmcb(struct vcpu *v
     svm_disable_intercept_for_msr(v, MSR_LSTAR);
     svm_disable_intercept_for_msr(v, MSR_STAR);
     svm_disable_intercept_for_msr(v, MSR_SYSCALL_MASK);
-    svm_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS);
-    svm_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP);
-    svm_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP);
 
     vmcb->msrpm_base_pa = (u64)virt_to_maddr(arch_svm->msrpm);
     vmcb->iopm_base_pa = (u64)virt_to_maddr(hvm_io_bitmap);
@@ -222,7 +219,9 @@ static int construct_vmcb(struct vcpu *v
 
     paging_update_paging_modes(v);
 
-    vmcb->exception_intercepts = HVM_TRAP_MASK | (1U << TRAP_no_device);
+    vmcb->exception_intercepts = HVM_TRAP_MASK
+        | (1U << TRAP_no_device)
+        | (1U << TRAP_invalid_op);
 
     if ( paging_mode_hap(v->domain) )
     {
diff -r 5a60eb7fad79 xen/arch/x86/hvm/vmx/realmode.c
--- a/xen/arch/x86/hvm/vmx/realmode.c Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/arch/x86/hvm/vmx/realmode.c Mon Apr 06 11:01:42 2009 +0200
@@ -108,7 +108,7 @@ static void realmode_emulate_one(struct
 
     perfc_incr(realmode_emulations);
 
-    rc = hvm_emulate_one(hvmemul_ctxt);
+    rc = hvm_emulate_one(curr, hvmemul_ctxt);
 
     if ( rc == X86EMUL_UNHANDLEABLE )
     {
@@ -179,7 +179,7 @@ void vmx_realmode(struct cpu_user_regs *
     if ( intr_info & INTR_INFO_VALID_MASK )
        __vmwrite(VM_ENTRY_INTR_INFO, 0);
 
-    hvm_emulate_prepare(&hvmemul_ctxt, regs);
+    hvm_emulate_prepare(curr, &hvmemul_ctxt, regs);
 
     if ( curr->arch.hvm_vcpu.io_state == HVMIO_completed )
         realmode_emulate_one(&hvmemul_ctxt);
@@ -243,7 +243,7 @@ void vmx_realmode(struct cpu_user_regs *
             (1ul << x86_seg_fs) | (1ul << x86_seg_gs);
     }
 
-    hvm_emulate_writeback(&hvmemul_ctxt);
+    hvm_emulate_writeback(curr, &hvmemul_ctxt);
 
     /* Re-instate VM_ENTRY_INTR_INFO if we did not discharge it. */
     if ( intr_info & INTR_INFO_VALID_MASK )
diff -r 5a60eb7fad79 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Mon Apr 06 11:01:42 2009 +0200
@@ -668,7 +668,8 @@ static int construct_vmcs(struct vcpu *v
     __vmwrite(EXCEPTION_BITMAP,
               HVM_TRAP_MASK
               | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault))
-              | (1U << TRAP_no_device));
+              | (1U << TRAP_no_device)
+              | (1U << TRAP_invalid_op));
 
     v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
     hvm_update_guest_cr(v, 0);
diff -r 5a60eb7fad79 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/arch/x86/hvm/vmx/vmx.c Mon Apr 06 11:01:42 2009 +0200
@@ -37,6 +37,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -2229,6 +2230,37 @@ asmlinkage void vmx_enter_realmode(struc
     regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
 }
 
+static void vmx_vmexit_ud_intercept(struct vcpu *v, struct cpu_user_regs *regs)
+{
+    struct hvm_emulate_ctxt ctxt;
+    int rc;
+
+    hvm_emulate_prepare(v, &ctxt, regs);
+
+    rc = hvm_emulate_one(v, &ctxt);
+
+    switch (rc) {
+    case X86EMUL_UNHANDLEABLE:
+        gdprintk(XENLOG_WARNING,
+                 "instruction emulation failed @ %04x:%lx: "
+                 "%02x %02x %02x %02x %02x %02x\n",
+                 hvmemul_get_seg_reg(x86_seg_cs, &ctxt)->sel,
+                 ctxt.insn_buf_eip,
+                 ctxt.insn_buf[0], ctxt.insn_buf[1],
+                 ctxt.insn_buf[2], ctxt.insn_buf[3],
+                 ctxt.insn_buf[4], ctxt.insn_buf[5]);
+        return;
+    case X86EMUL_EXCEPTION:
+        if ( ctxt.exn_pending )
+            hvm_inject_exception(ctxt.exn_vector, ctxt.exn_error_code, 0);
+        break;
+    default:
+        break;
+    }
+
+    hvm_emulate_writeback(v, &ctxt);
+}
+
 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
     unsigned int exit_reason, idtv_info;
@@ -2410,6 +2442,9 @@ asmlinkage void vmx_vmexit_handler(struc
             HVMTRACE_0D(MCE);
             do_machine_check(regs);
             break;
+        case TRAP_invalid_op:
+            vmx_vmexit_ud_intercept(v, regs);
+            break;
         default:
             goto exit_and_crash;
         }
diff -r 5a60eb7fad79 xen/arch/x86/x86_emulate.c
--- a/xen/arch/x86/x86_emulate.c Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/arch/x86/x86_emulate.c Mon Apr 06 11:01:42 2009 +0200
@@ -10,6 +10,7 @@
  */
 
 #include
+#include
 
 /* Avoid namespace pollution. */
 #undef cmpxchg
diff -r 5a60eb7fad79 xen/arch/x86/x86_emulate/x86_emulate.c
--- a/xen/arch/x86/x86_emulate/x86_emulate.c Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c Mon Apr 06 11:01:42 2009 +0200
@@ -172,7 +172,7 @@ static uint8_t opcode_table[256] = {
 
 static uint8_t twobyte_table[256] = {
     /* 0x00 - 0x07 */
-    0, ImplicitOps|ModRM, 0, 0, 0, 0, ImplicitOps, 0,
+    0, ImplicitOps|ModRM, 0, 0, 0, ImplicitOps, ImplicitOps, ImplicitOps,
     /* 0x08 - 0x0F */
     ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps|ModRM, 0, 0,
     /* 0x10 - 0x17 */
@@ -186,7 +186,8 @@ static uint8_t twobyte_table[256] = {
     /* 0x28 - 0x2F */
     0, 0, 0, 0, 0, 0, 0, 0,
     /* 0x30 - 0x37 */
-    ImplicitOps, ImplicitOps, ImplicitOps, 0, 0, 0, 0, 0,
+    ImplicitOps, ImplicitOps, ImplicitOps, 0,
+    ImplicitOps, ImplicitOps, 0, 0,
     /* 0x38 - 0x3F */
     0, 0, 0, 0, 0, 0, 0, 0,
     /* 0x40 - 0x47 */
@@ -942,6 +943,20 @@ in_protmode(
 }
 
 static int
+in_longmode(
+    struct x86_emulate_ctxt *ctxt,
+    struct x86_emulate_ops *ops)
+{
+    uint64_t efer;
+
+    if (ops->read_msr == NULL)
+        return -1;
+
+    ops->read_msr(MSR_EFER, &efer, ctxt);
+    return (efer & EFER_LMA) ? 1 : 0;
+}
+
+static int
 realmode_load_seg(
     enum x86_segment seg,
     uint16_t sel,
@@ -3529,6 +3544,136 @@ x86_emulate(
         break;
     }
 
+    case 0x05: /* syscall */ {
+        uint64_t msr_content;
+        struct segment_register cs, ss;
+        int rc;
+
+        memset(&cs, 0, sizeof(struct segment_register));
+        memset(&ss, 0, sizeof(struct segment_register));
+
+        //printk("%s: syscall emulate\n", __func__);
+        fail_if(ops->read_msr == NULL);
+        fail_if(ops->read_segment == NULL);
+        fail_if(ops->write_segment == NULL);
+
+        /* inject #UD if
+         * 1. we are in real mode
+         * 2. protected mode is not enabled
+         * 3. LOCK prefix is used
+         */
+        generate_exception_if(in_realmode(ctxt, ops), EXC_UD, 0);
+        generate_exception_if(!in_protmode(ctxt, ops), EXC_UD, 0);
+        generate_exception_if(lock_prefix, EXC_UD, 0);
+
+        /* inject #UD if syscall/sysret are disabled */
+        rc = ops->read_msr(MSR_EFER, &msr_content, ctxt);
+        fail_if(rc != 0);
+        generate_exception_if((msr_content & EFER_SCE) == 0, EXC_UD, 0);
+
+        rc = ops->read_msr(MSR_STAR, &msr_content, ctxt);
+        fail_if(rc != 0);
+
+        msr_content >>= 32;
+        cs.sel = (uint16_t)(msr_content & 0xfffc);
+        ss.sel = (uint16_t)(msr_content + 8);
+
+        /* Set all values for 32bit legacy/compat mode,
+         * re-adjust values for 64bit later. This avoids
+         * code duplication.
+         */
+
+        cs.attr.fields.l = 0;
+
+        cs.base = 0; /* flat segment */
+        ss.base = 0;
+
+        /* Intel cares about granularity (g bit),
+         * so we don't set the effective limit.
+         */
+        cs.attr.fields.g = 1; /* 4kb granularity */
+        ss.attr.fields.g = 1;
+        cs.limit = 0xfffff; /* 4GB limit */
+        ss.limit = 0xfffff;
+
+        cs.attr.fields.dpl = 0;
+        ss.attr.fields.dpl = 0;
+        /* No need to set cpl explicitely here. write_segment()
+         * does this below based on the ss.dpl value.
+         */
+
+        /* AMD: Read, Execute (0x0a)
+         * Intel: Read, Execute, Accessed (0x0b)
+         */
+        cs.attr.fields.type = 0x0b;
+
+        /* AMD: Read/Write, Expand-Up (0x02)
+         * Intel: Read/Write, Expand-Up, Accessed (0x03)
+         */
+        ss.attr.fields.type = 0x03;
+
+        cs.attr.fields.p = 1; /* present */
+        ss.attr.fields.p = 1;
+
+        cs.attr.fields.s = 1;
+        ss.attr.fields.s = 1;
+
+        cs.attr.fields.db = 1; /* 32bit segment */
+        ss.attr.fields.db = 1;
+
+        rc = in_longmode(ctxt, ops);
+        if (rc > 0) {
+
+            cs.attr.fields.db = 0;
+            cs.attr.fields.l = 1; /* long mode */
+            ss.attr.fields.db = 0;
+            ss.attr.fields.l = 1; /* long mode */
+
+            _regs.rcx = _regs.rip;
+            _regs.r11 = _regs.eflags & ~EFLG_RF;
+
+            if (mode_64bit()) {
+                /* Intel cares about granularity (g bit),
+                 * so we don't set the effective limit.
+                 */
+                cs.attr.fields.g = 1;
+                cs.limit = 0xffffffff;
+                ss.attr.fields.g = 1;
+                ss.limit = 0xffffffff;
+
+                rc = ops->read_msr(MSR_LSTAR, &msr_content, ctxt);
+                fail_if(rc != 0);
+            } else {
+                /* compat mode */
+                rc = ops->read_msr(MSR_CSTAR, &msr_content, ctxt);
+                fail_if(rc != 0);
+            }
+
+            _regs.rip = msr_content;
+
+            rc = ops->read_msr(MSR_SYSCALL_MASK, &msr_content, ctxt);
+            fail_if(rc != 0);
+            _regs.eflags &= ~(msr_content | EFLG_RF);
+        } else {
+            fail_if(rc < 0);
+            /* legacy mode */
+
+            rc = ops->read_msr(MSR_STAR, &msr_content, ctxt);
+            fail_if(rc != 0);
+
+            _regs.rcx = _regs.rip;
+            _regs.eip = (uint32_t)msr_content;
+
+            _regs.eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+        }
+
+        rc = ops->write_segment(x86_seg_cs, &cs, ctxt);
+        fail_if(rc != 0);
+        rc = ops->write_segment(x86_seg_ss, &ss, ctxt);
+        fail_if(rc != 0);
+        break;
+    }
+
     case 0x06: /* clts */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
         fail_if((ops->read_cr == NULL) || (ops->write_cr == NULL));
@@ -3537,6 +3682,12 @@ x86_emulate(
             goto done;
         break;
 
+    case 0x07: /* sysret */ {
+        printk("%s: sysret emulate\n", __func__);
+        fail_if(1);
+        break;
+    }
+
     case 0x08: /* invd */
     case 0x09: /* wbinvd */
         generate_exception_if(!mode_ring0(), EXC_GP, 0);
@@ -3630,6 +3781,228 @@ x86_emulate(
         dst.type = OP_NONE;
         break;
 
+    case 0x34: /* sysenter */ {
+        uint64_t msr_content;
+        struct segment_register cs, ss;
+        int rc;
+
+        memset(&cs, 0, sizeof(struct segment_register));
+        memset(&ss, 0, sizeof(struct segment_register));
+
+        fail_if(ops->read_msr == NULL);
+        fail_if(ops->read_segment == NULL);
+        fail_if(ops->write_segment == NULL);
+
+        /* inject #GP if
+         * 1. we are in real mode
+         * 2. protected mode is not enabled
+         */
+        generate_exception_if(mode_ring0(), EXC_GP, 0);
+        generate_exception_if(in_realmode(ctxt, ops), EXC_GP, 0);
+        generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0);
+
+        /* inject #UD if
+         * LOCK prefix is used
+         */
+        generate_exception_if(lock_prefix, EXC_UD, 0);
+
+        rc = ops->read_msr(MSR_IA32_SYSENTER_CS, &msr_content, ctxt);
+        fail_if(rc != 0);
+
+        if (mode_64bit()) {
+            generate_exception_if(msr_content == 0x0, EXC_GP, 0);
+        } else {
+            /* Assume to be in compat or 32bit protected mode here */
+            generate_exception_if((msr_content & 0xfffc) == 0x0, EXC_GP, 0);
+        }
+
+        _regs.eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
+
+        ops->read_segment(x86_seg_cs, &cs, ctxt);
+        cs.sel = (uint16_t)msr_content;
+        cs.base = 0; /* flat segment */
+        cs.limit = 0xfffff; /* 4GB limit */
+        cs.attr.fields.g = 1; /* 4kb granularity */
+        cs.attr.fields.s = 1;
+        cs.attr.fields.type = 0x0b; /* Execute + Read, Accessed */
+        cs.attr.fields.db = 1; /* 32bit code segment */
+        cs.attr.fields.dpl = 0;
+        cs.attr.fields.p = 1; /* present */
+        cs.sel &= ~3; /* SELECTOR_RPL_MASK */
+
+        /* No need to set cpl explicitely here. write_segment()
+         * does this below based on the ss.dpl value.
+         */
+
+        ss.sel = cs.sel + 8;
+        ss.base = 0; /* flat segment */
+        ss.limit = 0xfffff; /* 4GB limit */
+        ss.attr.fields.g = 1; /* 4kb granularity */
+        ss.attr.fields.s = 1;
+        ss.attr.fields.type = 0x03; /* Read/Write, Accessed */
+        ss.attr.fields.db = 1; /* 32bit stack segment */
+        ss.attr.fields.dpl = 0;
+        ss.attr.fields.p = 1; /* present */
+        ss.sel &= ~3; /* SELECTOR_RPL_MASK */
+
+        rc = in_longmode(ctxt, ops);
+        if (rc > 0) {
+            cs.attr.fields.db = 0;
+            cs.attr.fields.l = 1;
+            cs.base = 0;
+            cs.limit = 0xffffffff;
+            ss.base = 0;
+            ss.limit = 0xffffffff;
+        }
+        fail_if(rc < 0);
+
+        rc = ops->write_segment(x86_seg_cs, &cs, ctxt);
+        fail_if(rc != 0);
+        rc = ops->write_segment(x86_seg_ss, &ss, ctxt);
+        fail_if(rc != 0);
+
+        rc = ops->read_msr(MSR_IA32_SYSENTER_EIP, &msr_content, ctxt);
+        fail_if(rc != 0);
+        _regs.rip = msr_content;
+
+        rc = ops->read_msr(MSR_IA32_SYSENTER_ESP, &msr_content, ctxt);
+        fail_if(rc != 0);
+        _regs.rsp = msr_content;
+
+        break;
+    }
+
+    case 0x35: /* sysexit */ {
+        uint64_t msr_content;
+        struct segment_register cs, ss;
+        unsigned int usermode;
+        int rc;
+
+#define X86EMUL_MODE_PROT32 0
+#define X86EMUL_MODE_PROT64 1
+
+        memset(&cs, 0, sizeof(struct segment_register));
+        memset(&ss, 0, sizeof(struct segment_register));
+
+        //printk("%s: sysexit emulate\n", __func__);
+        fail_if(ops->read_msr == NULL);
+        fail_if(ops->read_segment == NULL);
+        fail_if(ops->write_segment == NULL);
+
+        /* inject #GP if
+         * 1. we are not in cpl 0
+         * 2. we are in real mode
+         * 3. protected mode is not enabled
+         */
+        generate_exception_if(!mode_ring0(), EXC_GP, 0);
+        generate_exception_if(in_realmode(ctxt, ops), EXC_GP, 0);
+        generate_exception_if(!in_protmode(ctxt, ops), EXC_GP, 0);
+
+        /* inject #UD if
+         * LOCK prefix is used
+         */
+        generate_exception_if(lock_prefix, EXC_UD, 0);
+
+        /* TODO check that rip and rsp are canonical. inject #GP if not */
+
+        /* if REX.W bit is set ... */
+        if ((rex_prefix & 0x8) != 0x0) {
+            /* Application is in 64bit mode */
+            usermode = X86EMUL_MODE_PROT64;
+        } else {
+            /* Application is in 32bit legacy/compat mode */
+            usermode = X86EMUL_MODE_PROT32;
+        }
+
+        rc = ops->read_msr(MSR_IA32_SYSENTER_CS, &msr_content, ctxt);
+        fail_if(rc != 0);
+        rc = ops->read_segment(x86_seg_cs, &cs, ctxt);
+        fail_if(rc != 0);
+
+        switch (usermode) {
+        case X86EMUL_MODE_PROT32:
+            cs.sel = (uint16_t)(msr_content + 16);
+            generate_exception_if((msr_content & 0xfffc) == 0x0, EXC_GP, 0);
+            break;
+        case X86EMUL_MODE_PROT64:
+            cs.sel = (uint16_t)(msr_content + 32);
+            generate_exception_if(msr_content == 0x0, EXC_GP, 0);
+            break;
+        }
+
+        cs.base = 0; /* flat segment */
+        cs.limit = 0xfffff; /* 4GB limit */
+        cs.attr.fields.g = 1; /* 4kb granularity */
+        cs.attr.fields.s = 1;
+        cs.attr.fields.type = 0x0b; /* Execute, Read, Non-conforming code */
+        cs.attr.fields.db = 1; /* 32bit code segment */
+        cs.attr.fields.dpl = 3;
+        cs.attr.fields.p = 1; /* present */
+        cs.attr.fields.l = 0; /* For return to compatibility mode */
+        cs.sel |= 0x3; /* SELECTOR_RPL_MASK */
+
+        /* No need to set cpl explicitely here. write_segment()
+         * does this below based on the ss.dpl value.
+         */
+
+        switch (usermode) {
+        case X86EMUL_MODE_PROT32:
+            ss.sel = (uint16_t)(msr_content + 24);
+            break;
+        case X86EMUL_MODE_PROT64:
+            ss.sel = (cs.sel + 8);
+            break;
+        }
+
+        ss.base = 0; /* flat segment */
+        ss.limit = 0xfffff; /* 4GB limit */
+        ss.attr.fields.g = 1; /* 4kb granularity */
+        ss.attr.fields.s = 1;
+        ss.attr.fields.type = 0x03; /* Expand Up, Read/Write, Data */
+        ss.attr.fields.db = 1; /* 32bit stack segment */
+        ss.attr.fields.dpl = 3;
+        ss.attr.fields.p = 1; /* present */
+        ss.sel |= 0x3; /* SELECTOR_RPL_MASK */
+
+        switch (usermode) {
+        case X86EMUL_MODE_PROT32:
+            /* AMD: We don't care about cs.g/ss.g bits (= 4kb granularity)
+             * so we have to set the effective limit here or we get a #GP
+             * in the guest, otherwise.
+             */
+            cs.limit = 0xffffffff;
+            ss.limit = 0xffffffff;
+            break;
+
+        case X86EMUL_MODE_PROT64:
+            /* AMD: We don't care about cs.g/ss.g bits (= 4kb granularity)
+             * so we have to set the effective limit here or we get a #GP
+             * in the guest, otherwise.
+             */
+            cs.attr.fields.db = 0;
+            cs.attr.fields.l = 1;
+            cs.base = 0;
+            cs.limit = 0xffffffff;
+            ss.base = 0;
+            ss.limit = 0xffffffff;
+            break;
+        default:
+            break;
+        }
+
+        rc = ops->write_segment(x86_seg_cs, &cs, ctxt);
+        fail_if(rc != 0);
+        rc = ops->write_segment(x86_seg_ss, &ss, ctxt);
+        fail_if(rc != 0);
+
+        _regs.rip = _regs.rdx;
+        _regs.rsp = _regs.rcx;
+
+#undef X86EMUL_MODE_PROT32
+#undef X86EMUL_MODE_PROT64
+        break;
+    }
+
     case 0x6f: /* movq mm/m64,mm */ {
         uint8_t stub[] = { 0x0f, 0x6f, modrm, 0xc3 };
         struct fpu_insn_ctxt fic = { .insn_bytes = sizeof(stub)-1 };
diff -r 5a60eb7fad79 xen/include/asm-x86/hvm/emulate.h
--- a/xen/include/asm-x86/hvm/emulate.h Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/include/asm-x86/hvm/emulate.h Mon Apr 06 11:01:42 2009 +0200
@@ -36,11 +36,13 @@ struct hvm_emulate_ctxt {
 };
 
 int hvm_emulate_one(
+    struct vcpu *curr,
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 void hvm_emulate_prepare(
+    struct vcpu *v,
     struct hvm_emulate_ctxt *hvmemul_ctxt,
     struct cpu_user_regs *regs);
-void hvm_emulate_writeback(
+void hvm_emulate_writeback(struct vcpu *v,
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 
 struct segment_register *hvmemul_get_seg_reg(
     enum x86_segment seg,
diff -r 5a60eb7fad79 xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/include/asm-x86/hvm/vcpu.h Mon Apr 06 11:01:42 2009 +0200
@@ -39,6 +39,15 @@ struct hvm_vcpu {
     unsigned long       guest_cr[5];
     unsigned long       guest_efer;
 
+    /* On AMD: Upper four bytes are undefined in the VMCB, therefore we can't
+     * use the fields in the VMCB. Write a 64bit value and then read a 64bit
+     * value is fine unless there's a VMRUN/VMEXIT in between which clears
+     * the upper four bytes.
+     */
+    uint64_t guest_sysenter_cs;
+    uint64_t guest_sysenter_esp;
+    uint64_t guest_sysenter_eip;
+
     /*
      * Processor-visible control-register values, while guest executes.
      *  CR0, CR4: Used as a cache of VMCS contents by VMX only.
diff -r 5a60eb7fad79 xen/include/public/arch-x86/hvm/save.h
--- a/xen/include/public/arch-x86/hvm/save.h Thu Apr 02 14:17:19 2009 +0100
+++ b/xen/include/public/arch-x86/hvm/save.h Mon Apr 06 11:01:42 2009 +0200
@@ -123,9 +123,7 @@ struct hvm_hw_cpu {
     uint32_t tr_arbytes;
     uint32_t ldtr_arbytes;
 
-    uint32_t sysenter_cs;
-    uint32_t padding0;
-
+    uint64_t sysenter_cs;
     uint64_t sysenter_esp;
     uint64_t sysenter_eip;