To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] Use virtual 8086 mode for VMX guests with CR0.PE == 0
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Tue, 09 Dec 2008 08:30:20 -0800
Delivery-date: Tue, 09 Dec 2008 08:30:49 -0800
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1228840082 0
# Node ID 6595393a3d28a7bf95f02b198f52d754bcfa7a80
# Parent  5535efd8e01141f840f9a8cbc31a9b3a4c9d49e9
Use virtual 8086 mode for VMX guests with CR0.PE == 0

When a VMX guest tries to enter real mode, put it in virtual 8086 mode
instead, if that's possible.  Handle all errors and corner cases by
falling back to the real-mode emulator.

This is similar to the old VMXASSIST system except it uses Xen's
x86_emulate emulator instead of having a partial emulator in the guest
firmware.  It more than doubles the speed of real-mode operation on
VMX.
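
In outline, the per-vmenter decision looks like the sketch below.  This
is an illustrative distillation only, not part of the patch: the helper
names emulate_with_x86_emulate() and enter_virtual_8086_mode() are
placeholders for the real paths through vmx/entry.S, realmode.c and
vmx.c in the hunks that follow.

    struct vcpu *v = current;

    if ( v->arch.hvm_vmx.vmx_emulate )
        emulate_with_x86_emulate();       /* error/corner case: emulate */
    else if ( v->arch.hvm_vmx.vmx_realmode )
    {
        if ( v->arch.hvm_vmx.vm86_segment_mask != 0 )
            emulate_with_x86_emulate();   /* a segment is unsafe for vm86 */
        else
            enter_virtual_8086_mode();    /* fast path: set EFLAGS.VM/IOPL */
    }
    /* else: ordinary vmenter with CR0.PE == 1 */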

Signed-off-by: Tim Deegan <Tim.Deegan@xxxxxxxxxx>
---
 tools/firmware/hvmloader/hvmloader.c   |   19 ++
 tools/libxc/xc_domain_restore.c        |   16 ++
 tools/libxc/xc_domain_save.c           |   26 ++-
 xen/arch/x86/hvm/vmx/entry.S           |   14 +
 xen/arch/x86/hvm/vmx/realmode.c        |   45 ++---
 xen/arch/x86/hvm/vmx/vmcs.c            |   51 ++++--
 xen/arch/x86/hvm/vmx/vmx.c             |  250 ++++++++++++++++++++++++++++-----
 xen/arch/x86/x86_32/asm-offsets.c      |    4 
 xen/arch/x86/x86_64/asm-offsets.c      |    4 
 xen/arch/x86/x86_emulate/x86_emulate.h |    1 
 xen/include/asm-x86/hvm/vmx/vmcs.h     |   13 +
 xen/include/asm-x86/perfc_defn.h       |    3 
 xen/include/public/hvm/params.h        |    5 
 13 files changed, 356 insertions(+), 95 deletions(-)

diff -r 5535efd8e011 -r 6595393a3d28 tools/firmware/hvmloader/hvmloader.c
--- a/tools/firmware/hvmloader/hvmloader.c      Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/firmware/hvmloader/hvmloader.c      Tue Dec 09 16:28:02 2008 +0000
@@ -536,6 +536,23 @@ static uint16_t init_xen_platform_io_bas
     return bios_info->xen_pfiob;
 }
 
+/* Set up an empty TSS area for virtual 8086 mode to use.
+ * The only important thing is that it mustn't have any bits set
+ * in the interrupt redirection bitmap, so all zeros will do.  */
+static void init_vm86_tss(void)
+{
+    uint32_t tss;
+    struct xen_hvm_param p;
+
+    tss = e820_malloc(128, 128);
+    memset((char *)tss, 0, 128);
+    p.domid = DOMID_SELF;
+    p.index = HVM_PARAM_VM86_TSS;
+    p.value = tss;
+    hypercall_hvm_op(HVMOP_set_param, &p);
+    printf("vm86 TSS at %08x\n", tss);
+}
+
 int main(void)
 {
     int option_rom_sz = 0, vgabios_sz = 0, etherboot_sz = 0;
@@ -605,6 +622,8 @@ int main(void)
         printf("Loading ACPI ...\n");
         acpi_build_tables();
     }
+
+    init_vm86_tss();
 
     cmos_write_memory_size();
 
diff -r 5535efd8e011 -r 6595393a3d28 tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/libxc/xc_domain_restore.c   Tue Dec 09 16:28:02 2008 +0000
@@ -490,6 +490,22 @@ int xc_domain_restore(int xc_handle, int
             continue;
         }
 
+        if ( j == -4 )
+        {
+            uint64_t vm86_tss;
+
+            /* Skip 4 bytes of padding, then read the vm86 TSS location. */
+            if ( read_exact(io_fd, &vm86_tss, sizeof(uint32_t)) ||
+                 read_exact(io_fd, &vm86_tss, sizeof(uint64_t)) )
+            {
+                ERROR("error read the address of the vm86 TSS");
+                goto out;
+            }
+
+            xc_set_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, vm86_tss);
+            continue;
+        }
+
         if ( j == 0 )
             break;  /* our work here is done */
 
diff -r 5535efd8e011 -r 6595393a3d28 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Tue Dec 09 13:23:15 2008 +0000
+++ b/tools/libxc/xc_domain_save.c      Tue Dec 09 16:28:02 2008 +0000
@@ -1388,18 +1388,30 @@ int xc_domain_save(int xc_handle, int io
     if ( hvm )
     {
         struct {
-            int minusthree;
+            int id;
             uint32_t pad;
-            uint64_t ident_pt;
-        } chunk = { -3, 0 };
-
+            uint64_t data;
+        } chunk = { 0, };
+
+        chunk.id = -3;
         xc_get_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
-                         (unsigned long *)&chunk.ident_pt);
-
-        if ( (chunk.ident_pt != 0) &&
+                         (unsigned long *)&chunk.data);
+
+        if ( (chunk.data != 0) &&
              write_exact(io_fd, &chunk, sizeof(chunk)) )
         {
             PERROR("Error when writing the ident_pt for EPT guest");
+            goto out;
+        }
+
+        chunk.id = -4;
+        xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS,
+                         (unsigned long *)&chunk.data);
+
+        if ( (chunk.data != 0) &&
+             write_exact(io_fd, &chunk, sizeof(chunk)) )
+        {
+            PERROR("Error when writing the vm86 TSS for guest");
             goto out;
         }
     }
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/entry.S
--- a/xen/arch/x86/hvm/vmx/entry.S      Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/entry.S      Tue Dec 09 16:28:02 2008 +0000
@@ -133,9 +133,15 @@ vmx_asm_do_vmentry:
         cmpl $0,(r(dx),r(ax),1)
         jnz  .Lvmx_process_softirqs
 
-        testb $0xff,VCPU_vmx_emul(r(bx))
-        jnz  .Lvmx_goto_realmode
-
+        testb $0xff,VCPU_vmx_emulate(r(bx))
+        jnz .Lvmx_goto_emulator
+        testb $0xff,VCPU_vmx_realmode(r(bx))
+        jz .Lvmx_not_realmode
+        cmpw $0,VCPU_vm86_seg_mask(r(bx))
+        jnz .Lvmx_goto_emulator
+        call_with_regs(vmx_enter_realmode) 
+
+.Lvmx_not_realmode:
         mov  VCPU_hvm_guest_cr2(r(bx)),r(ax)
         mov  r(ax),%cr2
         call vmx_trace_vmentry
@@ -189,7 +195,7 @@ vmx_asm_do_vmentry:
         call vm_launch_fail
         ud2
 
-.Lvmx_goto_realmode:
+.Lvmx_goto_emulator:
         sti
         call_with_regs(vmx_realmode)
         jmp  vmx_asm_do_vmentry
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/realmode.c
--- a/xen/arch/x86/hvm/vmx/realmode.c   Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/realmode.c   Tue Dec 09 16:28:02 2008 +0000
@@ -103,30 +103,12 @@ static void realmode_emulate_one(struct 
 static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
 {
     struct vcpu *curr = current;
-    unsigned long seg_reg_dirty;
     uint32_t intr_info;
     int rc;
 
-    seg_reg_dirty = hvmemul_ctxt->seg_reg_dirty;
-    hvmemul_ctxt->seg_reg_dirty = 0;
+    perfc_incr(realmode_emulations);
 
     rc = hvm_emulate_one(hvmemul_ctxt);
-
-    if ( test_bit(x86_seg_cs, &hvmemul_ctxt->seg_reg_dirty) )
-    {
-        curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_CS;
-        if ( hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt)->sel & 3 )
-            curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_CS;
-    }
-
-    if ( test_bit(x86_seg_ss, &hvmemul_ctxt->seg_reg_dirty) )
-    {
-        curr->arch.hvm_vmx.vmxemul &= ~VMXEMUL_BAD_SS;
-        if ( hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt)->sel & 3 )
-            curr->arch.hvm_vmx.vmxemul |= VMXEMUL_BAD_SS;
-    }
-
-    hvmemul_ctxt->seg_reg_dirty |= seg_reg_dirty;
 
     if ( rc == X86EMUL_UNHANDLEABLE )
     {
@@ -210,7 +192,8 @@ void vmx_realmode(struct cpu_user_regs *
         intr_info = 0;
     }
 
-    while ( curr->arch.hvm_vmx.vmxemul &&
+    curr->arch.hvm_vmx.vmx_emulate = 1;
+    while ( curr->arch.hvm_vmx.vmx_emulate &&
             !softirq_pending(smp_processor_id()) &&
             (curr->arch.hvm_vcpu.io_state == HVMIO_none) )
     {
@@ -220,13 +203,27 @@ void vmx_realmode(struct cpu_user_regs *
          * in real mode, because we don't emulate protected-mode IDT vectoring.
          */
         if ( unlikely(!(++emulations & 15)) &&
-             !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) &&
+             curr->arch.hvm_vmx.vmx_realmode && 
              hvm_local_events_need_delivery(curr) )
             break;
+
         realmode_emulate_one(&hvmemul_ctxt);
-    }
-
-    if ( !curr->arch.hvm_vmx.vmxemul )
+
+        /* Stop emulating as soon as our segment state is safe for vm86. */
+        if ( curr->arch.hvm_vmx.vmx_realmode )
+            curr->arch.hvm_vmx.vmx_emulate = 
+                (curr->arch.hvm_vmx.vm86_segment_mask != 0);
+        else
+            curr->arch.hvm_vmx.vmx_emulate = 
+                 ((hvmemul_ctxt.seg_reg[x86_seg_cs].sel & 3)
+                  || (hvmemul_ctxt.seg_reg[x86_seg_ss].sel & 3));
+    }
+
+    /* Need to emulate next time if we've started an IO operation */
+    if ( curr->arch.hvm_vcpu.io_state != HVMIO_none )
+        curr->arch.hvm_vmx.vmx_emulate = 1;
+
+    if ( !curr->arch.hvm_vmx.vmx_emulate && !curr->arch.hvm_vmx.vmx_realmode )
     {
         /*
          * Cannot enter protected mode with bogus selector RPLs and DPLs.
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c       Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/vmcs.c       Tue Dec 09 16:28:02 2008 +0000
@@ -880,21 +880,34 @@ void vmx_do_resume(struct vcpu *v)
     reset_stack_and_jump(vmx_asm_do_vmentry);
 }
 
-static void vmx_dump_sel(char *name, enum x86_segment seg)
-{
-    struct segment_register sreg;
-    hvm_get_segment_register(current, seg, &sreg);
-    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016llx\n", 
-           name, sreg.sel, sreg.attr.bytes, sreg.limit,
-           (unsigned long long)sreg.base);
-}
-
 static unsigned long vmr(unsigned long field)
 {
     int rc;
     unsigned long val;
     val = __vmread_safe(field, &rc);
     return rc ? 0 : val;
+}
+
+static void vmx_dump_sel(char *name, uint32_t selector)
+{
+    uint32_t sel, attr, limit;
+    uint64_t base;
+    sel = vmr(selector);
+    attr = vmr(selector + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR));
+    limit = vmr(selector + (GUEST_ES_LIMIT - GUEST_ES_SELECTOR));
+    base = vmr(selector + (GUEST_ES_BASE - GUEST_ES_SELECTOR));
+    printk("%s: sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016"PRIx64"\n",
+           name, sel, attr, limit, base);
+}
+
+static void vmx_dump_sel2(char *name, uint32_t lim)
+{
+    uint32_t limit;
+    uint64_t base;
+    limit = vmr(lim);
+    base = vmr(lim + (GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
+    printk("%s:                           limit=0x%08x, base=0x%016"PRIx64"\n",
+           name, limit, base);
 }
 
 void vmcs_dump_vcpu(struct vcpu *v)
@@ -938,16 +951,16 @@ void vmcs_dump_vcpu(struct vcpu *v)
            (unsigned long long)vmr(GUEST_SYSENTER_ESP),
            (int)vmr(GUEST_SYSENTER_CS),
            (unsigned long long)vmr(GUEST_SYSENTER_EIP));
-    vmx_dump_sel("CS", x86_seg_cs);
-    vmx_dump_sel("DS", x86_seg_ds);
-    vmx_dump_sel("SS", x86_seg_ss);
-    vmx_dump_sel("ES", x86_seg_es);
-    vmx_dump_sel("FS", x86_seg_fs);
-    vmx_dump_sel("GS", x86_seg_gs);
-    vmx_dump_sel("GDTR", x86_seg_gdtr);
-    vmx_dump_sel("LDTR", x86_seg_ldtr);
-    vmx_dump_sel("IDTR", x86_seg_idtr);
-    vmx_dump_sel("TR", x86_seg_tr);
+    vmx_dump_sel("CS", GUEST_CS_SELECTOR);
+    vmx_dump_sel("DS", GUEST_DS_SELECTOR);
+    vmx_dump_sel("SS", GUEST_SS_SELECTOR);
+    vmx_dump_sel("ES", GUEST_ES_SELECTOR);
+    vmx_dump_sel("FS", GUEST_FS_SELECTOR);
+    vmx_dump_sel("GS", GUEST_GS_SELECTOR);
+    vmx_dump_sel2("GDTR", GUEST_GDTR_LIMIT);
+    vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR);
+    vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT);
+    vmx_dump_sel("TR", GUEST_TR_SELECTOR);
     x  = (unsigned long long)vmr(TSC_OFFSET_HIGH) << 32;
     x |= (uint32_t)vmr(TSC_OFFSET);
     printk("TSC Offset = %016llx\n", x);
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c        Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/hvm/vmx/vmx.c        Tue Dec 09 16:28:02 2008 +0000
@@ -704,6 +704,26 @@ static void vmx_ctxt_switch_to(struct vc
     vpmu_load(v);
 }
 
+
+/* SDM volume 3b section 22.3.1.2: we can only enter virtual 8086 mode
+ * if all of CS, SS, DS, ES, FS and GS are 16bit ring-3 data segments.
+ * The guest thinks it's got ring-0 segments, so we need to fudge
+ * things.  We store the ring-3 version in the VMCS to avoid lots of
+ * shuffling on vmenter and vmexit, and translate in these accessors. */
+
+#define rm_cs_attr (((union segment_attributes) {                       \
+        .fields = { .type = 0xb, .s = 1, .dpl = 0, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define rm_ds_attr (((union segment_attributes) {                       \
+        .fields = { .type = 0x3, .s = 1, .dpl = 0, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define vm86_ds_attr (((union segment_attributes) {                     \
+        .fields = { .type = 0x3, .s = 1, .dpl = 3, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+#define vm86_tr_attr (((union segment_attributes) {                     \
+        .fields = { .type = 0xb, .s = 0, .dpl = 0, .p = 1, .avl = 0,    \
+                    .l = 0, .db = 0, .g = 0, .pad = 0 } }).bytes)
+
 static void vmx_get_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
@@ -779,14 +799,85 @@ static void vmx_get_segment_register(str
     /* Unusable flag is folded into Present flag. */
     if ( attr & (1u<<16) )
         reg->attr.fields.p = 0;
+
+    /* Adjust for virtual 8086 mode */
+    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr 
+         && !(v->arch.hvm_vmx.vm86_segment_mask & (1u << seg)) )
+    {
+        struct segment_register *sreg = &v->arch.hvm_vmx.vm86_saved_seg[seg];
+        if ( seg == x86_seg_tr ) 
+            *reg = *sreg;
+        else if ( reg->base != sreg->base || seg == x86_seg_ss )
+        {
+            /* If the guest has reloaded the segment, remember the new
+             * version.  We can't tell if the guest reloaded the segment with
+             * another one that has the same base.  By default we assume it
+             * hasn't, since we don't want to lose big-real-mode segment
+             * attributes, but for SS we assume it has: the Ubuntu graphical
+             * bootloader does this and gets badly confused if we leave the
+             * old SS in place. */
+            reg->attr.bytes = (seg == x86_seg_cs ? rm_cs_attr : rm_ds_attr);
+            *sreg = *reg;
+        }
+        else 
+        {
+            /* Always give realmode guests a selector that matches the base
+             * but keep the attr and limit from before */
+            *reg = *sreg;
+            reg->sel = reg->base >> 4;
+        }
+    }
 }
 
 static void vmx_set_segment_register(struct vcpu *v, enum x86_segment seg,
                                      struct segment_register *reg)
 {
-    uint32_t attr;
-
+    uint32_t attr, sel, limit;
+    uint64_t base;
+
+    sel = reg->sel;
     attr = reg->attr.bytes;
+    limit = reg->limit;
+    base = reg->base;
+
+    /* Adjust CS/SS/DS/ES/FS/GS/TR for virtual 8086 mode */
+    if ( v->arch.hvm_vmx.vmx_realmode && seg <= x86_seg_tr )
+    {
+        /* Remember the proper contents */
+        v->arch.hvm_vmx.vm86_saved_seg[seg] = *reg;
+        
+        if ( seg == x86_seg_tr ) 
+        {
+            if ( v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS] )
+            {
+                sel = 0;
+                attr = vm86_tr_attr;
+                limit = 0xff;
+                base = v->domain->arch.hvm_domain.params[HVM_PARAM_VM86_TSS];
+                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
+            }
+            else
+                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
+        }
+        else
+        {
+            /* Try to fake it out as a 16bit data segment.  This could
+             * cause confusion for the guest if it reads the selector,
+             * but otherwise we have to emulate if *any* segment hasn't
+             * been reloaded. */
+            if ( base < 0x100000 && !(base & 0xf) && limit >= 0xffff
+                 && reg->attr.fields.p )
+            {
+                sel = base >> 4;
+                attr = vm86_ds_attr;
+                limit = 0xffff;
+                v->arch.hvm_vmx.vm86_segment_mask &= ~(1u << seg);
+            }
+            else 
+                v->arch.hvm_vmx.vm86_segment_mask |= (1u << seg);
+        }
+    }
+
     attr = ((attr & 0xf00) << 4) | (attr & 0xff);
 
     /* Not-present must mean unusable. */
@@ -794,67 +885,67 @@ static void vmx_set_segment_register(str
         attr |= (1u << 16);
 
     /* VMX has strict consistency requirement for flag G. */
-    attr |= !!(reg->limit >> 20) << 15;
+    attr |= !!(limit >> 20) << 15;
 
     vmx_vmcs_enter(v);
 
     switch ( seg )
     {
     case x86_seg_cs:
-        __vmwrite(GUEST_CS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_CS_LIMIT, reg->limit);
-        __vmwrite(GUEST_CS_BASE, reg->base);
+        __vmwrite(GUEST_CS_SELECTOR, sel);
+        __vmwrite(GUEST_CS_LIMIT, limit);
+        __vmwrite(GUEST_CS_BASE, base);
         __vmwrite(GUEST_CS_AR_BYTES, attr);
         break;
     case x86_seg_ds:
-        __vmwrite(GUEST_DS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_DS_LIMIT, reg->limit);
-        __vmwrite(GUEST_DS_BASE, reg->base);
+        __vmwrite(GUEST_DS_SELECTOR, sel);
+        __vmwrite(GUEST_DS_LIMIT, limit);
+        __vmwrite(GUEST_DS_BASE, base);
         __vmwrite(GUEST_DS_AR_BYTES, attr);
         break;
     case x86_seg_es:
-        __vmwrite(GUEST_ES_SELECTOR, reg->sel);
-        __vmwrite(GUEST_ES_LIMIT, reg->limit);
-        __vmwrite(GUEST_ES_BASE, reg->base);
+        __vmwrite(GUEST_ES_SELECTOR, sel);
+        __vmwrite(GUEST_ES_LIMIT, limit);
+        __vmwrite(GUEST_ES_BASE, base);
         __vmwrite(GUEST_ES_AR_BYTES, attr);
         break;
     case x86_seg_fs:
-        __vmwrite(GUEST_FS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_FS_LIMIT, reg->limit);
-        __vmwrite(GUEST_FS_BASE, reg->base);
+        __vmwrite(GUEST_FS_SELECTOR, sel);
+        __vmwrite(GUEST_FS_LIMIT, limit);
+        __vmwrite(GUEST_FS_BASE, base);
         __vmwrite(GUEST_FS_AR_BYTES, attr);
         break;
     case x86_seg_gs:
-        __vmwrite(GUEST_GS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_GS_LIMIT, reg->limit);
-        __vmwrite(GUEST_GS_BASE, reg->base);
+        __vmwrite(GUEST_GS_SELECTOR, sel);
+        __vmwrite(GUEST_GS_LIMIT, limit);
+        __vmwrite(GUEST_GS_BASE, base);
         __vmwrite(GUEST_GS_AR_BYTES, attr);
         break;
     case x86_seg_ss:
-        __vmwrite(GUEST_SS_SELECTOR, reg->sel);
-        __vmwrite(GUEST_SS_LIMIT, reg->limit);
-        __vmwrite(GUEST_SS_BASE, reg->base);
+        __vmwrite(GUEST_SS_SELECTOR, sel);
+        __vmwrite(GUEST_SS_LIMIT, limit);
+        __vmwrite(GUEST_SS_BASE, base);
         __vmwrite(GUEST_SS_AR_BYTES, attr);
         break;
     case x86_seg_tr:
-        __vmwrite(GUEST_TR_SELECTOR, reg->sel);
-        __vmwrite(GUEST_TR_LIMIT, reg->limit);
-        __vmwrite(GUEST_TR_BASE, reg->base);
+        __vmwrite(GUEST_TR_SELECTOR, sel);
+        __vmwrite(GUEST_TR_LIMIT, limit);
+        __vmwrite(GUEST_TR_BASE, base);
        /* VMX checks that the busy flag (bit 1) is set. */
         __vmwrite(GUEST_TR_AR_BYTES, attr | 2);
         break;
     case x86_seg_gdtr:
-        __vmwrite(GUEST_GDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_GDTR_BASE, reg->base);
+        __vmwrite(GUEST_GDTR_LIMIT, limit);
+        __vmwrite(GUEST_GDTR_BASE, base);
         break;
     case x86_seg_idtr:
-        __vmwrite(GUEST_IDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_IDTR_BASE, reg->base);
+        __vmwrite(GUEST_IDTR_LIMIT, limit);
+        __vmwrite(GUEST_IDTR_BASE, base);
         break;
     case x86_seg_ldtr:
-        __vmwrite(GUEST_LDTR_SELECTOR, reg->sel);
-        __vmwrite(GUEST_LDTR_LIMIT, reg->limit);
-        __vmwrite(GUEST_LDTR_BASE, reg->base);
+        __vmwrite(GUEST_LDTR_SELECTOR, sel);
+        __vmwrite(GUEST_LDTR_LIMIT, limit);
+        __vmwrite(GUEST_LDTR_BASE, base);
         __vmwrite(GUEST_LDTR_AR_BYTES, attr);
         break;
     default:
@@ -970,6 +1061,7 @@ static void vmx_update_guest_cr(struct v
     switch ( cr )
     {
     case 0: {
+        int realmode;
         unsigned long hw_cr0_mask =
             X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
 
@@ -998,9 +1090,44 @@ static void vmx_update_guest_cr(struct v
                 vmx_fpu_enter(v);
         }
 
-        v->arch.hvm_vmx.vmxemul &= ~VMXEMUL_REALMODE;
-        if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) )
-            v->arch.hvm_vmx.vmxemul |= VMXEMUL_REALMODE;
+        realmode = !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE); 
+        if ( realmode != v->arch.hvm_vmx.vmx_realmode )
+        {
+            enum x86_segment s; 
+            struct segment_register reg[x86_seg_tr + 1];
+
+            /* Entering or leaving real mode: adjust the segment registers.
+             * Need to read them all either way, as realmode reads can update
+             * the saved values we'll use when returning to prot mode. */
+            for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+                vmx_get_segment_register(v, s, &reg[s]);
+            v->arch.hvm_vmx.vmx_realmode = realmode;
+            
+            if ( realmode )
+            {
+                for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ )
+                    vmx_set_segment_register(v, s, &reg[s]);
+                v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
+                __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+                __vmwrite(EXCEPTION_BITMAP, 0xffffffff);
+            }
+            else 
+            {
+                for ( s = x86_seg_cs ; s <= x86_seg_tr ; s++ ) 
+                    if ( !(v->arch.hvm_vmx.vm86_segment_mask & (1<<s)) )
+                        vmx_set_segment_register(
+                            v, s, &v->arch.hvm_vmx.vm86_saved_seg[s]);
+                v->arch.hvm_vcpu.hw_cr[4] =
+                    ((v->arch.hvm_vcpu.hw_cr[4] & ~X86_CR4_VME)
+                     |(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_VME));
+                __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
+                __vmwrite(EXCEPTION_BITMAP, 
+                          HVM_TRAP_MASK
+                          | (paging_mode_hap(v->domain) ?
+                             0 : (1U << TRAP_page_fault))
+                          | (1U << TRAP_no_device));
+            }
+        }
 
         v->arch.hvm_vcpu.hw_cr[0] =
             v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
@@ -1028,6 +1155,8 @@ static void vmx_update_guest_cr(struct v
         if ( paging_mode_hap(v->domain) )
             v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
         v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
+        if ( v->arch.hvm_vmx.vmx_realmode ) 
+            v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_VME;
         if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
         {
             v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
@@ -1097,6 +1226,7 @@ static void __vmx_inject_exception(int t
 static void __vmx_inject_exception(int trap, int type, int error_code)
 {
     unsigned long intr_fields;
+    struct vcpu *curr = current;
 
     /*
      * NB. Callers do not need to worry about clearing STI/MOV-SS blocking:
@@ -1113,6 +1243,11 @@ static void __vmx_inject_exception(int t
     }
 
     __vmwrite(VM_ENTRY_INTR_INFO, intr_fields);
+
+    /* Can't inject exceptions in virtual 8086 mode because they would 
+     * use the protected-mode IDT.  Emulate at the next vmenter instead. */
+    if ( curr->arch.hvm_vmx.vmx_realmode ) 
+        curr->arch.hvm_vmx.vmx_emulate = 1;
 }
 
 void vmx_inject_hw_exception(int trap, int error_code)
@@ -2072,6 +2207,17 @@ static void vmx_failed_vmentry(unsigned 
     domain_crash(curr->domain);
 }
 
+asmlinkage void vmx_enter_realmode(struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+
+    /* Adjust RFLAGS to enter virtual 8086 mode with IOPL == 3.  Since
+     * we have CR4.VME == 1 and our own TSS with an empty interrupt
+     * redirection bitmap, all software INTs will be handled by vm86 */
+    v->arch.hvm_vmx.vm86_saved_eflags = regs->eflags;
+    regs->eflags |= (X86_EFLAGS_VM | X86_EFLAGS_IOPL);
+}
+
 asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
 {
     unsigned int exit_reason, idtv_info;
@@ -2099,6 +2245,42 @@ asmlinkage void vmx_vmexit_handler(struc
 
     if ( unlikely(exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) )
         return vmx_failed_vmentry(exit_reason, regs);
+
+    if ( v->arch.hvm_vmx.vmx_realmode )
+    {
+        unsigned int vector;
+
+        /* Put RFLAGS back the way the guest wants it */
+        regs->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IOPL);
+        regs->eflags |= (v->arch.hvm_vmx.vm86_saved_eflags & X86_EFLAGS_IOPL);
+
+        /* Unless this exit was for an interrupt, we've hit something
+         * vm86 can't handle.  Try again, using the emulator. */
+        switch ( exit_reason )
+        {
+        case EXIT_REASON_EXCEPTION_NMI:
+            vector = __vmread(VM_EXIT_INTR_INFO) & INTR_INFO_VECTOR_MASK;
+            if ( vector != TRAP_page_fault
+                 && vector != TRAP_nmi 
+                 && vector != TRAP_machine_check ) 
+            {
+                perfc_incr(realmode_exits);
+                v->arch.hvm_vmx.vmx_emulate = 1;
+                return;
+            }
+        case EXIT_REASON_EXTERNAL_INTERRUPT:
+        case EXIT_REASON_INIT:
+        case EXIT_REASON_SIPI:
+        case EXIT_REASON_PENDING_VIRT_INTR:
+        case EXIT_REASON_PENDING_VIRT_NMI:
+        case EXIT_REASON_MACHINE_CHECK:
+            break;
+        default:
+            v->arch.hvm_vmx.vmx_emulate = 1;
+            perfc_incr(realmode_exits);
+            return;
+        }
+    }
 
     hvm_maybe_deassert_evtchn_irq();
 
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_32/asm-offsets.c Tue Dec 09 16:28:02 2008 +0000
@@ -88,7 +88,9 @@ void __dummy__(void)
     BLANK();
 
     OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
-    OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
+    OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
+    OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
+    OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
     OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
     BLANK();
 
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/x86_64/asm-offsets.c
--- a/xen/arch/x86/x86_64/asm-offsets.c Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_64/asm-offsets.c Tue Dec 09 16:28:02 2008 +0000
@@ -107,7 +107,9 @@ void __dummy__(void)
     BLANK();
 
     OFFSET(VCPU_vmx_launched, struct vcpu, arch.hvm_vmx.launched);
-    OFFSET(VCPU_vmx_emul, struct vcpu, arch.hvm_vmx.vmxemul);
+    OFFSET(VCPU_vmx_realmode, struct vcpu, arch.hvm_vmx.vmx_realmode);
+    OFFSET(VCPU_vmx_emulate, struct vcpu, arch.hvm_vmx.vmx_emulate);
+    OFFSET(VCPU_vm86_seg_mask, struct vcpu, arch.hvm_vmx.vm86_segment_mask);
     OFFSET(VCPU_hvm_guest_cr2, struct vcpu, arch.hvm_vcpu.guest_cr[2]);
     BLANK();
 
diff -r 5535efd8e011 -r 6595393a3d28 xen/arch/x86/x86_emulate/x86_emulate.h
--- a/xen/arch/x86/x86_emulate/x86_emulate.h    Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h    Tue Dec 09 16:28:02 2008 +0000
@@ -67,6 +67,7 @@ typedef union segment_attributes {
         uint16_t l:   1;    /* 9;  Bit 53 */
         uint16_t db:  1;    /* 10; Bit 54 */
         uint16_t g:   1;    /* 11; Bit 55 */
+        uint16_t pad: 4;
     } fields;
 } __attribute__ ((packed)) segment_attributes_t;
 
diff -r 5535efd8e011 -r 6595393a3d28 xen/include/asm-x86/hvm/vmx/vmcs.h
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h        Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h        Tue Dec 09 16:28:02 2008 +0000
@@ -109,11 +109,16 @@ struct arch_vmx_struct {
 
     unsigned long        host_cr0;
 
+    /* Is the guest in real mode? */
+    uint8_t              vmx_realmode;
     /* Are we emulating rather than VMENTERing? */
-#define VMXEMUL_REALMODE 1  /* Yes, because CR0.PE == 0   */
-#define VMXEMUL_BAD_CS   2  /* Yes, because CS.RPL != CPL */
-#define VMXEMUL_BAD_SS   4  /* Yes, because SS.RPL != CPL */
-    uint8_t              vmxemul;
+    uint8_t              vmx_emulate;
+    /* Bitmask of segments that we can't safely use in virtual 8086 mode */
+    uint16_t             vm86_segment_mask;
+    /* Shadow CS, SS, DS, ES, FS, GS, TR while in virtual 8086 mode */
+    struct segment_register vm86_saved_seg[x86_seg_tr + 1];
+    /* Remember EFLAGS while in virtual 8086 mode */
+    uint32_t             vm86_saved_eflags;
 };
 
 int vmx_create_vmcs(struct vcpu *v);
diff -r 5535efd8e011 -r 6595393a3d28 xen/include/asm-x86/perfc_defn.h
--- a/xen/include/asm-x86/perfc_defn.h  Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/asm-x86/perfc_defn.h  Tue Dec 09 16:28:02 2008 +0000
@@ -127,4 +127,7 @@ PERFCOUNTER(mshv_wrmsr_tpr,             
 PERFCOUNTER(mshv_wrmsr_tpr,             "MS Hv wrmsr tpr")
 PERFCOUNTER(mshv_wrmsr_eoi,             "MS Hv wrmsr eoi")
 
+PERFCOUNTER(realmode_emulations, "realmode instructions emulated")
+PERFCOUNTER(realmode_exits,      "vmexits from realmode")
+
 /*#endif*/ /* __XEN_PERFC_DEFN_H__ */
diff -r 5535efd8e011 -r 6595393a3d28 xen/include/public/hvm/params.h
--- a/xen/include/public/hvm/params.h   Tue Dec 09 13:23:15 2008 +0000
+++ b/xen/include/public/hvm/params.h   Tue Dec 09 16:28:02 2008 +0000
@@ -100,6 +100,9 @@
 /* ACPI S state: currently support S0 and S3 on x86. */
 #define HVM_PARAM_ACPI_S_STATE 14
 
-#define HVM_NR_PARAMS          15
+/* TSS used on Intel when CR0.PE=0. */
+#define HVM_PARAM_VM86_TSS     15
+
+#define HVM_NR_PARAMS          16
 
 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
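
Usage note (not part of the changeset): a toolstack built against this
tree can read the new parameter back through libxc, just as
xc_domain_save.c does above.  A minimal sketch, assuming this tree's
libxc interface (an int handle from xc_interface_open()) and omitting
error handling:

    #include <stdio.h>
    #include <xenctrl.h>   /* xc_get_hvm_param(), HVM_PARAM_VM86_TSS */

    static void show_vm86_tss(int xc_handle, uint32_t dom)
    {
        unsigned long tss = 0;

        /* Returns 0 on success; the param stays 0 if hvmloader never set it. */
        if ( xc_get_hvm_param(xc_handle, dom, HVM_PARAM_VM86_TSS, &tss) == 0 &&
             tss != 0 )
            printf("vm86 TSS at guest-physical address %#lx\n", tss);
    }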

