To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] Add a compile time option to enable domain 0 running in ring 0.
From: Xen patchbot -unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Mon, 27 Feb 2006 21:56:06 +0000
Delivery-date: Mon, 27 Feb 2006 21:56:24 +0000
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User kaf24@xxxxxxxxxxxxxxxxxxxx
# Node ID ee8041b0ab86f9315476f718da57af38cbf2eed7
# Parent  6060937db0fe568d34b83d015ebe22b86194faa9
Add a compile time option to enable domain 0 running in ring 0.

In this mode only a single guest kernel is supported.

This mode only works for x86/32 (not x86/64).

Signed-off-by: Ian Campbell <Ian.Campbell@xxxxxxxxxxxxx>
Signed-off-by: Keir Fraser <keir@xxxxxxxxxxxxx>
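
Not part of the patch: the knob is wired up in xen/arch/x86/Rules.mk below and
defaults to off, so a build that wants dom0 in ring 0 has to request it
explicitly. A minimal sketch, assuming the usual top-level make targets:

    make xen supervisor_mode_kernel=y        # from the top of the tree
    make -C xen supervisor_mode_kernel=y     # or just the hypervisor

As the Rules.mk comment notes, changing options like this needs a 'make clean'
before rebuilding.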

diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/Makefile
--- a/xen/arch/x86/Makefile     Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/Makefile     Mon Feb 27 14:52:43 2006
@@ -31,6 +31,10 @@
  else
   OBJS += shadow32.o                   # x86_32: old code
  endif
+endif
+
+ifneq ($(supervisor_mode_kernel),y)
+OBJS := $(subst x86_32/supervisor_mode_kernel.o,,$(OBJS))
 endif
 
 OBJS := $(subst $(TARGET_SUBARCH)/asm-offsets.o,,$(OBJS))
diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/Rules.mk
--- a/xen/arch/x86/Rules.mk     Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/Rules.mk     Mon Feb 27 14:52:43 2006
@@ -6,6 +6,7 @@
 # 'make clean' before rebuilding.
 #
 pae ?= n
+supervisor_mode_kernel ?= n
 
 CFLAGS  += -nostdinc -fno-builtin -fno-common -fno-strict-aliasing
 CFLAGS  += -iwithprefix include -Wall -Werror -Wno-pointer-arith -pipe
@@ -32,6 +33,9 @@
 CFLAGS  += -DCONFIG_X86_PAE=1
 endif
 endif
+ifeq ($(supervisor_mode_kernel),y)
+CFLAGS  += -DCONFIG_X86_SUPERVISOR_MODE_KERNEL=1
+endif
 
 ifeq ($(TARGET_SUBARCH),x86_64)
 CFLAGS  += -m64 -mno-red-zone -fpic -fno-reorder-blocks
diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/domain.c
--- a/xen/arch/x86/domain.c     Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/domain.c     Mon Feb 27 14:52:43 2006
@@ -351,17 +351,17 @@
 
     if ( !(c->flags & VGCF_HVM_GUEST) )
     {
-        fixup_guest_selector(c->user_regs.ss);
-        fixup_guest_selector(c->kernel_ss);
-        fixup_guest_selector(c->user_regs.cs);
+        fixup_guest_stack_selector(c->user_regs.ss);
+        fixup_guest_stack_selector(c->kernel_ss);
+        fixup_guest_code_selector(c->user_regs.cs);
 
 #ifdef __i386__
-        fixup_guest_selector(c->event_callback_cs);
-        fixup_guest_selector(c->failsafe_callback_cs);
+        fixup_guest_code_selector(c->event_callback_cs);
+        fixup_guest_code_selector(c->failsafe_callback_cs);
 #endif
 
         for ( i = 0; i < 256; i++ )
-            fixup_guest_selector(c->trap_ctxt[i].cs);
+            fixup_guest_code_selector(c->trap_ctxt[i].cs);
     }
     else if ( !hvm_enabled )
       return -EINVAL;
@@ -847,7 +847,11 @@
         regs       = guest_cpu_user_regs();
 #if defined(__i386__)
         regs->eax  = op;
-        regs->eip -= 2;  /* re-execute 'int 0x82' */
+
+        if ( supervisor_mode_kernel )
+            regs->eip &= ~31; /* re-execute entire hypercall entry stub */
+        else
+            regs->eip -= 2;   /* re-execute 'int 0x82' */
 
         for ( i = 0; i < nr_args; i++ )
         {
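
Aside (not part of the patch): the "& ~31" works because every transfer point
in the hypercall page is a 32-byte stub at a 32-byte-aligned offset within the
page-aligned page, so clearing the low five bits of the saved eip lands on the
start of the interrupted stub. Illustrative arithmetic with a made-up page
address:

    unsigned long eip = 0xc0101000UL + 3*32 + 13; /* preempted inside stub #3 */
    eip &= ~31UL;                                 /* -> 0xc0101060, the start
                                                     of stub #3, so the whole
                                                     pushf/cli/mov/lcall
                                                     sequence runs again      */
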
diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/domain_build.c
--- a/xen/arch/x86/domain_build.c       Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/domain_build.c       Mon Feb 27 14:52:43 2006
@@ -793,6 +793,17 @@
         update_pagetables(v);
     }
 
+    if ( supervisor_mode_kernel )
+    {
+        v->arch.guest_context.kernel_ss &= ~3;
+        v->arch.guest_context.user_regs.ss &= ~3;
+        v->arch.guest_context.user_regs.es &= ~3;
+        v->arch.guest_context.user_regs.ds &= ~3;
+        v->arch.guest_context.user_regs.fs &= ~3;
+        v->arch.guest_context.user_regs.gs &= ~3;
+        printk("Dom0 runs in ring 0 (supervisor mode)\n");
+    }
+
     rc = 0;
 
     /* DOM0 is permitted full I/O capabilities. */
diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/traps.c      Mon Feb 27 14:52:43 2006
@@ -1429,7 +1429,7 @@
         if ( cur.address == 0 )
             break;
 
-        fixup_guest_selector(cur.cs);
+        fixup_guest_code_selector(cur.cs);
 
         memcpy(&dst[cur.vector], &cur, sizeof(cur));
 
diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/x86_32/asm-offsets.c
--- a/xen/arch/x86/x86_32/asm-offsets.c Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/x86_32/asm-offsets.c Mon Feb 27 14:52:43 2006
@@ -72,6 +72,13 @@
     DEFINE(_VCPUF_nmi_masked, _VCPUF_nmi_masked);
     BLANK();
 
+    OFFSET(TSS_ss0, struct tss_struct, ss0);
+    OFFSET(TSS_esp0, struct tss_struct, esp0);
+    OFFSET(TSS_ss1, struct tss_struct, ss1);
+    OFFSET(TSS_esp1, struct tss_struct, esp1);
+    DEFINE(TSS_sizeof, sizeof(struct tss_struct));
+    BLANK();
+
     OFFSET(VCPU_svm_vmcb_pa, struct vcpu, arch.hvm_svm.vmcb_pa);
     OFFSET(VCPU_svm_hsa_pa,  struct vcpu, arch.hvm_svm.host_save_pa);
     OFFSET(VCPU_svm_vmcb, struct vcpu, arch.hvm_svm.vmcb);
diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/x86_32/entry.S
--- a/xen/arch/x86/x86_32/entry.S       Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/x86_32/entry.S       Mon Feb 27 14:52:43 2006
@@ -77,6 +77,13 @@
 restore_all_guest:
         testl $X86_EFLAGS_VM,UREGS_eflags(%esp)
         jnz  restore_all_vm86
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+        testl $2,UREGS_cs(%esp)
+        jnz   1f
+        call  restore_ring0_guest
+        jmp   restore_all_vm86
+1:
+#endif
 FLT1:   mov  UREGS_ds(%esp),%ds
 FLT2:   mov  UREGS_es(%esp),%es
 FLT3:   mov  UREGS_fs(%esp),%fs
@@ -157,6 +164,7 @@
         ALIGN
 ENTRY(hypercall)
         subl $4,%esp
+        FIXUP_RING0_GUEST_STACK
        SAVE_ALL(b)
         sti
         GET_CURRENT(%ebx)
@@ -294,6 +302,11 @@
         popl %eax
         shll $16,%eax                    # Bits 16-23: saved_upcall_mask
         movw UREGS_cs+4(%esp),%ax        # Bits  0-15: CS
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+        testw $2,%ax
+        jnz  FLT15
+        and  $~3,%ax                     # RPL 1 -> RPL 0
+#endif
 FLT15:  movl %eax,%gs:4(%esi) 
         test $0x00FF0000,%eax            # Bits 16-23: saved_upcall_mask
         setz %ch                         # %ch == !saved_upcall_mask
@@ -388,6 +401,7 @@
        pushl $TRAP_divide_error<<16
        ALIGN
 error_code:
+        FIXUP_RING0_GUEST_STACK
         SAVE_ALL_NOSEGREGS(a)
         SET_XEN_SEGMENTS(a)
         testb $X86_EFLAGS_IF>>8,UREGS_eflags+1(%esp)
@@ -505,6 +519,10 @@
        jmp error_code
 
 ENTRY(nmi)
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+        # NMI entry protocol is incompatible with guest kernel in ring 0.
+        iret
+#else
         # Save state but do not trash the segment registers!
         # We may otherwise be unable to reload them or copy them to ring 1. 
        pushl %eax
@@ -546,6 +564,7 @@
         movl  $(APIC_DM_FIXED | APIC_DEST_SELF | APIC_DEST_LOGICAL | \
                 TRAP_deferred_nmi),%ss:APIC_ICR(%eax)
         jmp   restore_all_xen
+#endif /* !CONFIG_X86_SUPERVISOR_MODE_KERNEL */
 
 ENTRY(setup_vm86_frame)
         # Copies the entire stack frame forwards by 16 bytes.
diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/x86_32/mm.c
--- a/xen/arch/x86/x86_32/mm.c  Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/x86_32/mm.c  Mon Feb 27 14:52:43 2006
@@ -180,6 +180,15 @@
             page_set_owner(page, dom_xen);
         }
     }
+
+    if ( supervisor_mode_kernel )
+    {
+        /* Guest kernel runs in ring 0, not ring 1. */
+        struct desc_struct *d;
+        d = &gdt_table[(FLAT_RING1_CS >> 3) - FIRST_RESERVED_GDT_ENTRY];
+        d[0].b &= ~_SEGMENT_DPL;
+        d[1].b &= ~_SEGMENT_DPL;
+    }
 }
 
 long subarch_memory_op(int op, void *arg)
@@ -223,7 +232,7 @@
     int nr = smp_processor_id();
     struct tss_struct *t = &init_tss[nr];
 
-    fixup_guest_selector(ss);
+    fixup_guest_stack_selector(ss);
 
     current->arch.guest_context.kernel_ss = ss;
     current->arch.guest_context.kernel_sp = esp;
@@ -239,6 +248,10 @@
     unsigned long base, limit;
     u32 a = d->a, b = d->b;
     u16 cs;
+
+    /* Let a ring0 guest kernel set any descriptor it wants to. */
+    if ( supervisor_mode_kernel )
+        return 1;
 
     /* A not-present descriptor will always fault, so is safe. */
     if ( !(b & _SEGMENT_P) ) 
@@ -273,7 +286,7 @@
 
         /* Validate and fix up the target code selector. */
         cs = a >> 16;
-        fixup_guest_selector(cs);
+        fixup_guest_code_selector(cs);
         if ( !guest_gate_selector_okay(cs) )
             goto bad;
         a = d->a = (d->a & 0xffffU) | (cs << 16);
diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/x86_32/traps.c
--- a/xen/arch/x86/x86_32/traps.c       Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/x86_32/traps.c       Mon Feb 27 14:52:43 2006
@@ -256,8 +256,14 @@
      * We can't virtualise interrupt gates, as there's no way to get
      * the CPU to automatically clear the events_mask variable. Also we
      * must ensure that the CS is safe to poke into an interrupt gate.
-     */
-    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) )
+     *
+     * When running with supervisor_mode_kernel enabled a direct trap
+     * to the guest OS cannot be used because the INT instruction will
+     * switch to the Xen stack and we need to swap back to the guest
+     * kernel stack before passing control to the system call entry point.
+     */
+    if ( TI_GET_IF(ti) || !guest_gate_selector_okay(ti->cs) ||
+         supervisor_mode_kernel )
     {
         v->arch.int80_desc.a = v->arch.int80_desc.b = 0;
         return;
@@ -278,8 +284,8 @@
 {
     struct vcpu *d = current;
 
-    fixup_guest_selector(event_selector);
-    fixup_guest_selector(failsafe_selector);
+    fixup_guest_code_selector(event_selector);
+    fixup_guest_code_selector(failsafe_selector);
 
     d->arch.guest_context.event_callback_cs     = event_selector;
     d->arch.guest_context.event_callback_eip    = event_address;
@@ -289,12 +295,51 @@
     return 0;
 }
 
-void hypercall_page_initialise(void *hypercall_page)
-{
+static void hypercall_page_initialise_ring0_kernel(void *hypercall_page)
+{
+    extern asmlinkage int hypercall(void);
     char *p;
     int i;
 
     /* Fill in all the transfer points with template machine code. */
+
+    for ( i = 0; i < NR_hypercalls; i++ )
+    {
+        p = (char *)(hypercall_page + (i * 32));
+
+        *(u8  *)(p+ 0) = 0x9c;      /* pushf */
+        *(u8  *)(p+ 1) = 0xfa;      /* cli */
+        *(u8  *)(p+ 2) = 0xb8;      /* mov $<i>,%eax */
+        *(u32 *)(p+ 3) = i;
+        *(u8  *)(p+ 7) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
+        *(u32 *)(p+ 8) = (u32)&hypercall;
+        *(u16 *)(p+12) = (u16)__HYPERVISOR_CS;
+        *(u8  *)(p+14) = 0xc3;      /* ret */
+    }
+
+    /*
+     * HYPERVISOR_iret is special because it doesn't return and expects a
+     * special stack frame. Guests jump at this transfer point instead of
+     * calling it.
+     */
+    p = (char *)(hypercall_page + (__HYPERVISOR_iret * 32));
+    *(u8  *)(p+ 0) = 0x50;      /* push %eax */
+    *(u8  *)(p+ 1) = 0x9c;      /* pushf */
+    *(u8  *)(p+ 2) = 0xfa;      /* cli */
+    *(u8  *)(p+ 3) = 0xb8;      /* mov $<i>,%eax */
+    *(u32 *)(p+ 4) = __HYPERVISOR_iret;
+    *(u8  *)(p+ 8) = 0x9a;      /* lcall $__HYPERVISOR_CS,&hypercall */
+    *(u32 *)(p+ 9) = (u32)&hypercall;
+    *(u16 *)(p+13) = (u16)__HYPERVISOR_CS;
+}
+
+static void hypercall_page_initialise_ring1_kernel(void *hypercall_page)
+{
+    char *p;
+    int i;
+
+    /* Fill in all the transfer points with template machine code. */
+
     for ( i = 0; i < (PAGE_SIZE / 32); i++ )
     {
         p = (char *)(hypercall_page + (i * 32));
@@ -314,6 +359,14 @@
     *(u8  *)(p+ 1) = 0xb8;    /* mov  $__HYPERVISOR_iret,%eax */
     *(u32 *)(p+ 2) = __HYPERVISOR_iret;
     *(u16 *)(p+ 6) = 0x82cd;  /* int  $0x82 */
+}
+
+void hypercall_page_initialise(void *hypercall_page)
+{
+    if ( supervisor_mode_kernel )
+        hypercall_page_initialise_ring0_kernel(hypercall_page);
+    else
+        hypercall_page_initialise_ring1_kernel(hypercall_page);
 }
 
 /*
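
Aside (not part of the patch): with either flavour of stub the guest reaches
hypercall <nr> by calling into the corresponding 32-byte slot of its hypercall
page; the stub loads %eax and transfers to Xen (lcall for a ring-0 kernel,
int $0x82 for a ring-1 kernel). A guest-side sketch with hypothetical names,
using the x86_32 convention of passing the first argument in %ebx:

    static inline long
    hypercall1(void *hypercall_page, unsigned int nr, unsigned long arg1)
    {
        long ret;
        asm volatile ( "call *%[entry]"
                       : "=a" (ret), "+b" (arg1)  /* result in %eax; %ebx may
                                                     be clobbered by Xen      */
                       : [entry] "rm" ((char *)hypercall_page + nr * 32)
                       : "memory" );
        return ret;
    }
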
diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/x86_64/mm.c
--- a/xen/arch/x86/x86_64/mm.c  Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/x86_64/mm.c  Mon Feb 27 14:52:43 2006
@@ -228,7 +228,7 @@
 
 long do_stack_switch(unsigned long ss, unsigned long esp)
 {
-    fixup_guest_selector(ss);
+    fixup_guest_stack_selector(ss);
     current->arch.guest_context.kernel_ss = ss;
     current->arch.guest_context.kernel_sp = esp;
     return 0;
@@ -315,7 +315,7 @@
 
     /* Validate and fix up the target code selector. */
     cs = a >> 16;
-    fixup_guest_selector(cs);
+    fixup_guest_code_selector(cs);
     if ( !guest_gate_selector_okay(cs) )
         goto bad;
     a = d->a = (d->a & 0xffffU) | (cs << 16);
diff -r 6060937db0fe -r ee8041b0ab86 xen/common/dom0_ops.c
--- a/xen/common/dom0_ops.c     Mon Feb 27 11:02:00 2006
+++ b/xen/common/dom0_ops.c     Mon Feb 27 14:52:43 2006
@@ -170,6 +170,13 @@
         cpumask_t      cpu_exclude_map;
         static domid_t rover = 0;
 
+        /*
+         * Running the domain 0 kernel in ring 0 is not compatible
+         * with multiple guests.
+         */
+        if ( supervisor_mode_kernel )
+            return -EINVAL;
+
         dom = op->u.createdomain.domain;
         if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) )
         {
diff -r 6060937db0fe -r ee8041b0ab86 xen/common/kernel.c
--- a/xen/common/kernel.c       Mon Feb 27 11:02:00 2006
+++ b/xen/common/kernel.c       Mon Feb 27 14:52:43 2006
@@ -195,6 +195,8 @@
                     (1U << XENFEAT_writable_page_tables) |
                     (1U << XENFEAT_auto_translated_physmap) |
                     (1U << XENFEAT_pae_pgdir_above_4gb);
+            if ( supervisor_mode_kernel )
+                fi.submap |= 1U << XENFEAT_supervisor_mode_kernel;
             break;
         default:
             return -EINVAL;
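
Aside (not part of the patch): a guest kernel can discover this capability at
run time through the existing XENVER_get_features interface, along the lines
of:

    xen_feature_info_t fi = { .submap_idx = 0 };

    if ( (HYPERVISOR_xen_version(XENVER_get_features, &fi) == 0) &&
         (fi.submap & (1U << XENFEAT_supervisor_mode_kernel)) )
        /* the hypervisor was built with supervisor_mode_kernel=y */;
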
diff -r 6060937db0fe -r ee8041b0ab86 xen/include/asm-ia64/config.h
--- a/xen/include/asm-ia64/config.h     Mon Feb 27 11:02:00 2006
+++ b/xen/include/asm-ia64/config.h     Mon Feb 27 14:52:43 2006
@@ -39,6 +39,8 @@
 //#define CONFIG_NR_CPUS 16
 //leave SMP for a later time
 //#undef CONFIG_SMP
+
+#define supervisor_mode_kernel (0)
 
 #define MAX_DMADOM_PFN (0x7FFFFFFFUL >> PAGE_SHIFT) /* 31 addressable bits */
 
diff -r 6060937db0fe -r ee8041b0ab86 xen/include/asm-x86/config.h
--- a/xen/include/asm-x86/config.h      Mon Feb 27 11:02:00 2006
+++ b/xen/include/asm-x86/config.h      Mon Feb 27 14:52:43 2006
@@ -36,6 +36,12 @@
 #define OPT_CONSOLE_STR "com1,vga"
 
 #define NR_CPUS 32
+
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+# define supervisor_mode_kernel (1)
+#else
+# define supervisor_mode_kernel (0)
+#endif
 
 /* Linkage for x86 */
 #define __ALIGN .align 16,0x90
diff -r 6060937db0fe -r ee8041b0ab86 xen/include/asm-x86/desc.h
--- a/xen/include/asm-x86/desc.h        Mon Feb 27 11:02:00 2006
+++ b/xen/include/asm-x86/desc.h        Mon Feb 27 14:52:43 2006
@@ -27,9 +27,22 @@
 #endif
 
 /* Fix up the RPL of a guest segment selector. */
-#define fixup_guest_selector(sel)                               \
+#define __fixup_guest_selector(sel)                             \
     ((sel) = (((sel) & 3) >= GUEST_KERNEL_RPL) ? (sel) :        \
      (((sel) & ~3) | GUEST_KERNEL_RPL))
+
+/* Stack selectors don't need fixing up if the kernel runs in ring 0. */
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+#define fixup_guest_stack_selector(ss) ((void)0)
+#else
+#define fixup_guest_stack_selector(ss) __fixup_guest_selector(ss)
+#endif
+
+/*
+ * Code selectors are always fixed up. It allows the Xen exit stub to detect
+ * return to guest context, even when the guest kernel runs in ring 0.
+ */
+#define fixup_guest_code_selector(cs)  __fixup_guest_selector(cs)
 
 /*
  * We need this function because enforcing the correct guest kernel RPL is
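
Aside (not part of the patch): the effect of the split, assuming the x86_32
value GUEST_KERNEL_RPL == 1 and made-up selector values:

    unsigned int cs = 0xe008;          /* guest kernel %cs, RPL 0            */
    fixup_guest_code_selector(cs);     /* always -> 0xe009, so the exit stub
                                          can tell guest frames from Xen's   */

    unsigned int ss = 0xe010;          /* guest kernel %ss, RPL 0            */
    fixup_guest_stack_selector(ss);    /* ring-1 build: -> 0xe011;
                                          supervisor_mode_kernel build: left
                                          as 0xe010, a ring-0 stack selector */
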
diff -r 6060937db0fe -r ee8041b0ab86 xen/include/asm-x86/x86_32/asm_defns.h
--- a/xen/include/asm-x86/x86_32/asm_defns.h    Mon Feb 27 11:02:00 2006
+++ b/xen/include/asm-x86/x86_32/asm_defns.h    Mon Feb 27 14:52:43 2006
@@ -48,9 +48,24 @@
 
 #ifdef PERF_COUNTERS
 #define PERFC_INCR(_name,_idx)                          \
-    lock incl perfcounters+_name(,_idx,4)
+        lock incl perfcounters+_name(,_idx,4)
 #else
 #define PERFC_INCR(_name,_idx)
+#endif
+
+#ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+#define FIXUP_RING0_GUEST_STACK                         \
+        testl $2,8(%esp);                               \
+        jnz 1f; /* rings 2 & 3 permitted */             \
+        testl $1,8(%esp);                               \
+        jz 2f;                                          \
+        ud2; /* ring 1 should not be used */            \
+        2:cmpl $(__HYPERVISOR_VIRT_START),%esp;         \
+        jge 1f;                                         \
+        call fixup_ring0_guest_stack;                   \
+        1:
+#else
+#define FIXUP_RING0_GUEST_STACK
 #endif
 
 #define BUILD_SMP_INTERRUPT(x,v) XBUILD_SMP_INTERRUPT(x,v)
@@ -61,6 +76,7 @@
     ".globl " STR(x) "\n\t"                     \
     STR(x) ":\n\t"                              \
     "pushl $"#v"<<16\n\t"                       \
+    STR(FIXUP_RING0_GUEST_STACK)                \
     STR(SAVE_ALL(a))                            \
     "movl %esp,%eax\n\t"                        \
     "pushl %eax\n\t"                            \
@@ -72,6 +88,7 @@
 __asm__(                                        \
     "\n" __ALIGN_STR"\n"                        \
     "common_interrupt:\n\t"                     \
+    STR(FIXUP_RING0_GUEST_STACK)                \
     STR(SAVE_ALL(a))                            \
     "movl %esp,%eax\n\t"                        \
     "pushl %eax\n\t"                            \
diff -r 6060937db0fe -r ee8041b0ab86 xen/arch/x86/x86_32/supervisor_mode_kernel.S
--- /dev/null   Mon Feb 27 11:02:00 2006
+++ b/xen/arch/x86/x86_32/supervisor_mode_kernel.S      Mon Feb 27 14:52:43 2006
@@ -0,0 +1,145 @@
+/*
+ * Handle stack fixup for guest running in RING 0.
+ *
+ * Copyright (c) 2006 Ian Campbell
+ *
+ * When a guest kernel is allowed to run in RING 0 a hypercall,
+ * interrupt or exception interrupting the guest kernel will not cause
+ * a privilege level change and therefore the stack will not be swapped
+ * to the Xen stack.
+ *
+ * To fix this we look for RING 0 activation frames with a stack
+ * pointer below HYPERVISOR_VIRT_START (indicating a guest kernel
+ * frame) and fix this up by locating the Xen stack via the TSS
+ * and moving the activation frame to the Xen stack. In the process we
+ * convert the frame into an inter-privilege frame returning to RING 1
+ * so that we can catch and reverse the process on exit.
+ */
+
+#include <xen/config.h>
+#include <asm/asm_defns.h>
+#include <public/xen.h>
+
+        # Upon entry the stack should be the Xen stack and contain:
+        #   %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, SAVE_ALL, RETURN
+        # On exit the stack should be %ss:%esp (i.e. the guest stack)
+        # and contain:
+        #   EFLAGS, %cs, %eip, ERROR, SAVE_ALL, RETURN
+        ALIGN
+ENTRY(restore_ring0_guest)
+        # Point %gs:%esi to guest stack.
+RRG0:   movw UREGS_ss+4(%esp),%gs
+        movl UREGS_esp+4(%esp),%esi
+
+        # Copy EFLAGS...EBX, RETURN from Xen stack to guest stack.
+        movl $(UREGS_kernel_sizeof>>2)+1,%ecx
+
+1:      subl $4,%esi
+        movl -4(%esp,%ecx,4),%eax
+RRG1:   movl %eax,%gs:(%esi)
+        loop 1b
+
+RRG2:   andl $~3,%gs:UREGS_cs+4(%esi)
+
+        movl %gs,%eax
+
+        # We need to do this because these registers are not present
+        # on the guest stack so they cannot be restored by the code in
+        # restore_all_guest.
+RRG3:   mov  UREGS_ds+4(%esp),%ds
+RRG4:   mov  UREGS_es+4(%esp),%es
+RRG5:   mov  UREGS_fs+4(%esp),%fs
+RRG6:   mov  UREGS_gs+4(%esp),%gs
+
+RRG7:   movl %eax,%ss
+        movl %esi,%esp
+
+        ret
+.section __ex_table,"a"
+        .long RRG0,domain_crash_synchronous
+        .long RRG1,domain_crash_synchronous
+        .long RRG2,domain_crash_synchronous
+        .long RRG3,domain_crash_synchronous
+        .long RRG4,domain_crash_synchronous
+        .long RRG5,domain_crash_synchronous
+        .long RRG6,domain_crash_synchronous
+        .long RRG7,domain_crash_synchronous
+.previous
+
+        # Upon entry the stack should be a guest stack and contain:
+        #   EFLAGS, %cs, %eip, ERROR, RETURN
+        # On exit the stack should be the Xen stack and contain:
+        #   %ss, %esp, EFLAGS, %cs|1, %eip, ERROR, RETURN
+        ALIGN
+ENTRY(fixup_ring0_guest_stack)
+        pushl %eax
+        pushl %ecx
+        pushl %ds
+        pushl %gs
+        pushl %esi
+
+        movw  $__HYPERVISOR_DS,%ax
+        movw  %ax,%ds
+
+        # Point %gs:%esi to guest stack frame.
+        movw  %ss,%ax
+        movw  %ax,%gs
+        movl  %esp,%esi
+        # Account for entries on the guest stack:
+        # * Pushed by normal exception/interrupt/hypercall mechanisms
+        #   * EFLAGS, %cs, %eip, ERROR == 4 words.
+        # * Pushed by the fixup routine
+        #   * [RETURN], %eax, %ecx, %ds, %gs and %esi == 6 words.
+        addl $((6+4)*4),%esi
+
+        # %gs:%esi now points to the guest stack before the
+        # interrupt/exception occurred.
+
+        /*
+         * Reverse the __TSS macro, giving us the CPU number.
+         * The TSS for this cpu is at init_tss + ( cpu * 128 ).
+         */
+        str   %ecx
+        shrl  $3,%ecx                                   # Calculate GDT index for TSS.
+        subl  $(FIRST_RESERVED_GDT_ENTRY+8),%ecx        # %ecx = 2*cpu.
+        shll  $6,%ecx                                   # Each TSS entry is 0x80 bytes
+        addl  $init_tss,%ecx                            # but we have 2*cpu from above.
+
+        # Load Xen stack from TSS.
+        movw  TSS_ss0(%ecx),%ax
+TRP1:   movw  %ax,%ss
+        movl  TSS_esp0(%ecx),%esp
+
+        pushl %gs
+        pushl %esi
+
+        # Move EFLAGS, %cs, %eip, ERROR, RETURN, %eax, %ecx, %ds, %gs, %esi
+        # from guest stack to Xen stack.
+        movl  $10,%ecx
+1:      subl  $4,%esp
+        subl  $4,%esi
+TRP2:   movl  %gs:(%esi),%eax
+        movl  %eax,(%esp)
+        loop  1b
+
+        # CS = CS|1 to simulate RING1 stack frame.
+        orl   $1,32(%esp)
+
+        popl  %esi
+        popl  %gs
+        popl  %ds
+        popl  %ecx
+        popl  %eax
+        ret
+.section __ex_table,"a"
+        .long TRP1,domain_crash_synchronous
+        .long TRP2,domain_crash_synchronous
+.previous
+
+domain_crash_synchronous_string:
+        .asciz "domain_crash_sync called from supervisor_mode_kernel.S (%lx)\n"
+
+domain_crash_synchronous:
+        pushl $domain_crash_synchronous_string
+        call  printf
+        jmp   __domain_crash_synchronous
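
Aside (not part of the patch): the str/shr/sub/shl sequence above simply
inverts the __TSS() GDT layout described in its comments. Worked through for
cpu 3 (symbolic, assuming nothing beyond the 0x80-byte TSS stride stated
above):

    TR                                   = (FIRST_RESERVED_GDT_ENTRY + 8 + 2*3) << 3
    TR >> 3                              =  FIRST_RESERVED_GDT_ENTRY + 8 + 6
    ... - (FIRST_RESERVED_GDT_ENTRY + 8) =  6                  (i.e. 2*cpu)
    6 << 6                               =  384 = 3 * 0x80     (byte offset)
    init_tss + 384                       = &init_tss[3]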

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
