[Xen-devel] [PATCH] x86-64: syscall/sysenter support for 32-bit apps

To: <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH] x86-64: syscall/sysenter support for 32-bit apps
From: "Jan Beulich" <jbeulich@xxxxxxxxxx>
Date: Thu, 05 Jul 2007 14:07:56 +0100
Delivery-date: Thu, 05 Jul 2007 06:04:48 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
This provides syscall/sysenter support for 32-bit apps both in 64-bit PV guests and in 32on64 guests.

While I implemented both a hypercall interface and MSR emulation for registering the entry points, I'm not really sure both mechanisms need to be there.
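
For illustration, a guest kernel could use either mechanism roughly as in the sketch below. This is hypothetical guest-side code, not part of this patch: it assumes a Linux-style HYPERVISOR_callback_op() wrapper and an entry stub named entry_syscall32, and uses the CALLBACKTYPE_syscall32 and MSR_CSTAR handling introduced further down.

    /*
     * Hypothetical guest-side registration of the 32-bit syscall entry
     * point -- illustrative only, not part of this patch.
     */
    #include <xen/interface/callback.h>

    extern void entry_syscall32(void);        /* assumed guest entry stub */

    static void setup_syscall32(void)
    {
        struct callback_register cb = {
            .type    = CALLBACKTYPE_syscall32,
            .flags   = CALLBACKF_mask_events, /* enter with events masked */
            .address = (unsigned long)entry_syscall32,
        };

        /* Mechanism 1: the explicit hypercall interface. */
        if ( HYPERVISOR_callback_op(CALLBACKOP_register, &cb) != 0 )
            /*
             * Mechanism 2: MSR emulation -- the WRMSR traps to Xen, which
             * emulates it in emulate_privileged_op() as added below.
             */
            wrmsrl(MSR_CSTAR, (unsigned long)entry_syscall32);
    }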

This depends on more than just guest_context being saved/restored as guest state during save/restore/migrate (namely on the new fields holding the callback addresses), which isn't implemented yet (and which I likely won't do myself).

Since the 32-bit kernel doesn't make use of syscall (it would now be possible to do so when running on a 64-bit hypervisor), the compat-mode guest code path for syscall wasn't tested.
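
Should anyone want to exercise that path, a minimal 32-bit test might look like the following (hypothetical; it assumes an AMD CPU, since SYSCALL from compatibility mode is invalid on Intel, and a guest kernel that registered a syscall32 callback as sketched above so EFER.SCE is enabled for the vcpu):

    /*
     * Hypothetical 32-bit test program (build with gcc -m32): issues the
     * SYSCALL instruction directly from compatibility mode, which should
     * now bounce through the guest kernel's syscall32 callback.
     */
    #include <stdio.h>

    int main(void)
    {
        long ret = 20;   /* __NR_getpid on 32-bit Linux */
        /* SYSCALL in compatibility mode clobbers %ecx (return EIP). */
        asm volatile("syscall" : "+a" (ret) : : "ecx", "memory");
        printf("getpid via syscall: %ld\n", ret);
        return 0;
    }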

Signed-off-by: Jan Beulich <jbeulich@xxxxxxxxxx>

Index: 2007-07-03/xen/arch/x86/domain.c
===================================================================
--- 2007-07-03.orig/xen/arch/x86/domain.c       2007-06-22 16:57:45.000000000 +0200
+++ 2007-07-03/xen/arch/x86/domain.c    2007-07-03 10:39:13.000000000 +0200
@@ -395,6 +395,12 @@ int vcpu_initialise(struct vcpu *v)
     v->arch.perdomain_ptes =
         d->arch.mm_perdomain_pt + (v->vcpu_id << GDT_LDT_VCPU_SHIFT);
 
+#ifdef __x86_64__
+    v->arch.sysexit_cs = 3;
+    v->arch.syscall_eflags_mask = X86_EFLAGS_DF|X86_EFLAGS_TF|X86_EFLAGS_NT|
+                                  X86_EFLAGS_RF|X86_EFLAGS_VM;
+#endif
+
     return (is_pv_32on64_vcpu(v) ? setup_compat_l4(v) : 0);
 }
 
@@ -607,7 +613,18 @@ int arch_set_info_guest(
         v->arch.flags |= TF_kernel_mode;
 
     if ( !compat )
+    {
         memcpy(&v->arch.guest_context, c.nat, sizeof(*c.nat));
+#ifdef __x86_64__
+        /*
+         * Despite not being correct, be backwards compatible - most
+         * importantly in order to prevent the guest from being crashed
+         * due to use of syscall from compatibility mode when the kernel
+         * didn't set the compatibility mode callback.
+         */
+        v->arch.syscall32_callback_eip = c.nat->syscall_callback_eip;
+#endif
+    }
 #ifdef CONFIG_COMPAT
     else
     {
@@ -1274,7 +1291,9 @@ void context_switch(struct vcpu *prev, s
             local_flush_tlb_one(GDT_VIRT_START(next) +
                                 FIRST_RESERVED_GDT_BYTE);
 
-            if ( !is_pv_32on64_vcpu(next) == !(efer & EFER_SCE) )
+            if ( (!is_pv_32on64_vcpu(next)
+                  || (next->arch.syscall32_callback_cs & ~3)) ==
+                 !(efer & EFER_SCE) )
                 write_efer(efer ^ EFER_SCE);
         }
 #endif
Index: 2007-07-03/xen/arch/x86/traps.c
===================================================================
--- 2007-07-03.orig/xen/arch/x86/traps.c        2007-07-03 10:35:22.000000000 +0200
+++ 2007-07-03/xen/arch/x86/traps.c     2007-07-04 13:21:20.000000000 +0200
@@ -609,16 +609,21 @@ static int emulate_forced_invalid_op(str
         clear_bit(X86_FEATURE_DE,  &d);
         clear_bit(X86_FEATURE_PSE, &d);
         clear_bit(X86_FEATURE_PGE, &d);
+        if ( !cpu_has_sep )
+            clear_bit(X86_FEATURE_SEP, &d);
+#ifdef __i386__
         if ( !supervisor_mode_kernel )
             clear_bit(X86_FEATURE_SEP, &d);
+#endif
         if ( !IS_PRIV(current->domain) )
             clear_bit(X86_FEATURE_MTRR, &d);
     }
     else if ( regs->eax == 0x80000001 )
     {
         /* Modify Feature Information. */
-        if ( is_pv_32bit_vcpu(current) )
-            clear_bit(X86_FEATURE_SYSCALL % 32, &d);
+#ifdef __i386__
+        clear_bit(X86_FEATURE_SYSCALL % 32, &d);
+#endif
         clear_bit(X86_FEATURE_RDTSCP % 32, &d);
     }
     else
@@ -1695,6 +1700,8 @@ static int emulate_privileged_op(struct 
         break;
 
     case 0x30: /* WRMSR */
+        data = regs->eax;
+        res = ((u64)regs->edx << 32) | data;
         switch ( regs->ecx )
         {
 #ifdef CONFIG_X86_64
@@ -1703,24 +1710,87 @@ static int emulate_privileged_op(struct 
                 goto fail;
             if ( wrmsr_safe(MSR_FS_BASE, regs->eax, regs->edx) )
                 goto fail;
-            v->arch.guest_context.fs_base =
-                ((u64)regs->edx << 32) | regs->eax;
+            v->arch.guest_context.fs_base = res;
             break;
         case MSR_GS_BASE:
             if ( is_pv_32on64_vcpu(v) )
                 goto fail;
             if ( wrmsr_safe(MSR_GS_BASE, regs->eax, regs->edx) )
                 goto fail;
-            v->arch.guest_context.gs_base_kernel =
-                ((u64)regs->edx << 32) | regs->eax;
+            v->arch.guest_context.gs_base_kernel = res;
             break;
         case MSR_SHADOW_GS_BASE:
             if ( is_pv_32on64_vcpu(v) )
                 goto fail;
             if ( wrmsr_safe(MSR_SHADOW_GS_BASE, regs->eax, regs->edx) )
                 goto fail;
-            v->arch.guest_context.gs_base_user =
-                ((u64)regs->edx << 32) | regs->eax;
+            v->arch.guest_context.gs_base_user = res;
+            break;
+        case MSR_STAR:
+            if ( is_pv_32on64_vcpu(v) )
+            {
+                v->arch.syscall32_callback_eip = data;
+                v->arch.syscall32_callback_cs = (uint16_t)regs->edx;
+                fixup_guest_code_selector(v->domain,
+                                          v->arch.syscall32_callback_cs);
+            }
+            break;
+        case MSR_LSTAR:
+            if ( is_pv_32on64_vcpu(v) || !is_canonical_address(res) )
+                goto fail;
+            v->arch.guest_context.syscall_callback_eip = res;
+            break;
+        case MSR_CSTAR:
+            if ( is_pv_32on64_vcpu(v) || !is_canonical_address(res) )
+                goto fail;
+            v->arch.syscall32_callback_eip = res;
+            break;
+        case MSR_SYSCALL_MASK:
+            if ( is_pv_32on64_vcpu(v) || (uint32_t)regs->edx != 0 )
+                goto fail;
+            v->arch.syscall_eflags_mask = data &
+                                          ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+            if ( data & X86_EFLAGS_IF )
+            {
+                set_bit(_VGCF_syscall_disables_events,
+                        &v->arch.guest_context.flags);
+                v->arch.syscall32_disables_events = 1;
+            }
+            else
+            {
+                clear_bit(_VGCF_syscall_disables_events,
+                          &v->arch.guest_context.flags);
+                v->arch.syscall32_disables_events = 0;
+            }
+            break;
+        case MSR_IA32_SYSENTER_CS:
+            if ( is_pv_32on64_vcpu(v) )
+            {
+                v->arch.sysenter_callback_cs = data;
+                fixup_guest_code_selector(v->domain,
+                                          v->arch.sysenter_callback_cs);
+                /*
+                 * While this doesn't match real SYSENTER behavior, the guest
+                 * generally doesn't have a need to switch stacks (or anything
+                 * else that needs to keep interrupts disabled). If the guest
+                 * really needs interrupts disabled on entry, it can still use
+                 * the corresponding hypercall.
+                 */
+                v->arch.sysenter_disables_events = 0;
+            }
+            v->arch.sysexit_cs = (data + 16) | 3;
+            break;
+        case MSR_IA32_SYSENTER_EIP:
+            if ( !is_pv_32on64_vcpu(v) && !is_canonical_address(res) )
+                goto fail;
+            v->arch.sysenter_callback_eip = is_pv_32on64_vcpu(v) ? data : res;
+            if ( !is_pv_32on64_vcpu(v) )
+                /* See comment above. */
+                v->arch.sysenter_disables_events = 0;
+            break;
+        case MSR_IA32_SYSENTER_ESP:
+            if ( !is_pv_32on64_vcpu(v) && !is_canonical_address(res) )
+                goto fail;
             break;
 #endif
         default:
@@ -1758,6 +1828,53 @@ static int emulate_privileged_op(struct 
             regs->eax = v->arch.guest_context.gs_base_user & 0xFFFFFFFFUL;
             regs->edx = v->arch.guest_context.gs_base_user >> 32;
             break;
+        case MSR_STAR:
+            if ( is_pv_32on64_vcpu(v) )
+            {
+                regs->eax = v->arch.syscall32_callback_eip;
+                regs->edx = v->arch.syscall32_callback_cs |
+                            (FLAT_COMPAT_USER_CS << 16);
+            }
+            else
+                regs->edx = FLAT_KERNEL_CS64 | (FLAT_USER_CS64 << 16);
+            break;
+        case MSR_LSTAR:
+            if ( is_pv_32on64_vcpu(v) )
+                goto fail;
+            regs->eax = (uint32_t)v->arch.guest_context.syscall_callback_eip;
+            regs->edx = v->arch.guest_context.syscall_callback_eip >> 32;
+            break;
+        case MSR_CSTAR:
+            if ( is_pv_32on64_vcpu(v) )
+                goto fail;
+            regs->eax = (uint32_t)v->arch.syscall32_callback_eip;
+            regs->edx = v->arch.syscall32_callback_eip >> 32;
+            break;
+        case MSR_SYSCALL_MASK:
+            if ( is_pv_32on64_vcpu(v) )
+                goto fail;
+            data = v->arch.syscall_eflags_mask;
+            if ( test_bit(_VGCF_syscall_disables_events,
+                          &v->arch.guest_context.flags) )
+                data |= X86_EFLAGS_IF;
+            regs->eax = data;
+            regs->edx = 0;
+            break;
+        case MSR_IA32_SYSENTER_CS:
+            if ( is_pv_32on64_vcpu(v) )
+                regs->eax = v->arch.sysenter_callback_cs;
+            else
+                regs->eax = FLAT_KERNEL_CS64;
+            regs->edx = 0;
+            break;
+        case MSR_IA32_SYSENTER_EIP:
+            regs->eax = (uint32_t)v->arch.sysenter_callback_eip;
+            regs->edx = v->arch.sysenter_callback_eip >> 32;
+            break;
+        case MSR_IA32_SYSENTER_ESP:
+            regs->eax = (uint32_t)v->arch.guest_context.kernel_sp;
+            regs->edx = v->arch.guest_context.kernel_sp >> 32;
+            break;
 #endif
         case MSR_EFER:
             if ( rdmsr_safe(regs->ecx, regs->eax, regs->edx) )
@@ -2026,6 +2143,13 @@ asmlinkage int do_debug(struct cpu_user_
 
     if ( !guest_mode(regs) )
     {
+#ifdef __x86_64__
+        /*
+         * Single stepping across sysenter must not result in the single step
+         * flag being lost: record it here for create_bounce_frame to pick up.
+         */
+        v->arch.eflags_mask |= (regs->eflags & EF_TF);
+#endif
         /* Clear TF just for absolute sanity. */
         regs->eflags &= ~EF_TF;
         /*
Index: 2007-07-03/xen/arch/x86/x86_32/traps.c
===================================================================
--- 2007-07-03.orig/xen/arch/x86/x86_32/traps.c 2007-06-22 16:57:45.000000000 +0200
+++ 2007-07-03/xen/arch/x86/x86_32/traps.c      2007-07-03 10:39:14.000000000 +0200
@@ -329,12 +329,19 @@ static long register_guest_callback(stru
         break;
 
 #ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
-    case CALLBACKTYPE_sysenter:
+    case CALLBACKTYPE_sysenter_deprecated:
         if ( ! cpu_has_sep )
             ret = -EINVAL;
         else if ( on_each_cpu(do_update_sysenter, &reg->address, 1, 1) != 0 )
             ret = -EIO;
         break;
+
+    case CALLBACKTYPE_sysenter:
+        if ( ! cpu_has_sep )
+            ret = -EINVAL;
+        else
+            do_update_sysenter(&reg->address);
+        break;
 #endif
 
     case CALLBACKTYPE_nmi:
@@ -358,6 +365,7 @@ static long unregister_guest_callback(st
     case CALLBACKTYPE_event:
     case CALLBACKTYPE_failsafe:
 #ifdef CONFIG_X86_SUPERVISOR_MODE_KERNEL
+    case CALLBACKTYPE_sysenter_deprecated:
     case CALLBACKTYPE_sysenter:
 #endif
         ret = -EINVAL;
Index: 2007-07-03/xen/arch/x86/x86_64/asm-offsets.c
===================================================================
--- 2007-07-03.orig/xen/arch/x86/x86_64/asm-offsets.c   2007-07-03 10:35:22.000000000 +0200
+++ 2007-07-03/xen/arch/x86/x86_64/asm-offsets.c        2007-07-04 12:51:10.000000000 +0200
@@ -71,6 +71,22 @@ void __dummy__(void)
            arch.guest_context.failsafe_callback_cs);
     OFFSET(VCPU_syscall_addr, struct vcpu,
            arch.guest_context.syscall_callback_eip);
+    OFFSET(VCPU_syscall32_addr, struct vcpu, arch.syscall32_callback_eip);
+    OFFSET(VCPU_syscall32_sel, struct vcpu, arch.syscall32_callback_cs);
+    OFFSET(VCPU_syscall32_disables_events, struct vcpu,
+           arch.syscall32_disables_events);
+    OFFSET(VCPU_syscall_eflags_mask, struct vcpu, arch.syscall_eflags_mask);
+    OFFSET(VCPU_sysenter_addr, struct vcpu, arch.sysenter_callback_eip);
+    OFFSET(VCPU_sysenter_sel, struct vcpu, arch.sysenter_callback_cs);
+    OFFSET(VCPU_sysenter_disables_events, struct vcpu,
+           arch.sysenter_disables_events);
+    OFFSET(VCPU_sysexit_addr, struct vcpu, arch.sysexit_eip);
+    OFFSET(VCPU_sysexit_sel, struct vcpu, arch.sysexit_cs);
+    OFFSET(VCPU_eflags_mask, struct vcpu, arch.eflags_mask);
+    OFFSET(VCPU_gp_fault_addr, struct vcpu,
+           arch.guest_context.trap_ctxt[TRAP_gp_fault].address);
+    OFFSET(VCPU_gp_fault_sel, struct vcpu,
+           arch.guest_context.trap_ctxt[TRAP_gp_fault].cs);
     OFFSET(VCPU_kernel_sp, struct vcpu, arch.guest_context.kernel_sp);
     OFFSET(VCPU_kernel_ss, struct vcpu, arch.guest_context.kernel_ss);
     OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags);
Index: 2007-07-03/xen/arch/x86/x86_64/compat/entry.S
===================================================================
--- 2007-07-03.orig/xen/arch/x86/x86_64/compat/entry.S  2007-07-03 10:35:22.000000000 +0200
+++ 2007-07-03/xen/arch/x86/x86_64/compat/entry.S       2007-07-04 13:26:46.000000000 +0200
@@ -188,6 +188,39 @@ ENTRY(compat_post_handle_exception)
         movb  $0,TRAPBOUNCE_flags(%rdx)
         jmp   compat_test_all_events
 
+ENTRY(compat_syscall)
+        cmpb  $0,VCPU_syscall32_disables_events(%rbx)
+        movzwl VCPU_syscall32_sel(%rbx),%esi
+        movq  VCPU_syscall32_addr(%rbx),%rax
+        setne %cl
+        leaq  VCPU_trap_bounce(%rbx),%rdx
+        testl $~3,%esi
+        leal  (,%rcx,TBF_INTERRUPT),%ecx
+        jz    2f
+1:      movq  %rax,TRAPBOUNCE_eip(%rdx)
+        movw  %si,TRAPBOUNCE_cs(%rdx)
+        movb  %cl,TRAPBOUNCE_flags(%rdx)
+        call  compat_create_bounce_frame
+        jmp   compat_test_all_events
+2:      movl  $TRAP_gp_fault,UREGS_entry_vector(%rsp)
+        movq  VCPU_gp_fault_addr(%rbx),%rax
+        movzwl VCPU_gp_fault_sel(%rbx),%esi
+        movb  $(TBF_EXCEPTION|TBF_EXCEPTION_ERRCODE|TBF_INTERRUPT),%cl
+        movl  $0,TRAPBOUNCE_error_code(%rdx)
+        jmp   1b
+
+ENTRY(compat_sysenter)
+        cmpl  $TRAP_gp_fault,UREGS_entry_vector(%rsp)
+        movzwl VCPU_sysenter_sel(%rbx),%eax
+        movzwl VCPU_gp_fault_sel(%rbx),%ecx
+        cmovel %ecx,%eax
+        testl $~3,%eax
+        movl  $FLAT_COMPAT_USER_SS,UREGS_ss(%rsp)
+        cmovzl %ecx,%eax
+        movw  %ax,TRAPBOUNCE_cs(%rdx)
+        call  compat_create_bounce_frame
+        jmp   compat_test_all_events
+
 ENTRY(compat_int80_direct_trap)
         call  compat_create_bounce_frame
         jmp   compat_test_all_events
@@ -230,7 +263,9 @@ compat_create_bounce_frame:
         setz  %ch                       # %ch == !saved_upcall_mask
         movl  UREGS_eflags+8(%rsp),%eax
         andl  $~X86_EFLAGS_IF,%eax
-        shlb  $1,%ch                    # Bit 9 (EFLAGS.IF)
+        addb  %ch,%ch                   # Bit 9 (EFLAGS.IF)
+        orl   VCPU_eflags_mask(%rbx),%eax
+        movl  $0,VCPU_eflags_mask(%rbx)
         orb   %ch,%ah                   # Fold EFLAGS.IF into %eax
 .Lft6:  movl  %eax,%fs:2*4(%rsi)        # EFLAGS
         movl  UREGS_rip+8(%rsp),%eax
Index: 2007-07-03/xen/arch/x86/x86_64/compat/traps.c
===================================================================
--- 2007-07-03.orig/xen/arch/x86/x86_64/compat/traps.c  2007-07-03 10:35:22.000000000 +0200
+++ 2007-07-03/xen/arch/x86/x86_64/compat/traps.c       2007-07-03 11:54:46.000000000 +0200
@@ -160,12 +160,35 @@ static long compat_register_guest_callba
                       &v->arch.guest_context.flags);
         break;
 
+    case CALLBACKTYPE_syscall:
+        v->arch.syscall32_callback_cs     = reg->address.cs;
+        v->arch.syscall32_callback_eip    = reg->address.eip;
+        v->arch.syscall32_disables_events =
+            (reg->flags & CALLBACKF_mask_events) != 0;
+        if ( v->arch.syscall32_callback_cs & ~3 )
+             write_efer(read_efer() | EFER_SCE);
+        else
+             write_efer(read_efer() & ~EFER_SCE);
+        break;
+
+    case CALLBACKTYPE_sysenter:
+        v->arch.sysenter_callback_cs     = reg->address.cs;
+        v->arch.sysenter_callback_eip    = reg->address.eip;
+        v->arch.sysenter_disables_events =
+            (reg->flags & CALLBACKF_mask_events) != 0;
+        break;
+
+    case CALLBACKTYPE_sysexit:
+        v->arch.sysexit_cs  = reg->address.cs | 3;
+        v->arch.sysexit_eip = reg->address.eip;
+        break;
+
     case CALLBACKTYPE_nmi:
         ret = register_guest_nmi_callback(reg->address.eip);
         break;
 
     default:
-        ret = -EINVAL;
+        ret = -ENOSYS;
         break;
     }
 
@@ -178,12 +201,20 @@ static long compat_unregister_guest_call
 
     switch ( unreg->type )
     {
+    case CALLBACKTYPE_event:
+    case CALLBACKTYPE_failsafe:
+    case CALLBACKTYPE_syscall:
+    case CALLBACKTYPE_sysenter:
+    case CALLBACKTYPE_sysexit:
+        ret = -EINVAL;
+        break;
+
     case CALLBACKTYPE_nmi:
         ret = unregister_guest_nmi_callback();
         break;
 
     default:
-        ret = -EINVAL;
+        ret = -ENOSYS;
         break;
     }
 
Index: 2007-07-03/xen/arch/x86/x86_64/entry.S
===================================================================
--- 2007-07-03.orig/xen/arch/x86/x86_64/entry.S 2007-07-03 10:35:37.000000000 +0200
+++ 2007-07-03/xen/arch/x86/x86_64/entry.S      2007-07-04 12:48:33.000000000 +0200
@@ -26,15 +26,19 @@
         ALIGN
 /* %rbx: struct vcpu */
 switch_to_kernel:
-        leaq  VCPU_trap_bounce(%rbx),%rdx
+        cmpw  $FLAT_USER_CS32,UREGS_cs(%rsp)
         movq  VCPU_syscall_addr(%rbx),%rax
+        leaq  VCPU_trap_bounce(%rbx),%rdx
+        cmoveq VCPU_syscall32_addr(%rbx),%rax
+        btl   $_VGCF_syscall_disables_events,VCPU_guest_context_flags(%rbx)
         movq  %rax,TRAPBOUNCE_eip(%rdx)
-        movb  $0,TRAPBOUNCE_flags(%rdx)
-        bt    $_VGCF_syscall_disables_events,VCPU_guest_context_flags(%rbx)
-        jnc   1f
-        movb  $TBF_INTERRUPT,TRAPBOUNCE_flags(%rdx)
-1:      call  create_bounce_frame
-        andl  $~X86_EFLAGS_DF,UREGS_eflags(%rsp)
+        setc  %cl
+        leal  (,%rcx,TBF_INTERRUPT),%ecx
+        movb  %cl,TRAPBOUNCE_flags(%rdx)
+        call  create_bounce_frame
+        movl  VCPU_syscall_eflags_mask(%rbx),%eax
+        notl  %eax
+        andl  %eax,UREGS_eflags(%rsp)
         jmp   test_all_events
 
 /* %rbx: struct vcpu, interrupts disabled */
@@ -47,7 +51,7 @@ restore_all_guest:
         addq  $8,%rsp
         popq  %rcx                    # RIP
         popq  %r11                    # CS
-        cmpw  $FLAT_KERNEL_CS32,%r11
+        cmpw  $FLAT_USER_CS32,%r11
         popq  %r11                    # RFLAGS
         popq  %rsp                    # RSP
         je    1f
@@ -127,6 +131,9 @@ ENTRY(syscall_enter)
         movl  $TRAP_syscall,4(%rsp)
         SAVE_ALL
         GET_CURRENT(%rbx)
+        movq  VCPU_domain(%rbx),%rcx
+        testb $1,DOMAIN_is_32bit_pv(%rcx)
+        jnz   compat_syscall
         testb $TF_kernel_mode,VCPU_thread_flags(%rbx)
         jz    switch_to_kernel
 
@@ -224,6 +231,41 @@ bad_hypercall:
         movq $-ENOSYS,UREGS_rax(%rsp)
         jmp  test_all_events
 
+ENTRY(sysenter_entry)
+        sti
+        pushq $FLAT_USER_SS
+        pushq $0
+        pushfq
+        pushq $0
+        pushq $0
+        pushq $0
+        movl  $TRAP_syscall,4(%rsp)
+        SAVE_ALL
+        GET_CURRENT(%rbx)
+        movq  VCPU_sysexit_addr(%rbx),%rax
+        movzwl VCPU_sysexit_sel(%rbx),%edx
+        cmpb  $0,VCPU_sysenter_disables_events(%rbx)
+        movq  %rax,UREGS_rip(%rsp)
+        movl  %edx,UREGS_cs(%rsp)
+        movq  VCPU_sysenter_addr(%rbx),%rax
+        setne %cl
+        leaq  VCPU_trap_bounce(%rbx),%rdx
+        testq %rax,%rax
+        leal  (,%rcx,TBF_INTERRUPT),%ecx
+        jz    2f
+1:      movq  VCPU_domain(%rbx),%rdi
+        movq  %rax,TRAPBOUNCE_eip(%rdx)
+        movb  %cl,TRAPBOUNCE_flags(%rdx)
+        testb $1,DOMAIN_is_32bit_pv(%rdi)
+        jnz   compat_sysenter
+        call  create_bounce_frame
+        jmp   test_all_events
+2:      movl  %eax,TRAPBOUNCE_error_code(%rdx)
+        movq  VCPU_gp_fault_addr(%rbx),%rax
+        movb  $(TBF_EXCEPTION|TBF_EXCEPTION_ERRCODE|TBF_INTERRUPT),%cl
+        movl  $TRAP_gp_fault,UREGS_entry_vector(%rsp)
+        jmp   1b
+
 ENTRY(int80_direct_trap)
         pushq $0
         SAVE_ALL
@@ -296,9 +338,11 @@ create_bounce_frame:
         shrq  $32,%rax
         testb $0xFF,%al                 # Bits 0-7: saved_upcall_mask
         setz  %ch                       # %ch == !saved_upcall_mask
-        movq  UREGS_eflags+8(%rsp),%rax
-        andq  $~X86_EFLAGS_IF,%rax
-        shlb  $1,%ch                    # Bit 9 (EFLAGS.IF)
+        movl  UREGS_eflags+8(%rsp),%eax
+        andl  $~X86_EFLAGS_IF,%eax
+        addb  %ch,%ch                   # Bit 9 (EFLAGS.IF)
+        orl   VCPU_eflags_mask(%rbx),%eax
+        movl  $0,VCPU_eflags_mask(%rbx)
         orb   %ch,%ah                   # Fold EFLAGS.IF into %eax
 .Lft5:  movq  %rax,16(%rsi)             # RFLAGS
         movq  UREGS_rip+8(%rsp),%rax
Index: 2007-07-03/xen/arch/x86/x86_64/traps.c
===================================================================
--- 2007-07-03.orig/xen/arch/x86/x86_64/traps.c 2007-07-03 10:34:30.000000000 +0200
+++ 2007-07-03/xen/arch/x86/x86_64/traps.c      2007-07-03 12:06:05.000000000 +0200
@@ -22,6 +22,7 @@
 #include <public/callback.h>
 
 asmlinkage void syscall_enter(void);
+asmlinkage void sysenter_entry(void);
 asmlinkage void compat_hypercall(void);
 asmlinkage void int80_direct_trap(void);
 
@@ -323,12 +324,26 @@ void __init percpu_traps_init(void)
 
     /* Trampoline for SYSCALL entry from long mode. */
     stack = &stack[IST_MAX * PAGE_SIZE]; /* Skip the IST stacks. */
-    wrmsr(MSR_LSTAR, (unsigned long)stack, ((unsigned long)stack>>32));
+    wrmsrl(MSR_LSTAR, (unsigned long)stack);
     stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS64);
 
-    /* Trampoline for SYSCALL entry from compatibility mode. */
-    wrmsr(MSR_CSTAR, (unsigned long)stack, ((unsigned long)stack>>32));
-    stack += write_stack_trampoline(stack, stack_bottom, FLAT_KERNEL_CS32);
+    switch ( boot_cpu_data.x86_vendor )
+    {
+    case X86_VENDOR_INTEL:
+        /* SYSENTER entry. */
+        wrmsrl(MSR_IA32_SYSENTER_ESP, (unsigned long)stack_bottom);
+        wrmsrl(MSR_IA32_SYSENTER_EIP, (unsigned long)sysenter_entry);
+        wrmsr(MSR_IA32_SYSENTER_CS, __HYPERVISOR_CS, 0);
+        break;
+    case X86_VENDOR_AMD:
+        /* Trampoline for SYSCALL entry from compatibility mode. */
+        stack = (char *)L1_CACHE_ALIGN((unsigned long)stack);
+        wrmsrl(MSR_CSTAR, (unsigned long)stack);
+        stack += write_stack_trampoline(stack, stack_bottom, FLAT_USER_CS32);
+        break;
+    default:
+        BUG();
+    }
 
     /* Common SYSCALL parameters. */
     wrmsr(MSR_STAR, 0, (FLAT_RING3_CS32<<16) | __HYPERVISOR_CS);
@@ -353,6 +368,9 @@ static long register_guest_callback(stru
     long ret = 0;
     struct vcpu *v = current;
 
+    if ( !is_canonical_address(reg->address) )
+        return -EINVAL;
+
     switch ( reg->type )
     {
     case CALLBACKTYPE_event:
@@ -370,6 +388,14 @@ static long register_guest_callback(stru
         break;
 
     case CALLBACKTYPE_syscall:
+        /* See arch_set_info_guest() for why this is being done. */
+        if ( v->arch.syscall32_callback_eip ==
+             v->arch.guest_context.syscall_callback_eip )
+        {
+            v->arch.syscall32_callback_eip = reg->address;
+            v->arch.syscall32_disables_events =
+                (reg->flags & CALLBACKF_mask_events) != 0;
+        }
         v->arch.guest_context.syscall_callback_eip  = reg->address;
         if ( reg->flags & CALLBACKF_mask_events )
             set_bit(_VGCF_syscall_disables_events,
@@ -379,6 +405,43 @@ static long register_guest_callback(stru
                       &v->arch.guest_context.flags);
         break;
 
+    case CALLBACKTYPE_syscall32:
+        v->arch.syscall32_callback_eip = reg->address;
+        v->arch.syscall32_disables_events =
+            (reg->flags & CALLBACKF_mask_events) != 0;
+        break;
+
+    case CALLBACKTYPE_sfmask:
+        v->arch.syscall_eflags_mask = reg->address &
+                                      ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
+        if ( reg->address & X86_EFLAGS_IF )
+        {
+            set_bit(_VGCF_syscall_disables_events,
+                    &v->arch.guest_context.flags);
+            v->arch.syscall32_disables_events = 1;
+        }
+        else
+        {
+            clear_bit(_VGCF_syscall_disables_events,
+                      &v->arch.guest_context.flags);
+            v->arch.syscall32_disables_events = 0;
+        }
+        break;
+
+    case CALLBACKTYPE_sysenter:
+        v->arch.sysenter_callback_eip = reg->address;
+        v->arch.sysenter_disables_events =
+            (reg->flags & CALLBACKF_mask_events) != 0;
+        break;
+
+    case CALLBACKTYPE_sysexit:
+        v->arch.sysexit_eip = reg->address;
+        if ( reg->flags & CALLBACKF_mask_events )
+            v->arch.sysexit_cs = FLAT_USER_CS32;
+        else
+            v->arch.sysexit_cs = FLAT_USER_CS64;
+        break;
+
     case CALLBACKTYPE_nmi:
         ret = register_guest_nmi_callback(reg->address);
         break;
@@ -400,6 +463,10 @@ static long unregister_guest_callback(st
     case CALLBACKTYPE_event:
     case CALLBACKTYPE_failsafe:
     case CALLBACKTYPE_syscall:
+    case CALLBACKTYPE_syscall32:
+    case CALLBACKTYPE_sfmask:
+    case CALLBACKTYPE_sysenter:
+    case CALLBACKTYPE_sysexit:
         ret = -EINVAL;
         break;
 
Index: 2007-07-03/xen/include/asm-x86/cpufeature.h
===================================================================
--- 2007-07-03.orig/xen/include/asm-x86/cpufeature.h    2007-07-03 10:35:30.000000000 +0200
+++ 2007-07-03/xen/include/asm-x86/cpufeature.h 2007-07-03 10:39:14.000000000 +0200
@@ -130,7 +130,7 @@
 #define cpu_has_pae            1
 #define cpu_has_pge            1
 #define cpu_has_apic           boot_cpu_has(X86_FEATURE_APIC)
-#define cpu_has_sep            0
+#define cpu_has_sep            boot_cpu_has(X86_FEATURE_SEP)
 #define cpu_has_mtrr           1
 #define cpu_has_mmx            1
 #define cpu_has_fxsr           1
Index: 2007-07-03/xen/include/asm-x86/domain.h
===================================================================
--- 2007-07-03.orig/xen/include/asm-x86/domain.h        2007-06-15 14:05:46.000000000 +0200
+++ 2007-07-03/xen/include/asm-x86/domain.h     2007-07-04 12:51:40.000000000 +0200
@@ -281,6 +281,16 @@ struct arch_vcpu
 #endif
 #ifdef CONFIG_X86_64
     struct trap_bounce int80_bounce;
+    unsigned long      syscall32_callback_eip;
+    unsigned long      sysenter_callback_eip;
+    unsigned long      sysexit_eip;
+    unsigned short     syscall32_callback_cs;
+    unsigned short     sysenter_callback_cs;
+    unsigned short     sysexit_cs;
+    bool_t             syscall32_disables_events;
+    bool_t             sysenter_disables_events;
+    unsigned int       syscall_eflags_mask;
+    unsigned int       eflags_mask;
 #endif
 
     /* Virtual Machine Extensions */
Index: 2007-07-03/xen/include/public/callback.h
===================================================================
--- 2007-07-03.orig/xen/include/public/callback.h       2006-11-08 10:37:31.000000000 +0100
+++ 2007-07-03/xen/include/public/callback.h    2007-07-03 10:39:14.000000000 +0200
@@ -38,13 +38,34 @@
 
 #define CALLBACKTYPE_event                 0
 #define CALLBACKTYPE_failsafe              1
-#define CALLBACKTYPE_syscall               2 /* x86_64 only */
+#define CALLBACKTYPE_syscall               2 /* x86_64 hv only */
 /*
- * sysenter is only available on x86_32 with the
- * supervisor_mode_kernel option enabled.
+ * sysenter_deprecated is only available on x86_32 with the
+ * supervisor_mode_kernel option enabled, and should not be used in new code.
  */
-#define CALLBACKTYPE_sysenter              3
+#define CALLBACKTYPE_sysenter_deprecated   3
 #define CALLBACKTYPE_nmi                   4
+#if __XEN_INTERFACE_VERSION__ < 0x00030206
+#define CALLBACKTYPE_sysenter              CALLBACKTYPE_sysenter_deprecated
+#else
+/*
+ * sysenter is only available
+ * - on x86_32 with the supervisor_mode_kernel option enabled,
+ * - on x86_64 hv for x86_32 pv or 32-bit guest support in x86_64 pv.
+ */
+#define CALLBACKTYPE_sysenter              5
+/*
+ * sysexit is only available on x86_64 hv, and is only used to fill a
+ * sysenter frame's return address (if the guest desires to have a non-NULL
+ * value there). Additionally, since CALLBACKF_mask_events is meaningless
+ * here, it is being (mis-)used for 64-bit guests to distinguish sysenter
+ * callers expected to be in 64-bit mode (flag set) from 32-bit ones (flag
+ * clear).
+ */
+#define CALLBACKTYPE_sysexit               6
+#define CALLBACKTYPE_syscall32             7 /* x86_64 only */
+#define CALLBACKTYPE_sfmask                8 /* x86_64 only */
+#endif
 
 /*
  * Disable event deliver during callback? This flag is ignored for event and
Index: 2007-07-03/xen/include/public/xen-compat.h
===================================================================
--- 2007-07-03.orig/xen/include/public/xen-compat.h     2006-11-16 14:06:41.000000000 +0100
+++ 2007-07-03/xen/include/public/xen-compat.h  2007-07-03 10:39:14.000000000 +0200
@@ -27,7 +27,7 @@
 #ifndef __XEN_PUBLIC_XEN_COMPAT_H__
 #define __XEN_PUBLIC_XEN_COMPAT_H__
 
-#define __XEN_LATEST_INTERFACE_VERSION__ 0x00030205
+#define __XEN_LATEST_INTERFACE_VERSION__ 0x00030206
 
 #if defined(__XEN__) || defined(__XEN_TOOLS__)
 /* Xen is built with matching headers and implements the latest interface. */


