# HG changeset patch # User yamahata@xxxxxxxxxxxxx # Date 1156328955 -32400 # Node ID 56645e1eb4c5d4e3fdd082f6610d2d5a737a795f # Parent 663a9be17ecda33b0f7a17882bd35789544a8f02 fix vDSO. __kernel_syscall_via_epc masks interrupt assuming page fault on the vDSO page can't occur during executing. However Xen might preempt vcpu when interrupt is disabled so that page fault might occur with interrupt masked unlike baremetal case. Such page fault results in acquiring semaphore with interrupt disabled. see ia64_do_page_fault(). It means that we can't disable interrupt in vDSO page on xen environment. So we jumps kernel text area which is pinned by ITR before disabling interrupt to avoid such page fault. PATCHNAME: fix_vdso Signed-off-by: Isaku Yamahata diff -r 663a9be17ecd -r 56645e1eb4c5 linux-2.6-xen-sparse/arch/ia64/kernel/gate.S --- a/linux-2.6-xen-sparse/arch/ia64/kernel/gate.S Wed Aug 23 17:44:28 2006 +0900 +++ b/linux-2.6-xen-sparse/arch/ia64/kernel/gate.S Wed Aug 23 19:29:15 2006 +0900 @@ -14,7 +14,7 @@ #include #include #include -#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT +#ifdef CONFIG_XEN # include #endif @@ -36,7 +36,7 @@ [1:](pr)brl.cond.sptk 0; \ .xdata4 ".data.patch.brl_fsys_bubble_down", 1b-. -#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT +#ifdef CONFIG_XEN // The page in which hyperprivop lives must be pinned by ITR. // However vDSO area isn't pinned. So issuing hyperprivop // from vDSO page causes trouble that Kevin pointed out. @@ -47,6 +47,17 @@ // which is pinned, and then issue hyperprivop and return back // to vDSO page. // This is Dan Magenheimer's idea. + // + // notes on page fault. + // __kernel_syscall_via_epc masks interrupt assuming page fault on the vDSO page + // can't occur during executing. + // However Xen might preempt vcpu when interrupt is disabled so that + // page fault might occur with interrupt masked unlike baremetal case. + // Such page fault results in acquiring semaphore with interrupt disabled. + // see ia64_do_page_fault(). 
+ // It means that we can't disable interrupts in the vDSO page in a Xen environment. + // So we jump to the kernel text area, which is pinned by ITR, before disabling interrupts + // to avoid such a page fault. // Currently is_running_on_xen() is defined as running_on_xen. // If is_running_on_xen() is a real function, we must update @@ -57,29 +68,11 @@ [1:] movl reg=0; \ .xdata4 ".data.patch.running_on_xen", 1b-. - .section ".data.patch.brl_xen_rsm_be_i", "a" + .section ".data.patch.brl___xen_kernel_syscall_via_epc", "a" .previous -#define BRL_COND_XEN_RSM_BE_I(pr) \ -[1:](pr)brl.cond.sptk 0; \ - .xdata4 ".data.patch.brl_xen_rsm_be_i", 1b-. - - .section ".data.patch.brl_xen_get_psr", "a" - .previous -#define BRL_COND_XEN_GET_PSR(pr) \ -[1:](pr)brl.cond.sptk 0; \ - .xdata4 ".data.patch.brl_xen_get_psr", 1b-. - - .section ".data.patch.brl_xen_ssm_i_0", "a" - .previous -#define BRL_COND_XEN_SSM_I_0(pr) \ -[1:](pr)brl.cond.sptk 0; \ - .xdata4 ".data.patch.brl_xen_ssm_i_0", 1b-. - - .section ".data.patch.brl_xen_ssm_i_1", "a" - .previous -#define BRL_COND_XEN_SSM_I_1(pr) \ -[1:](pr)brl.cond.sptk 0; \ - .xdata4 ".data.patch.brl_xen_ssm_i_1", 1b-. +#define BRL_COND___XEN_KERNEL_SYSCALL_VIA_EPC(pr) \ +[1:](pr)brl.cond.sptk 0; \ + .xdata4 ".data.patch.brl___xen_kernel_syscall_via_epc", 1b-. 
#endif GLOBAL_ENTRY(__kernel_syscall_via_break) @@ -126,9 +119,9 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) epc // B causes split-issue } ;; -#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT +#ifdef CONFIG_XEN // r20 = 1 - // r22 = &vcpu->evtchn_mask + // r22 = &vcpu->intrrupt_mask_addr // r23 = &vpsr.ic // r24 = &vcpu->pending_interruption // r25 = tmp @@ -137,28 +130,24 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) // r31 = tmp // p11 = tmp // p12 = running_on_xen - // p13 = !running_on_xen // p14 = tmp // p15 = tmp #define isXen p12 -#define isRaw p13 LOAD_RUNNING_ON_XEN(r28) +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT movl r22=XSI_PSR_I_ADDR movl r23=XSI_PSR_IC movl r24=XSI_PSR_I_ADDR+(XSI_PEND_OFS-XSI_PSR_I_ADDR_OFS) mov r20=1 +#endif ;; ld4 r30=[r28] ;; - cmp.ne isXen,isRaw=r0,r30 - ;; -(isRaw) rsm psr.be | psr.i - BRL_COND_XEN_RSM_BE_I(isXen) - .global .vdso_rsm_be_i_ret -.vdso_rsm_be_i_ret: -#else + cmp.ne isXen,p0=r0,r30 + ;; + BRL_COND___XEN_KERNEL_SYSCALL_VIA_EPC(isXen) +#endif rsm psr.be | psr.i // M2 (5 cyc to srlz.d) -#endif LOAD_FSYSCALL_TABLE(r14) // X ;; mov r16=IA64_KR(CURRENT) // M2 (12 cyc) @@ -166,14 +155,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) mov r19=NR_syscalls-1 // A ;; lfetch [r18] // M0|1 -#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT -(isRaw) mov r29=psr - BRL_COND_XEN_GET_PSR(isXen) - .global .vdso_get_psr_ret -.vdso_get_psr_ret: -#else mov r29=psr // M2 (12 cyc) -#endif // If r17 is a NaT, p6 will be zero cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? ;; @@ -187,21 +169,9 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) ;; nop.m 0 (p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) 
-#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT - ;; - // p14 = running_on_xen && p8 - // p15 = !running_on_xen && p8 -(p8) cmp.ne.unc p14,p15=r0,r30 - ;; -(p15) ssm psr.i - BRL_COND_XEN_SSM_I_0(p14) - .global .vdso_ssm_i_0_ret -.vdso_ssm_i_0_ret: -#else nop.i 0 ;; (p8) ssm psr.i -#endif (p6) mov b7=r18 // I0 (p8) br.dptk.many b7 // B @@ -222,14 +192,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) #else BRL_COND_FSYS_BUBBLE_DOWN(p6) #endif -#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT -(isRaw) ssm psr.i - BRL_COND_XEN_SSM_I_1(isXen) - .global .vdso_ssm_i_1_ret -.vdso_ssm_i_1_ret: -#else ssm psr.i -#endif mov r10=-1 (p10) mov r8=EINVAL #ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT diff -r 663a9be17ecd -r 56645e1eb4c5 linux-2.6-xen-sparse/arch/ia64/kernel/gate.lds.S --- a/linux-2.6-xen-sparse/arch/ia64/kernel/gate.lds.S Wed Aug 23 17:44:28 2006 +0900 +++ b/linux-2.6-xen-sparse/arch/ia64/kernel/gate.lds.S Wed Aug 23 19:29:15 2006 +0900 @@ -44,26 +44,14 @@ SECTIONS *(.data.patch.brl_fsys_bubble_down) __end_gate_brl_fsys_bubble_down_patchlist = .; -#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT +#ifdef CONFIG_XEN __start_gate_running_on_xen_patchlist = .; *(.data.patch.running_on_xen) __end_gate_running_on_xen_patchlist = .; - __start_gate_brl_xen_rsm_be_i_patchlist = .; - *(.data.patch.brl_xen_rsm_be_i) - __end_gate_brl_xen_rsm_be_i_patchlist = .; - - __start_gate_brl_xen_get_psr_patchlist = .; - *(.data.patch.brl_xen_get_psr) - __end_gate_brl_xen_get_psr_patchlist = .; - - __start_gate_brl_xen_ssm_i_0_patchlist = .; - *(.data.patch.brl_xen_ssm_i_0) - __end_gate_brl_xen_ssm_i_0_patchlist = .; - - __start_gate_brl_xen_ssm_i_1_patchlist = .; - *(.data.patch.brl_xen_ssm_i_1) - __end_gate_brl_xen_ssm_i_1_patchlist = .; + __start_gate_brl___xen_kernel_syscall_via_epc_patchlist = .; + *(.data.patch.brl___xen_kernel_syscall_via_epc) + __end_gate_brl___xen_kernel_syscall_via_epc_patchlist = .; #endif } :readable .IA_64.unwind_info : { *(.IA_64.unwind_info*) } diff -r 663a9be17ecd -r 56645e1eb4c5 
linux-2.6-xen-sparse/arch/ia64/kernel/patch.c --- a/linux-2.6-xen-sparse/arch/ia64/kernel/patch.c Wed Aug 23 17:44:28 2006 +0900 +++ b/linux-2.6-xen-sparse/arch/ia64/kernel/patch.c Wed Aug 23 19:29:15 2006 +0900 @@ -184,7 +184,7 @@ patch_brl_fsys_bubble_down (unsigned lon ia64_srlz_i(); } -#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT +#ifdef CONFIG_XEN extern char __start_gate_running_on_xen_patchlist[]; extern char __end_gate_running_on_xen_patchlist[]; @@ -236,19 +236,9 @@ static void static void patch_brl_in_vdso(void) { - EXTERN_PATCHLIST(xen_rsm_be_i); - EXTERN_PATCHLIST(xen_get_psr); - EXTERN_PATCHLIST(xen_ssm_i_0); - EXTERN_PATCHLIST(xen_ssm_i_1); - - PATCH_BRL_SYMADDR(xen_rsm_be_i); - PATCH_BRL_SYMADDR(xen_get_psr); - PATCH_BRL_SYMADDR(xen_ssm_i_0); - PATCH_BRL_SYMADDR(xen_ssm_i_1); -} -#else -#define patch_running_on_xen(start, end) do { } while (0) -#define patch_brl_in_vdso() do { } while (0) + EXTERN_PATCHLIST(__xen_kernel_syscall_via_epc); + PATCH_BRL_SYMADDR(__xen_kernel_syscall_via_epc); +} #endif void diff -r 663a9be17ecd -r 56645e1eb4c5 linux-2.6-xen-sparse/arch/ia64/xen/Makefile --- a/linux-2.6-xen-sparse/arch/ia64/xen/Makefile Wed Aug 23 17:44:28 2006 +0900 +++ b/linux-2.6-xen-sparse/arch/ia64/xen/Makefile Wed Aug 23 19:29:15 2006 +0900 @@ -3,6 +3,6 @@ # obj-y := hypercall.o xenivt.o xenentry.o xensetup.o xenpal.o xenhpski.o \ - hypervisor.o pci-dma-xen.o util.o + hypervisor.o pci-dma-xen.o util.o xengate.o pci-dma-xen-y := ../../i386/kernel/pci-dma-xen.o diff -r 663a9be17ecd -r 56645e1eb4c5 linux-2.6-xen-sparse/arch/ia64/xen/hypercall.S --- a/linux-2.6-xen-sparse/arch/ia64/xen/hypercall.S Wed Aug 23 17:44:28 2006 +0900 +++ b/linux-2.6-xen-sparse/arch/ia64/xen/hypercall.S Wed Aug 23 19:29:15 2006 +0900 @@ -351,64 +351,3 @@ GLOBAL_ENTRY(xen_send_ipi) br.ret.sptk.many rp ;; END(xen_send_ipi) - -#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT -// Those are vdso specialized. -// In fsys mode, call, ret can't be used. 
-GLOBAL_ENTRY(xen_rsm_be_i) - ld8 r22=[r22] - ;; - st1 [r22]=r20 - st4 [r23]=r0 - XEN_HYPER_RSM_BE - st4 [r23]=r20 - brl.cond.sptk .vdso_rsm_be_i_ret - ;; -END(xen_rsm_be_i) - -GLOBAL_ENTRY(xen_get_psr) - mov r31=r8 - mov r25=IA64_PSR_IC - st4 [r23]=r0 - XEN_HYPER_GET_PSR - ;; - st4 [r23]=r20 - or r29=r8,r25 // vpsr.ic was cleared for hyperprivop - mov r8=r31 - brl.cond.sptk .vdso_get_psr_ret - ;; -END(xen_get_psr) - - // see xen_ssm_i() in privop.h - // r22 = &vcpu->evtchn_mask - // r23 = &vpsr.ic - // r24 = &vcpu->pending_interruption - // r25 = tmp - // r31 = tmp - // p11 = tmp - // p14 = tmp -#define XEN_SET_PSR_I \ - ld4 r31=[r22]; \ - ld4 r25=[r24]; \ - ;; \ - st4 [r22]=r0; \ - cmp.ne.unc p14,p0=r0,r31; \ - ;; \ -(p14) cmp.ne.unc p11,p0=r0,r25; \ - ;; \ -(p11) st4 [r22]=r20; \ -(p11) st4 [r23]=r0; \ -(p11) XEN_HYPER_SSM_I; - -GLOBAL_ENTRY(xen_ssm_i_0) - XEN_SET_PSR_I - brl.cond.sptk .vdso_ssm_i_0_ret - ;; -END(xen_ssm_i_0) - -GLOBAL_ENTRY(xen_ssm_i_1) - XEN_SET_PSR_I - brl.cond.sptk .vdso_ssm_i_1_ret - ;; -END(xen_ssm_i_1) -#endif diff -r 663a9be17ecd -r 56645e1eb4c5 linux-2.6-xen-sparse/arch/ia64/xen/xengate.S --- a/linux-2.6-xen-sparse/arch/ia64/xen/xengate.S Wed Aug 23 17:44:28 2006 +0900 +++ b/linux-2.6-xen-sparse/arch/ia64/xen/xengate.S Wed Aug 23 19:29:15 2006 +0900 @@ -14,7 +14,11 @@ #include #include #include - +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT +# include +#endif + +#ifndef CONFIG_XEN /* * We can't easily refer to symbols inside the kernel. 
To avoid full runtime relocation, * complications with the linker (which likes to create PLT stubs for branches @@ -47,6 +51,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_break) br.ret.sptk.many b6 } END(__kernel_syscall_via_break) +#endif /* * On entry: @@ -62,10 +67,15 @@ END(__kernel_syscall_via_break) * all "preserved" registers: same as on entry */ +#ifdef CONFIG_XEN +GLOBAL_ENTRY(__xen_kernel_syscall_via_epc) +#else GLOBAL_ENTRY(__kernel_syscall_via_epc) +#endif .prologue .altrp b6 .body +#ifndef CONFIG_XEN { /* * Note: the kernel cannot assume that the first two instructions in this @@ -77,15 +87,52 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) epc // B causes split-issue } ;; +#endif +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT + // r20 = 1 + // r22 = vcpu->intrrupt_mask_addr = &vcpu->evtchn_upcall_mask + // on entry r22 = &vcpu->intrrupt_mask_addr + // r23 = &vpsr.ic + // r24 = &vcpu->pending_interruption + // r25 = tmp + // r28 = &running_on_xen + // r30 = running_on_xen + // r31 = tmp + // p11 = tmp + // p14 = tmp + // p15 = tmp + ld8 r22=[r22] + ;; + st1 [r22]=r20 + st4 [r23]=r0 + XEN_HYPER_RSM_BE + st4 [r23]=r20 +#else rsm psr.be | psr.i // M2 (5 cyc to srlz.d) +#endif +#ifdef CONFIG_XEN + movl r14=fsyscall_table +#else LOAD_FSYSCALL_TABLE(r14) // X +#endif ;; mov r16=IA64_KR(CURRENT) // M2 (12 cyc) shladd r18=r17,3,r14 // A mov r19=NR_syscalls-1 // A ;; lfetch [r18] // M0|1 +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT + mov r31=r8 + mov r25=IA64_PSR_IC + st4 [r23]=r0 + XEN_HYPER_GET_PSR + ;; + st4 [r23]=r20 + or r29=r8,r25 // vpsr.ic was cleared for hyperprivop + mov r8=r31 +#else mov r29=psr // M2 (12 cyc) +#endif // If r17 is a NaT, p6 will be zero cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? ;; @@ -99,9 +146,24 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) ;; nop.m 0 (p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) 
+#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT + ;; +(p8) ld4 r31=[r22]; +(p8) ld4 r25=[r24]; + ;; +(p8) st4 [r22]=r0; +(p8) cmp.ne.unc p14,p0=r0,r31; + ;; +(p14) cmp.ne.unc p11,p0=r0,r25; + ;; +(p11) st4 [r22]=r20; +(p11) st4 [r23]=r0; +(p11) XEN_HYPER_SSM_I; +#else nop.i 0 ;; (p8) ssm psr.i +#endif (p6) mov b7=r18 // I0 (p8) br.dptk.many b7 // B @@ -120,15 +182,38 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) (p6) mov b7=r14 (p6) br.sptk.many b7 #else +#ifdef CONFIG_XEN +(p6) brl.cond.sptk fsys_bubble_down +#else BRL_COND_FSYS_BUBBLE_DOWN(p6) -#endif +#endif +#endif +#ifdef CONFIG_XEN_IA64_VDSO_PARAVIRT + ld4 r31=[r22]; + ld4 r25=[r24]; + ;; + st4 [r22]=r0; + cmp.ne.unc p14,p0=r0,r31; + ;; +(p14) cmp.ne.unc p11,p0=r0,r25; + ;; +(p11) st4 [r22]=r20; +(p11) st4 [r23]=r0; +(p11) XEN_HYPER_SSM_I; +#else ssm psr.i +#endif mov r10=-1 (p10) mov r8=EINVAL (p9) mov r8=ENOSYS FSYS_RETURN +#ifdef CONFIG_XEN +END(__xen_kernel_syscall_via_epc) +#else END(__kernel_syscall_via_epc) - +#endif + +#ifndef CONFIG_XEN # define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET) # define ARG1_OFF (16 + IA64_SIGFRAME_ARG1_OFFSET) # define ARG2_OFF (16 + IA64_SIGFRAME_ARG2_OFFSET) @@ -374,3 +459,4 @@ restore_rbs: // invala not necessary as that will happen when returning to user-mode br.cond.sptk back_from_restore_rbs END(__kernel_sigtramp) +#endif