On our x460 4-node 32-way, the machine occasionally fails to restart and
hangs in Xen's reboot code. Examining machine_restart[1], the code ends up
sending IPIs to every other CPU, each of which in turn sends more IPIs.
Eventually the boot CPU runs machine_restart, but sometimes this breaks.
We only need to send one IPI to the boot CPU to run the machine_restart code.
AFAICT[2], logical CPU0 is always going to be the boot processor.
1. xen/arch/x86/shutdown.c:203
2. xen/arch/x86/smpboot.c:1025 says:
x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
Here is the failure I see on the serial console:
.
(XEN) Domain 0 shutdown: rebooting machine.
(XEN) **(XiElNe=)e ((fxXitaElbNel)=e. ecx,t (alifibnel=e7=7l)e .Pcr,e
-leeinxet=x7ca7epbl)tieo. nP:r efc-ff,f 8l3i0ne0x0c0e1p2t3eie=o1b7n :- >f
7f00)0 0P0f0f08r030e0-0e0xc000e0p0t0i0on0
:( 12X3EeN1)b Af -s>s ef0rtf00i0of0n 800'0l0e30.c00,000 00li00n10e23=
7(e67XE6) N -P) >r( e-f00eil0xce00e=e0ptx00ita0onb00:le0 f.00fc,0ff 08l
3i0n(X0e=E007N)17) 2( 8afPrcilea e-e-=ex> xce0tap00bti0leo00.n:0c, 00
ff0lif00nf80e=300700070)
0(P12Xre8EN-ac)exa oce-n:p> ti0ffo00fn:0f8 003ff000f000f80013020000800ac0010a
2-8
(>X E0aN00c)0a *0*0 *00-*0>**0*0
*000**0*0*00***0*0*0*0**0**00***0**0**00***0**00**********
***(nXtEeNd) * ]--------[n
s(tXXaeEnN-)3 . 0]b--l-e- - *
u***************************************************************
*
n(0X0E1N4)e (fCXdPUE2>:N]
) (X E2N4C)
4PUef: d2 >] (2afibllee=.cex, taliblnee.=7c,7 ) liPrnee-=7ex7)ce Pprtie-onex:
ceffptfif8on30: 00f0ff12f81d309f00 -01> 2100d900f 0-00> 0000000000000000000
(0X00EN00) 0f
83(0XE00N)01 -21-d--9f[ -Xe> n-003.000-00un00st00ab00le00 00 CPffUf: f8 30
105001
(29XE66N6) >R]I
P(: X EN e)0 C10P:U:[ < f ff 4f83
(0X0E0N01)4 ReIfPc6: > ]- -t-ab-l[ eX en - N3o.t0- tuanistnatebdle ]- -
-N-o--t -t-[a iXnteend-3 .]-0--u-n-st
(aXbEleN ) - -N--o[t tXaeni-n3te.d0- u]n--s-ta-bl
(eX:[E<N)f fCffP8U:30 0 00 6121
d(X9fEN>])
R(IIXEP:N) CP eU0: 1 0: [<25fff
(f8XE3N00) 0R01IP4e:f d2 > e]0-e-- -[ NXeotn- t3.a0in-utendst ]ab--l-e - a
ckNo_AtP tIaC_inirteq+d 0]x0--/0--x15
(XEN) RFLAGS: 0000000000010082 CONTEXT: hypervisor
(XEN) rax: ffff8300001e1018 rbx: ffff830000218300 rcx: 000000000000000f
(XEN) rdx: 000000000000001f rsi: 0000000000000020 rdi: ffff830000fd7028
(XEN) rbp: ffff830000fd7018 rsp: ffff830000fd7000 r8: 0000000000000000
(XEN) strnlen+0x0/0x45 bitmap_weight+0x0/0x1dAssertion 'spin_is_locked(lock)'
failed, line 37, file
/home/rharper/work/openhype/xen/unstable/hg/e/xen/include/asm/spinlock.h
(XEN) BUG at
/home/rharper/work/openhype/xen/unstable/hg/e/xen/include/asm/spinlock.h:37
(XEN) ----[ Xen-3.0-unstable Not tainted ]----
(XEN) CPU: 5
(XEN) RIP: e010:[<ffff830000129666>]************************************
(XEN) ack_APIC_irq+0x0/0x15(file=extable.c, line=77) Pre-exception:
ffff8300001281ef -> 0000000000000000
(XEN)
(XEN) RFLAGS: 0000000000010096 CONTEXT: hypervisor
(XEN) rax: 00000000ffffffff rbx: ffff830000218300 rcx: ffff830000fef138
(XEN)
(XEN) RFLAGS: 0000000000010082 CONTEXT: hypervisor
(XEN) rax: ffff8300001e0434 rbx: ffff830000218300 rcx: 000000000000000f
(XEN) rdx: 0000000000000000 rsi: ffff83000030b218 rdi: ffff830000313028
(XEN) ----[ Xen-3.0-unstable Not tainted ]----
(XEN) CPU: 19
(XEN) RIP: e010:[<ffff830000123e1b>]rdx: ffff830000fef170 rsi:
ffffffffffffffff rdi: ffff8300001cbfb1
(XEN) rbp: ffff830000fef108 rsp: ffff830000fef000 r8: 0000000000000000
(XEN) r9: 00000000deadbeef r10: ffff830000feff28 r11: 0000000000000246
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 00000000d9411000
(XEN) ds: 002b es: 002b fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) Xen stack trace from rsp=ffff830000fef000:
(XEN) ffff8300001245f5Assertion 'diff < STACK_SIZE' failed, line 31, file
traps.c
(XEN) BUG at traps.c:31
(XEN) rbp: ffff830000313018 rsp: ffff830000313000 r8: 0000000000000000
(XEN) r9: 00000000deadbeef r10: ffff830000313f28 r11: 0000000000000246
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 0000000010901000
(XEN) ds: 002b es: 002b fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) Xen stack trace from rsp=ffff830000313000:
(XEN) (file=extable.c, line=77) Pre-exception: ffff8300001b614b ->
0000000000000000
(XEN) Assertion 'spin_is_locked(lock)' failed, line 37, file
/home/rharper/work/openhype/xen/unstable/hg/e/xen/include/asm/spinlock.h
(XEN) BUG at
/home/rharper/work/openhype/xen/unstable/hg/e/xen/include/asm/spinlock.h:37
(XEN) ----[ Xen-3.0-unstable Not tainted ]----
(XEN) ----[ Xen-3.0-unstable Not tainted ]----
(XEN) CPU: 8
(XEN) RIP: e010:[<ffff830000128aca>](file=extable.c, line=77) Pre-exception:
ffff8300001281ef -> 0000000000000000
(XEN) 0000000000000000 0000000000000000CPU: 2
(XEN) RIP: e010:[<ffff830000105e08>]CPU: 0
(XEN) RIP: e010:[<ffff83000014d241>] __bitmap_weight+0x99/0xe0
(XEN) RFLAGS: 0000000000010202 CONTEXT: hypervisor
(XEN) atomic_inc+0x0/0x19
(XEN) RFLAGS: 0000000000010046 CONTEXT: hypervisor
(XEN) rax: ffff83000014e3ae rbx: ffff830000218300 rcx: 000000000000000f
(XEN) rdx: ffff8300003233bc rsi: ffff8300003233c8 rdi: ffff8300001e0e6c
(XEN) rbp: ffff8300001ff008 rsp: ffff8300001ff000 r8: 0000000000000000
(XEN) r9: 00000000deadbeef r10: ffff8300001fff28 r11: 0000000000000246
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 00000000cabeb000
(XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) serial_start_sync+0x33/0xd1
(XEN) RFLAGS: 0000000000010046 CONTEXT: hypervisor
(XEN) rax: ffff8300001e03c0 rbx: ffff830000218300 rcx: 000000000000000f
(XEN) rdx: 0000000000000000 rsi: ffff83000030b218 rdi: 0000000000000008
(XEN) rbp: ffff830000fe3008 rsp: ffff830000fe2fe8 r8: 0000000000000000
(XEN) r9: 00000000deadbeef r10: ffff830000fe3f28 r11: 0000000000000246
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 00000000dc055000
(XEN) ds: 002b es: 002b fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) Xen stack trace from rsp=ffff830000fe2fe8:
(XEN) ----[ Xen-3.0-unstable Not tainted ]----
(XEN) CPU: 0
(XEN) RIP: e010:[<ffff8300001b614b>](file=extable.c, line=77) Pre-exception:
ffff830000152134 -> 0000000000000000
(XEN) r9: 00000000deadbeef r10: ffff830000fd7f28 r11: 0000000000000246
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN)
(XEN) RFLAGS: 0000000000010282 CONTEXT: hypervisor
(XEN) rax: 0000000000000000 rbx: ffff830000218300 rcx: 0000000000000001
(XEN) rdx: 0000000000000000 rsi: 0000000000000020 rdi: ffff83000031b080
(XEN) rbp: ffff83000031b018 rsp: ffff83000031b000 r8: 0000000000000000
(XEN) r9: 00000000deadbeef r10: ffff83000031bf28 r11: 0000000000000246
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 00000000cc143000
(XEN) ds: 002b es: 002b fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) Xen stack trace from rsp=ffff83000031b000:
(XEN) ffff83000014efc4Assertion 'diff < STACK_SIZE' failed, line 38, file
traps.c
(XEN) BUG at traps.c:38
(XEN) 0000002000000000 bitmap_weight+0xc/0x1drax: 0000000000000020 rbx:
ffff8300003e8080 rcx: 0000000000000001
(XEN) Xen stack trace from rsp=ffff8300001ff000:
(XEN) rdx: 00000000fffffffb rsi: 0000000000000020 rdi: ffff830000ff70d0
(XEN) rbp: ffff830000ff7028 rsp: ffff830000ff6ff8 r8: 0000000000000000
(XEN) r9: 00000000deadbeef r10: ffff830000ff7f28 r11: 0000000000000202
(XEN) r12: 00000000fee1dead r13: 0000000000000008 r14: 0000000000000001
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 00000000c9671000
(XEN) (file=extable.c, line=77) Pre-exception: ffff8300001b62e9 ->
0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 00000000c5505000
(XEN) ds: 002b es: 002b fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) ----[ Xen-3.0-unstable Not tainted ]----
(XEN) CPU: 0
(XEN) RIP: e010:[<ffff8300001b62e9>]
(XEN) RFLAGS: 0000000000010282 CONTEXT: hypervisor
(XEN) rax: 0000000000000000 rbx: ffff830000218300 rcx: 0000000000000001
(XEN) rdx: 0000000000000000 rsi: 0000000000000020 rdi: ffff830000357090
(XEN) show_registers+0x249/0x718
(XEN) RFLAGS: 0000000000010082 CONTEXT: hypervisor
(XEN) rax: ffff8300001e0304 rbx: ffff830000218300 rcx: 0000000000001e8d
(XEN) rdx: 0000000000000000 rsi: 0000000000000001 rdi: ffff8300001e0304
(XEN) rbp: ffff830000fe0058 rsp: ffff830000fdfee8 r8: 00000000ffffffff
(XEN) r9: 00000000ffffffff r10: ffff83000021a9bf r11: ffff83000021a5cf
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 00000000dc055000
(XEN) ds: 002b es: 002b fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) Xen stack trace from rsp=ffff830000fdfee8:
(XEN) 0a00000000000400 ffff8300001e0434 ffff830000129df3 0000000000000000
show_registers+0xab/0x718 ffff830000fdff38 0000000000000082
(XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) ffff83000014d23fXen stack trace from rsp=ffff830000ff6ff8:
(XEN) ffff8300001ff028Assertion 'diff < STACK_SIZE' failed, line 31, file
traps.c
(XEN) BUG at traps.c:31
(XEN) ffff83000014e3de ffff8300003233c8
(XEN) 0000000000000000 ffff8300001ff058
(XEN) RFLAGS: 0000000000010082 CONTEXT: hypervisor
(XEN) rax: ffff8300001e0304 rbx: ffff830000fea080 rcx: 00000000000010c0
(XEN) rdx: 0000000000000000 rsi: 0000000000000001 rdi: ffff8300001e0304
(XEN) rbp: ffff830000fe3f28 rsp: ffff830000fe3db8 r8: 00000000ffffffff
(XEN) r9: 00000000ffffffff r10: ffff83000021a9bf r11: ffff83000021a5cf
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 00000000cc143000
(XEN) ds: 002b es: 002b fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) Xen stack trace from rsp=ffff830000fe3db8:
(XEN)
(XEN) ffff83000031b080 ffff83000031b088
(XEN) 0aff830000fe3de8 0000000000000000 0000000000000000 ffff8300001e0434
ffff83000014ee9e 0000000000000000 ffff830000fe3e08 ffff830000313028
ffff83000014f0e1 fffffffffffffe80 00ff83000021a9bf 0000000000000082
ffff8300001e03c0
(XEN) ffff8300001cb9a2 0000000a00124738 0000000a00fdff58 0000000000000001
(XEN) 0000000a00000400 ffff8300001e0304 ffff830000fe0068 0000000000000082
(XEN) ffff8300001268a7 0000000000000082 0000000000000000 0000000000000000
(XEN) 0000000000000000 0000000000000000 ffff830000fe0188
ffff830000218300Assertion 'diff < STACK_SIZE' failed, line 31, file traps.c
(XEN) number+0x47/0x38f
(XEN) RFLAGS: 0000000000010046 CONTEXT: hypervisor
(XEN) rax: 0000000000000000 rbx: ffff8300001cb4c0 rcx: 000000000000000a
(XEN) rdx: 0000000000000025 rsi: ffff83000021a9bf rdi: ffff83000021a5ee
(XEN) rbp: ffff83000037f0a8 rsp: ffff83000037efe8 r8: 00000000ffffffff
(XEN) r9: 00000000ffffffff r10: ffff83000021a9bf r11: ffff83000021a5ee
(XEN) r12: 0000000000000000 r13: 0000000000000000 r14: 0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 00000000c965d000
(XEN) ds: 002b es: 002b fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) Xen stack trace from rsp=ffff83000037efe8:
(XEN) ----[ Xen-3.0-unstable Not tainted ]----
(XEN) 0000000000000000 0000000000000082 00007cffffcecfb7 0000000000000000
(XEN)
(XEN) (file=extable.c, line=77) Pre-exception: ffff830000152134 ->
0000000000000000
(XEN) 0000000000000000CPU: 1016
(XEN) RIP: e010:[<ffff8300001b614b>]
(XEN) ffffffffffffffb0
(XEN) 00ff83000021a9bf ffff8300001394e0 show_registers+0xab/0x718
0000000000000000 ffff83000014e3ae
(XEN) ffff83000021a5fb ffff83000021a9bf 00000000ffffffff 0000000000000010
(XEN) ffff830000fe0178 ffff8300001ff068 00007cffffe00f77
(XEN) RFLAGS: 0000000000010282 CONTEXT: hypervisor
(XEN) rax: ffff8300001e0304 rbx: ffff8300003e8080 rcx: 00000000000026e3
(XEN) rdx: 0000000000000000 rsi: 0000000000000001 rdi: ffff8300001e0304
(XEN) rbp: ffff830000ff4068 rsp: ffff830000ff3ef8 r8: 00000000ffffffff
(XEN) r9: 00000000ffffffff r10: ffff83000021a9bf r11: ffff83000021a5cf
(XEN) r12: 00000000fee1dead r13: 0000000000000008 r14: 0000000000000001
(XEN) 0000000000001a5c 0000000000000000 0000000000000000 0000000000000000
0000000000000082 0000000000000000 0000000000000000 0000000000000000
ffff8300001e03c0 ffff83000031b098
(XEN) 0000000000000000 0000000000000000
(XEN) 0000004e00000000 0000000000000000 0000000000000000
(XEN) ffff830000fe2fe8 00000000000003f8 0000000000000000 ffff8300001394a0
0000000000000000 0000000000000000 0000000000000000
(XEN) r15: 0000000000000000 cr0: 000000008005003b cr3: 00000000c9671000
(XEN) ds: 0000 es: 0000 fs: 0000 gs: 0000 ss: e018 cs: e010
(XEN) 0000000000000000 ffff8300001ff118Xen stack trace from
rsp=ffff830000ff3ef8:
(XEN) Xen stack trace from rsp=ffff830000fd7000:
(XEN) ffff830000129df3
(XEN) ffff8300001e0304 0000000e00000000 ffff830000152134
(XEN) Xen call trace:
(XEN) [<ffff8300001b62e9>] 0000000000000000 ffff830000fd7028 00007cffff028fb7
(XEN) show_registers+0x249/0x718
(XEN)
(XEN) ************************************
(XEN) CPU0 FATAL TRAP 6 (invalid opcode), ERROR_CODE 0000, IN INTERRUPT CONTEXT.
(XEN) System shutting down -- need manual reset.
(XEN) ************************************
--
Ryan Harper
Software Engineer; Linux Technology Center
IBM Corp., Austin, Tx
(512) 838-9253 T/L: 678-9253
ryanh@xxxxxxxxxx
diffstat output:
shutdown.c | 8 +++++++-
1 files changed, 7 insertions(+), 1 deletion(-)
Signed-off-by: Ryan Harper <ryanh@xxxxxxxxxx>
---
# HG changeset patch
# User rharper@xxxxxxxxxxxxxxxxxxxxxxxxx
# Node ID afe3fd9d527c05c93b6ea1d9499d9add51db8383
# Parent 428babd7c1e0a51aadb03a4af2e5607a364aebd8
Occasionally large SMP machines fail to reboot properly and die under an IPI
storm of smp_call_function() to machine_restart. Only the boot processor needs
to run machine_restart, so send an IPI to CPU0 only.
diff -r 428babd7c1e0 -r afe3fd9d527c xen/arch/x86/shutdown.c
--- a/xen/arch/x86/shutdown.c Tue Apr 25 10:12:16 2006
+++ b/xen/arch/x86/shutdown.c Thu Apr 27 06:27:43 2006
@@ -20,6 +20,7 @@
#include <xen/irq.h>
#include <xen/console.h>
#include <asm/msr.h>
+#include <asm/smpboot.h>
/* opt_noreboot: If true, machine will need manual reset on error. */
static int opt_noreboot = 0;
@@ -218,7 +219,12 @@
/* Ensure we are the boot CPU. */
if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid )
{
- smp_call_function((void *)machine_restart, NULL, 1, 0);
+ cpumask_t mask = CPU_MASK_NONE;
+
+ cpu_set(0, mask); /* boot CPU is logical cpu 0 */
+
+ /* only send IPI to boot CPU. */
+ on_selected_cpus(mask, (void *)machine_restart, NULL, 1, 0);
for ( ; ; )
safe_halt();
}
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|