[patch 4/4] Enable CMCI for Intel CPUs -- main patch for CMCI support This is the main patch enabling CMCI in Xen. It adds the CMCI interrupt handler, a new common CMCI/MCA init process, the CMCI owner-selection algorithm used when CPUs are brought up or taken online/offline, the polling mechanism, etc. Signed-off-by Yunhong Jiang Signed-off-by Liping Ke diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/Makefile --- a/xen/arch/x86/cpu/mcheck/Makefile Thu Dec 18 14:38:19 2008 +0800 +++ b/xen/arch/x86/cpu/mcheck/Makefile Sun Jan 02 00:25:07 2005 +0800 @@ -3,8 +3,7 @@ obj-y += amd_k8.o obj-y += amd_f10.o obj-y += mce.o +obj-y += mce_intel.o obj-y += non-fatal.o -obj-y += p4.o obj-$(x86_32) += p5.o -obj-$(x86_32) += p6.o obj-$(x86_32) += winchip.o diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/k7.c --- a/xen/arch/x86/cpu/mcheck/k7.c Thu Dec 18 14:38:19 2008 +0800 +++ b/xen/arch/x86/cpu/mcheck/k7.c Sun Jan 02 00:25:07 2005 +0800 @@ -14,6 +14,7 @@ #include #include "mce.h" +#include "x86_mca.h" /* Machine Check Handler For AMD Athlon/Duron */ static fastcall void k7_machine_check(struct cpu_user_regs * regs, long error_code) diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/mce.c --- a/xen/arch/x86/cpu/mcheck/mce.c Thu Dec 18 14:38:19 2008 +0800 +++ b/xen/arch/x86/cpu/mcheck/mce.c Sun Jan 02 00:25:07 2005 +0800 @@ -27,7 +27,7 @@ * to physical cpus present in the machine. * The more physical cpus are available, the more entries you need. 
*/ -#define MAX_MCINFO 10 +#define MAX_MCINFO 20 struct mc_machine_notify { struct mc_info mc; @@ -110,6 +110,22 @@ } } +/*check the existence of Machine Check*/ +int mce_available(struct cpuinfo_x86 *c) +{ + return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA); +} + +/*Make sure there are no machine check on offlined or suspended CPUs*/ +void mce_disable_cpu(void) +{ + if (!mce_available(&current_cpu_data) || mce_disabled == 1) + return; + printk(KERN_DEBUG "MCE: disable mce on CPU%d\n", smp_processor_id()); + clear_in_cr4(X86_CR4_MCE); +} + + /* This has to be run for each processor */ void mcheck_init(struct cpuinfo_x86 *c) { @@ -135,11 +151,13 @@ #ifndef CONFIG_X86_64 if (c->x86==5) intel_p5_mcheck_init(c); - if (c->x86==6) - intel_p6_mcheck_init(c); #endif - if (c->x86==15) - intel_p4_mcheck_init(c); + /*If it is P6 or P4 family, including CORE 2 DUO series*/ + if (c->x86 == 6 || c->x86==15) + { + printk(KERN_DEBUG "MCE: Intel newly family MC Init\n"); + intel_mcheck_init(c); + } break; #ifndef CONFIG_X86_64 @@ -181,7 +199,7 @@ entry = mc_data.error_idx; smp_rmb(); next = entry + 1; - if (cmpxchg(&mc_data.error_idx, entry, next) == entry) + if (cmpxchg(&mc_data.error_idx, entry, next) == entry) break; } @@ -231,8 +249,7 @@ *fetch_idx = mc_data.fetch_idx; mc_data.fetch_idx++; - BUG_ON(mc_data.fetch_idx > mc_data.error_idx); - + BUG_ON(mc_data.fetch_idx > mc_data.error_idx); return mi; } @@ -392,18 +409,24 @@ struct mcinfo_bank *mc_bank; /* first print the global info */ + printk(KERN_DEBUG "MCE: Dump machine check data\n"); x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL); - if (mic == NULL) + if (mic == NULL) { + printk(XENLOG_WARNING "MCE: global info NULL on CPU%d\n", + smp_processor_id()); return; + } mc_global = (struct mcinfo_global *)mic; - if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) { + if (mc_global->mc_flags & + (MC_FLAG_UNCORRECTABLE | MC_FLAG_RECOVERABLE)) { printk(XENLOG_WARNING - "CPU%d: Machine Check Exception: %16"PRIx64"\n", + "CORE%d 
CPU%d: Machine Check Exception: %16"PRIx64"\n", + mc_global->mc_socketid, mc_global->mc_coreid, mc_global->mc_gstatus); } else { printk(XENLOG_WARNING "MCE: The hardware reports a non " "fatal, correctable incident occured on " - "CPU %d.\n", + "CORE%d CPU %d.\n", mc_global->mc_socketid, mc_global->mc_coreid); } @@ -413,7 +436,7 @@ if (mic == NULL) return; if (mic->type != MC_TYPE_BANK) - continue; + goto next; mc_bank = (struct mcinfo_bank *)mic; @@ -426,6 +449,7 @@ printk(" at %16"PRIx64, mc_bank->mc_addr); printk("\n"); +next: mic = x86_mcinfo_next(mic); /* next entry */ if ((mic == NULL) || (mic->size == 0)) break; diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/mce.h --- a/xen/arch/x86/cpu/mcheck/mce.h Thu Dec 18 14:38:19 2008 +0800 +++ b/xen/arch/x86/cpu/mcheck/mce.h Sun Jan 02 00:25:07 2005 +0800 @@ -1,14 +1,22 @@ #include +#include #include +#include +#include + /* Init functions */ void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c); void amd_k7_mcheck_init(struct cpuinfo_x86 *c); void amd_k8_mcheck_init(struct cpuinfo_x86 *c); void amd_f10_mcheck_init(struct cpuinfo_x86 *c); -void intel_p4_mcheck_init(struct cpuinfo_x86 *c); + + +void intel_mcheck_timer(struct cpuinfo_x86 *c); void intel_p5_mcheck_init(struct cpuinfo_x86 *c); -void intel_p6_mcheck_init(struct cpuinfo_x86 *c); +void intel_mcheck_init(struct cpuinfo_x86 *c); +void mce_intel_feature_init(struct cpuinfo_x86 *c); + void winchip_mcheck_init(struct cpuinfo_x86 *c); /* Function pointer used in the handlers to collect additional information @@ -19,6 +27,7 @@ uint16_t bank, uint64_t status); +int mce_available(struct cpuinfo_x86 *c); /* Helper functions used for collecting error telemetry */ struct mc_info *x86_mcinfo_getptr(void); void x86_mcinfo_clear(struct mc_info *mi); @@ -26,6 +35,3 @@ void x86_mcinfo_dump(struct mc_info *mi); void mc_panic(char *s); -/* Global variables */ -extern int mce_disabled; -extern unsigned int nr_mce_banks; diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/mce_intel.c --- 
/dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Sun Jan 02 00:25:07 2005 +0800 @@ -0,0 +1,678 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mce.h" +#include "x86_mca.h" + +DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned); + +static int nr_intel_ext_msrs = 0; +static int cmci_support = 0; +extern int firstbank; + +static inline void intel_get_extended_msrs(struct mcinfo_extended *mc_ext) +{ + if (nr_intel_ext_msrs == 0) + return; + + /*this function will called when CAP(9).MCG_EXT_P = 1*/ + memset(mc_ext, 0, sizeof(struct mcinfo_extended)); + mc_ext->common.type = MC_TYPE_EXTENDED; + mc_ext->common.size = sizeof(mc_ext); + mc_ext->mc_msrs = 10; + + mc_ext->mc_msr[0].reg = MSR_IA32_MCG_EAX; + rdmsrl(MSR_IA32_MCG_EAX, mc_ext->mc_msr[0].value); + mc_ext->mc_msr[1].reg = MSR_IA32_MCG_EBX; + rdmsrl(MSR_IA32_MCG_EBX, mc_ext->mc_msr[1].value); + mc_ext->mc_msr[2].reg = MSR_IA32_MCG_ECX; + rdmsrl(MSR_IA32_MCG_ECX, mc_ext->mc_msr[2].value); + + mc_ext->mc_msr[3].reg = MSR_IA32_MCG_EDX; + rdmsrl(MSR_IA32_MCG_EDX, mc_ext->mc_msr[3].value); + mc_ext->mc_msr[4].reg = MSR_IA32_MCG_ESI; + rdmsrl(MSR_IA32_MCG_ESI, mc_ext->mc_msr[4].value); + mc_ext->mc_msr[5].reg = MSR_IA32_MCG_EDI; + rdmsrl(MSR_IA32_MCG_EDI, mc_ext->mc_msr[5].value); + + mc_ext->mc_msr[6].reg = MSR_IA32_MCG_EBP; + rdmsrl(MSR_IA32_MCG_EBP, mc_ext->mc_msr[6].value); + mc_ext->mc_msr[7].reg = MSR_IA32_MCG_ESP; + rdmsrl(MSR_IA32_MCG_ESP, mc_ext->mc_msr[7].value); + mc_ext->mc_msr[8].reg = MSR_IA32_MCG_EFLAGS; + rdmsrl(MSR_IA32_MCG_EFLAGS, mc_ext->mc_msr[8].value); + mc_ext->mc_msr[9].reg = MSR_IA32_MCG_EIP; + rdmsrl(MSR_IA32_MCG_EIP, mc_ext->mc_msr[9].value); +} + +#ifdef CONFIG_X86_MCE_THERMAL +static void unexpected_thermal_interrupt(struct cpu_user_regs *regs) +{ + printk(KERN_ERR "Thermal: CPU%d: Unexpected LVT TMR interrupt!\n", + smp_processor_id()); + add_taint(TAINT_MACHINE_CHECK); +} + +/* P4/Xeon Thermal transition 
interrupt handler */ +static void intel_thermal_interrupt(struct cpu_user_regs *regs) +{ + u32 l, h; + unsigned int cpu = smp_processor_id(); + static s_time_t next[NR_CPUS]; + + ack_APIC_irq(); + if (NOW() < next[cpu]) + return; + + next[cpu] = NOW() + MILLISECS(5000); + rdmsr(MSR_IA32_THERM_STATUS, l, h); + if (l & 0x1) { + printk(KERN_EMERG "CPU%d: Temperature above threshold\n", cpu); + printk(KERN_EMERG "CPU%d: Running in modulated clock mode\n", + cpu); + add_taint(TAINT_MACHINE_CHECK); + } else { + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + } +} + +/* Thermal interrupt handler for this CPU setup */ +static void (*vendor_thermal_interrupt)(struct cpu_user_regs *regs) + = unexpected_thermal_interrupt; + +fastcall void smp_thermal_interrupt(struct cpu_user_regs *regs) +{ + irq_enter(); + vendor_thermal_interrupt(regs); + irq_exit(); +} + +/* P4/Xeon Thermal regulation detect and init */ +static void intel_init_thermal(struct cpuinfo_x86 *c) +{ + u32 l, h; + int tm2 = 0; + unsigned int cpu = smp_processor_id(); + + /* Thermal monitoring */ + if (!cpu_has(c, X86_FEATURE_ACPI)) + return; /* -ENODEV */ + + /* Clock modulation */ + if (!cpu_has(c, X86_FEATURE_ACC)) + return; /* -ENODEV */ + + /* first check if its enabled already, in which case there might + * be some SMM goo which handles it, so we can't even put a handler + * since it might be delivered via SMI already -zwanem. + */ + rdmsr (MSR_IA32_MISC_ENABLE, l, h); + h = apic_read(APIC_LVTTHMR); + if ((l & (1<<3)) && (h & APIC_DM_SMI)) { + printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n",cpu); + return; /* -EBUSY */ + } + + if (cpu_has(c, X86_FEATURE_TM2) && (l & (1 << 13))) + tm2 = 1; + + /* check whether a vector already exists, temporarily masked? 
*/ + if (h & APIC_VECTOR_MASK) { + printk(KERN_DEBUG "CPU%d: Thermal LVT vector (%#x) already installed\n", + cpu, (h & APIC_VECTOR_MASK)); + return; /* -EBUSY */ + } + + /* The temperature transition interrupt handler setup */ + h = THERMAL_APIC_VECTOR; /* our delivery vector */ + h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ + apic_write_around(APIC_LVTTHMR, h); + + rdmsr (MSR_IA32_THERM_INTERRUPT, l, h); + wrmsr (MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); + + /* ok we're good to go... */ + vendor_thermal_interrupt = intel_thermal_interrupt; + + rdmsr (MSR_IA32_MISC_ENABLE, l, h); + wrmsr (MSR_IA32_MISC_ENABLE, l | (1<<3), h); + + l = apic_read (APIC_LVTTHMR); + apic_write_around (APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + printk (KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", + cpu, tm2 ? "TM2" : "TM1"); + return; +} +#endif /* CONFIG_X86_MCE_THERMAL */ + +/* machine_check_poll might be called by following types: + * 1. called when do mcheck_init. + * 2. called in cmci interrupt handler + * 3. called in polling handler + * It will generate a new mc_info item if found CE/UC errors. DOM0 is the + * consumer. +*/ +static int machine_check_poll(struct mc_info *mi, int calltype) +{ + int exceptions = (read_cr4() & X86_CR4_MCE); + int i, nr_unit = 0, uc = 0, pcc = 0; + uint64_t status, addr; + struct mcinfo_global mcg; + struct mcinfo_extended mce; + unsigned int cpu; + struct domain *d; + + cpu = smp_processor_id(); + + if (!mi) { + printk(KERN_ERR "mcheck_poll: Failed to get mc_info entry\n"); + return 0; + } + x86_mcinfo_clear(mi); + + memset(&mcg, 0, sizeof(mcg)); + mcg.common.type = MC_TYPE_GLOBAL; + mcg.common.size = sizeof(mcg); + /*If called from cpu-reset check, don't need to fill them. 
+ *If called from cmci context, we'll try to fill domid by memory addr + */ + mcg.mc_domid = -1; + mcg.mc_vcpuid = -1; + if (calltype == MC_FLAG_POLLED || calltype == MC_FLAG_RESET) + mcg.mc_flags = MC_FLAG_POLLED; + else if (calltype == MC_FLAG_CMCI) + mcg.mc_flags = MC_FLAG_CMCI; + mcg.mc_socketid = phys_proc_id[cpu]; + mcg.mc_coreid = cpu_core_id[cpu]; + mcg.mc_apicid = cpu_physical_id(cpu); + mcg.mc_core_threadid = mcg.mc_apicid & ( 1 << (smp_num_siblings - 1)); + rdmsrl(MSR_IA32_MCG_STATUS, mcg.mc_gstatus); + + for ( i = 0; i < nr_mce_banks; i++ ) { + struct mcinfo_bank mcb; + if (!test_bit(i, __get_cpu_var(mce_banks_owned))) + continue; + rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status); + + if (! (status & MCi_STATUS_VAL) ) + continue; + /* + * Uncorrected events are handled by the exception + * handler when it is enabled. But when the exception + * is disabled such as when mcheck_init, log everything. + */ + if ((status & MCi_STATUS_UC) && exceptions) + continue; + + if (status & MCi_STATUS_UC) + uc = 1; + if (status & MCi_STATUS_PCC) + pcc = 1; + + memset(&mcb, 0, sizeof(mcb)); + mcb.common.type = MC_TYPE_BANK; + mcb.common.size = sizeof(mcb); + mcb.mc_bank = i; + mcb.mc_status = status; + if (status & MCi_STATUS_MISCV) + rdmsrl(MSR_IA32_MC0_MISC + 4 * i, mcb.mc_misc); + if (status & MCi_STATUS_ADDRV) { + rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr); + d = maddr_get_owner(addr); + if ( d && (calltype == MC_FLAG_CMCI || calltype == MC_FLAG_POLLED) ) + mcb.mc_domid = d->domain_id; + } + if (cmci_support) + rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2); + if (calltype == MC_FLAG_CMCI) + rdtscll(mcb.mc_tsc); + x86_mcinfo_add(mi, &mcb); + nr_unit++; + add_taint(TAINT_MACHINE_CHECK); + /*Clear state for this bank */ + wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0); + printk(KERN_DEBUG "mcheck_poll: bank%i CPU%d status[%lx]\n", + i, cpu, status); + printk(KERN_DEBUG "mcheck_poll: CPU%d, SOCKET%d, CORE%d, APICID[%d], " + "thread[%d]\n", cpu, mcg.mc_socketid, + mcg.mc_coreid, 
mcg.mc_apicid, mcg.mc_core_threadid); + + } + /*if pcc = 1, uc must be 1*/ + if (pcc) + mcg.mc_flags |= MC_FLAG_UNCORRECTABLE; + else if (uc) + mcg.mc_flags |= MC_FLAG_RECOVERABLE; + else /*correctable*/ + mcg.mc_flags |= MC_FLAG_CORRECTABLE; + + if (nr_unit && nr_intel_ext_msrs && + (mcg.mc_gstatus & MCG_STATUS_EIPV)) { + intel_get_extended_msrs(&mce); + x86_mcinfo_add(mi, &mce); + } + if (nr_unit) + x86_mcinfo_add(mi, &mcg); + /*Clear global state*/ + return nr_unit; +} + +static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code) +{ + /* MACHINE CHECK Error handler will be sent in another patch, + * simply copy old solutions here. This code will be replaced + * by upcoming machine check patches + */ + + int recover=1; + u32 alow, ahigh, high, low; + u32 mcgstl, mcgsth; + int i; + + rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); + if (mcgstl & (1<<0)) /* Recoverable ? */ + recover=0; + + printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n", + smp_processor_id(), mcgsth, mcgstl); + + for (i=0; ivcpu[0], VIRQ_MCA)) + send_guest_global_virq(dom0, VIRQ_MCA); + } + irq_exit(); +} + +void mce_intel_feature_init(struct cpuinfo_x86 *c) +{ + +#ifdef CONFIG_X86_MCE_THERMAL + intel_init_thermal(c); +#endif + intel_init_cmci(c); +} + +static void mce_cap_init(struct cpuinfo_x86 *c) +{ + u32 l, h; + + rdmsr (MSR_IA32_MCG_CAP, l, h); + if ((l & MCG_CMCI_P) && cpu_has_apic) + cmci_support = 1; + + nr_mce_banks = l & 0xff; + if (nr_mce_banks > MAX_NR_BANKS) + printk(KERN_WARNING "MCE: exceed max mce banks\n"); + if (l & MCG_EXT_P) + { + nr_intel_ext_msrs = (l >> MCG_EXT_CNT) & 0xff; + printk (KERN_INFO "CPU%d: Intel Extended MCE MSRs (%d) available\n", + smp_processor_id(), nr_intel_ext_msrs); + } + /* for most of p6 family, bank 0 is an alias bios MSR. 
+ * But after model>1a, bank 0 is available*/ + if ( c->x86 == 6 && c->x86_vendor == X86_VENDOR_INTEL + && c->x86_model < 0x1A) + firstbank = 1; + else + firstbank = 0; +} + +static void mce_init(void) +{ + u32 l, h; + int i, nr_unit; + struct mc_info *mi = x86_mcinfo_getptr(); + clear_in_cr4(X86_CR4_MCE); + /* log the machine checks left over from the previous reset. + * This also clears all registers*/ + + nr_unit = machine_check_poll(mi, MC_FLAG_RESET); + /*in the boot up stage, not expect inject to DOM0, but go print out + */ + if (nr_unit > 0) + x86_mcinfo_dump(mi); + + set_in_cr4(X86_CR4_MCE); + rdmsr (MSR_IA32_MCG_CAP, l, h); + if (l & MCG_CTL_P) /* Control register present ? */ + wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); + + for (i = firstbank; i < nr_mce_banks; i++) + { + /*Some banks are shared across cores, use MCi_CTRL to judge whether + * this bank has been initialized by other cores already.*/ + rdmsr(MSR_IA32_MC0_CTL + 4*i, l, h); + if (!l & !h) + { + /*if ctl is 0, this bank is never initialized*/ + printk(KERN_DEBUG "mce_init: init bank%d\n", i); + wrmsr (MSR_IA32_MC0_CTL + 4*i, 0xffffffff, 0xffffffff); + wrmsr (MSR_IA32_MC0_STATUS + 4*i, 0x0, 0x0); + } + } + if (firstbank) /*if cmci enabled, firstbank = 0*/ + wrmsr (MSR_IA32_MC0_STATUS, 0x0, 0x0); +} + +/*p4/p6 faimily has similar MCA initialization process*/ +void intel_mcheck_init(struct cpuinfo_x86 *c) +{ + + mce_cap_init(c); + printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n", + smp_processor_id()); + /* machine check is available */ + machine_check_vector = intel_machine_check; + mce_init(); + mce_intel_feature_init(c); + mce_set_owner(); +} + +/* + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll faster. 
When the poller finds no more + * errors, poll slower +*/ +static struct timer mce_timer; + +#define MCE_PERIOD 4000 +#define MCE_MIN 2000 +#define MCE_MAX 32000 + +static u64 period = MCE_PERIOD; +static int adjust = 0; + +static void mce_intel_checkregs(void *info) +{ + int nr_unit; + struct mc_info *mi = x86_mcinfo_getptr(); + + if( !mce_available(&current_cpu_data)) + return; + nr_unit = machine_check_poll(mi, MC_FLAG_POLLED); + if (nr_unit) + { + x86_mcinfo_dump(mi); + adjust++; + if (dom0 && guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) + send_guest_global_virq(dom0, VIRQ_MCA); + } +} + +static void mce_intel_work_fn(void *data) +{ + on_each_cpu(mce_intel_checkregs, data, 1, 1); + if (adjust) { + period = period / (adjust + 1); + printk(KERN_DEBUG "mcheck_poll: Find error, shorten interval to %ld", + period); + } + else { + period *= 2; + } + if (period > MCE_MAX) + period = MCE_MAX; + if (period < MCE_MIN) + period = MCE_MIN; + set_timer(&mce_timer, NOW() + MILLISECS(period)); + adjust = 0; +} + +void intel_mcheck_timer(struct cpuinfo_x86 *c) +{ + printk(KERN_DEBUG "mcheck_poll: Init_mcheck_timer\n"); + init_timer(&mce_timer, mce_intel_work_fn, NULL, 0); + set_timer(&mce_timer, NOW() + MILLISECS(MCE_PERIOD)); +} diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/non-fatal.c --- a/xen/arch/x86/cpu/mcheck/non-fatal.c Thu Dec 18 14:38:19 2008 +0800 +++ b/xen/arch/x86/cpu/mcheck/non-fatal.c Sun Jan 02 00:25:07 2005 +0800 @@ -19,8 +19,8 @@ #include #include "mce.h" - -static int firstbank; +#include "x86_mca.h" +int firstbank = 0; static struct timer mce_timer; #define MCE_PERIOD MILLISECS(15000) @@ -61,13 +61,8 @@ struct cpuinfo_x86 *c = &boot_cpu_data; /* Check for MCE support */ - if (!cpu_has(c, X86_FEATURE_MCE)) + if (!mce_available(c)) return -ENODEV; - - /* Check for PPro style MCA */ - if (!cpu_has(c, X86_FEATURE_MCA)) - return -ENODEV; - /* * Check for non-fatal errors every MCE_RATE s */ @@ -85,12 +80,20 @@ break; case X86_VENDOR_INTEL: - init_timer(&mce_timer, 
mce_work_fn, NULL, 0); - set_timer(&mce_timer, NOW() + MCE_PERIOD); + /* p5 family is different. P4/P6 and latest CPUs shares the + * same polling methods + */ + if ( c->x86 != 5 ) + { + /* some CPUs or banks don't support cmci, we need to + * enable this feature anyway + */ + intel_mcheck_timer(c); + } break; } - printk(KERN_INFO "MCA: Machine check polling timer started.\n"); + printk(KERN_INFO "mcheck_poll: Machine check polling timer started.\n"); return 0; } __initcall(init_nonfatal_mce_checker); diff -r a2069e8a3055 xen/arch/x86/cpu/mcheck/x86_mca.h --- a/xen/arch/x86/cpu/mcheck/x86_mca.h Thu Dec 18 14:38:19 2008 +0800 +++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Sun Jan 02 00:25:07 2005 +0800 @@ -28,7 +28,10 @@ /* Bitfield of the MSR_IA32_MCG_CAP register */ #define MCG_CAP_COUNT 0x00000000000000ffULL #define MCG_CTL_P 0x0000000000000100ULL -/* Bits 9-63 are reserved */ +#define MCG_EXT_P (1UL<<9) +#define MCG_EXT_CNT (16) +#define MCG_CMCI_P (1UL<<10) +/* Other bits are reserved */ /* Bitfield of the MSR_IA32_MCG_STATUS register */ #define MCG_STATUS_RIPV 0x0000000000000001ULL @@ -70,3 +73,17 @@ /* reserved bits */ #define MCi_STATUS_OTHER_RESERVED2 0x0180000000000000ULL +/*Intel Specific bitfield*/ +#define CMCI_THRESHOLD 0x2 + + +#define MAX_NR_BANKS 128 + +typedef DECLARE_BITMAP(cpu_banks_t, MAX_NR_BANKS); +DECLARE_PER_CPU(cpu_banks_t, mce_banks_owned); + +/* Global variables */ +extern int mce_disabled; +extern unsigned int nr_mce_banks; +extern int firstbank; + diff -r a2069e8a3055 xen/arch/x86/smpboot.c --- a/xen/arch/x86/smpboot.c Thu Dec 18 14:38:19 2008 +0800 +++ b/xen/arch/x86/smpboot.c Sun Jan 02 00:25:07 2005 +0800 @@ -1237,11 +1237,24 @@ } extern void fixup_irqs(cpumask_t map); -int __cpu_disable(void) + +/* + * Called when offline cpu. 
We need to process some new + * feature such as CMCI owner change in latest Intel + * CPU families +*/ +void (*cpu_down_handler)(int down_cpu) = NULL; +void (*cpu_down_rollback_handler)(int down_cpu) = NULL; + + +int __cpu_disable(int down_cpu) { cpumask_t map = cpu_online_map; int cpu = smp_processor_id(); + /*Only down_cpu need to execute this function*/ + if (cpu != down_cpu) + return 0; /* * Perhaps use cpufreq to drop frequency, but that could go * into generic code. @@ -1293,10 +1306,14 @@ } printk(KERN_ERR "CPU %u didn't die...\n", cpu); } +static int take_cpu_down(void *down_cpu) +{ -static int take_cpu_down(void *unused) -{ - return __cpu_disable(); + if (cpu_down_handler) + cpu_down_handler(*(int *)down_cpu); + wmb(); + + return __cpu_disable(*(int *)down_cpu); } int cpu_down(unsigned int cpu) @@ -1322,7 +1339,7 @@ printk("Prepare to bring CPU%d down...\n", cpu); - err = stop_machine_run(take_cpu_down, NULL, cpu); + err = stop_machine_run(take_cpu_down, &cpu, cpu_online_map); if ( err < 0 ) goto out; @@ -1333,6 +1350,10 @@ err = -EBUSY; } out: + /*if cpu_offline failed, re-check cmci_owner*/ + + if ( err < 0 && cpu_down_rollback_handler) + cpu_down_rollback_handler(cpu); spin_unlock(&cpu_add_remove_lock); return err; } diff -r a2069e8a3055 xen/include/asm-x86/msr-index.h --- a/xen/include/asm-x86/msr-index.h Thu Dec 18 14:38:19 2008 +0800 +++ b/xen/include/asm-x86/msr-index.h Sun Jan 02 00:25:07 2005 +0800 @@ -92,8 +92,10 @@ #define MSR_IA32_MC0_STATUS 0x00000401 #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +#define MSR_IA32_MC0_CTL2 0x00000280 +#define CMCI_EN (1UL<<30) +#define CMCI_THRESHOLD_MASK 0x7FFF -#define MSR_IA32_MC1_CTL 0x00000404 #define MSR_IA32_MC1_STATUS 0x00000405 #define MSR_IA32_MC1_ADDR 0x00000406 #define MSR_IA32_MC1_MISC 0x00000407 diff -r a2069e8a3055 xen/include/asm-x86/smp.h --- a/xen/include/asm-x86/smp.h Thu Dec 18 14:38:19 2008 +0800 +++ b/xen/include/asm-x86/smp.h Sun Jan 02 00:25:07 2005 +0800 @@ 
-101,7 +101,7 @@ #endif -extern int __cpu_disable(void); +extern int __cpu_disable(int down_cpu); extern void __cpu_die(unsigned int cpu); #endif /* !__ASSEMBLY__ */ diff -r a2069e8a3055 xen/include/public/arch-x86/xen-mca.h --- a/xen/include/public/arch-x86/xen-mca.h Thu Dec 18 14:38:19 2008 +0800 +++ b/xen/include/public/arch-x86/xen-mca.h Sun Jan 02 00:25:07 2005 +0800 @@ -106,7 +106,10 @@ #define MC_FLAG_CORRECTABLE (1 << 0) #define MC_FLAG_UNCORRECTABLE (1 << 1) - +#define MC_FLAG_RECOVERABLE (1 << 2) +#define MC_FLAG_POLLED (1 << 3) +#define MC_FLAG_RESET (1 << 4) +#define MC_FLAG_CMCI (1 << 5) /* contains global x86 mc information */ struct mcinfo_global { struct mcinfo_common common; @@ -115,6 +118,7 @@ uint16_t mc_domid; uint32_t mc_socketid; /* physical socket of the physical core */ uint16_t mc_coreid; /* physical impacted core */ + uint8_t mc_apicid; uint16_t mc_core_threadid; /* core thread of physical core */ uint16_t mc_vcpuid; /* virtual cpu scheduled for mc_domid */ uint64_t mc_gstatus; /* global status */ @@ -132,6 +136,8 @@ uint64_t mc_addr; /* bank address, only valid * if addr bit is set in mc_status */ uint64_t mc_misc; + uint64_t mc_ctrl2; + uint64_t mc_tsc; }; @@ -150,7 +156,12 @@ * multiple times. */ uint32_t mc_msrs; /* Number of msr with valid values. */ - struct mcinfo_msr mc_msr[5]; + /* + * Currently Intel extended MSR (32/64) including all gp registers + * and E(R)DI, E(R)BP, E(R)SP, E(R)FLAGS, E(R)IP, E(R)MISC, only 10 + * of them might be useful. So expend this array to 10. + */ + struct mcinfo_msr mc_msr[10]; }; #define MCINFO_HYPERCALLSIZE 1024