# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1253605052 -3600
# Node ID 8c4685fc198ef4b5ea8accf30cb0b6b828cef54f
# Parent bcb6b95b30b13efa9635f8b8e1b7ff57c50dae3d
mca: Fix several issues for MCA UCR error handling
This patch fixes several issues in MCA UCR error handling on the
latest Intel platforms, including:
1) For UCR errors, the MCA error code is 0xC0 ~ 0xCF instead of just 0xC0
2) Synchronization issues when clearing the error-finding flag and the
global MCIP flag. Without this fix, in some cases the MCIP flag can't be cleared.
Signed-off-by: Liping Ke <liping.ke@xxxxxxxxx>
---
xen/arch/x86/cpu/mcheck/mce_intel.c | 73 +++++++++++++++---------------------
1 files changed, 32 insertions(+), 41 deletions(-)
diff -r bcb6b95b30b1 -r 8c4685fc198e xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c Tue Sep 22 08:36:40 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Tue Sep 22 08:37:32 2009 +0100
@@ -45,7 +45,6 @@ static atomic_t found_error = ATOMIC_INI
static void mce_barrier_enter(struct mce_softirq_barrier *);
static void mce_barrier_exit(struct mce_softirq_barrier *);
-static int mce_barrier_last(struct mce_softirq_barrier *);
#ifdef CONFIG_X86_MCE_THERMAL
static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
@@ -339,7 +338,7 @@ void intel_UCR_handler(struct mcinfo_ban
unsigned long mfn, gfn;
uint32_t status;
- printk(KERN_DEBUG "MCE: Enter EWB UCR recovery action\n");
+ printk(KERN_DEBUG "MCE: Enter UCR recovery action\n");
result->result = MCA_NEED_RESET;
if (bank->mc_addr != 0) {
mfn = bank->mc_addr >> PAGE_SHIFT;
@@ -430,8 +429,10 @@ static int mce_action(mctelem_cookie_t m
/* TODO: Add recovery actions here, such as page-offline, etc */
memset(&mca_res, 0x0f, sizeof(mca_res));
for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
- if ( (mc_bank->mc_status & 0xffff) ==
- intel_recovery_handler[i].mca_code ) {
+ if ( ((mc_bank->mc_status & 0xffff) ==
+ intel_recovery_handler[i].mca_code) ||
+ ((mc_bank->mc_status & 0xfff0) ==
+ intel_recovery_handler[i].mca_code)) {
/* For SRAR, OVER = 1 should have caused reset
* For SRAO, OVER = 1 skip recovery action, continue execution
*/
@@ -439,10 +440,10 @@ static int mce_action(mctelem_cookie_t m
intel_recovery_handler[i].recovery_handler
(mc_bank, mc_global, NULL, &mca_res);
else {
- if (!mc_global->mc_gstatus & MCG_STATUS_RIPV)
+ if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
mca_res.result = MCA_NEED_RESET;
else
- mca_res.result = MCA_NO_ACTION;
+ mca_res.result = MCA_NO_ACTION;
}
if (mca_res.result & MCA_OWNER)
mc_bank->mc_domid = mca_res.owner;
@@ -458,13 +459,14 @@ static int mce_action(mctelem_cookie_t m
"recover action, RIPV=1, let it be.\n");
break;
}
- /* For SRAR, no defined recovery action should have caused reset
- * in MCA Handler
- */
- if ( i >= INTEL_MAX_RECOVERY )
- printk(KERN_DEBUG "MCE: No software recovery action found for "
- "this SRAO error\n");
}
+ /* For SRAR, no defined recovery action should have caused reset
+ * in MCA Handler
+ */
+ if ( i >= INTEL_MAX_RECOVERY )
+ printk(KERN_DEBUG "MCE: No software recovery action found for "
+ "this SRAO error\n");
+
}
return 1;
}
@@ -622,16 +624,6 @@ static void mce_barrier_exit(struct mce_
}
}
-static int mce_barrier_last(struct mce_softirq_barrier *bar)
-{
- int gen = atomic_read(&bar->ingen);
- if ( atomic_read(&bar->ingen) == gen &&
- atomic_read(&bar->val) == 1 ) {
- return 1;
- }
- return 0;
-}
-
#if 0
static void mce_barrier(struct mce_softirq_barrier *bar)
{
@@ -645,7 +637,7 @@ static void intel_machine_check(struct c
uint64_t gstatus;
mctelem_cookie_t mctc = NULL;
struct mca_summary bs;
- cpu_banks_t clear_bank;
+ cpu_banks_t clear_bank;
mce_spin_lock(&mce_logout_lock);
@@ -677,9 +669,11 @@ static void intel_machine_check(struct c
}
atomic_set(&found_error, 1);
- printk(KERN_DEBUG "MCE: clear_bank map %lx\n",
- *((unsigned long*)clear_bank));
+ printk(KERN_DEBUG "MCE: clear_bank map %lx on CPU%d\n",
+ *((unsigned long*)clear_bank), smp_processor_id());
mcheck_mca_clearbanks(clear_bank);
+ /* Print MCE error */
+ x86_mcinfo_dump(mctelem_dataptr(mctc));
} else {
if (mctc != NULL)
@@ -692,29 +686,26 @@ static void intel_machine_check(struct c
*/
mce_barrier_enter(&mce_trap_bar);
/* According to latest MCA OS writer guide, if no error bank found
- * on all cpus, something unexpected happening, we can't do any
+ * on all cpus, something unexpected happening, we can't do any
* recovery job but to reset the system.
*/
if (atomic_read(&found_error) == 0)
mc_panic("Unexpected condition for the MCE handler, need reset\n");
- if (mce_barrier_last(&mce_trap_bar)) {
- printk(KERN_DEBUG "Choose one CPU to clear error finding flag\n ");
+ mce_barrier_exit(&mce_trap_bar);
+
+ /* Clear error finding flags after all cpus finishes above judgement */
+ mce_barrier_enter(&mce_trap_bar);
+ if (atomic_read(&found_error)) {
+ printk(KERN_DEBUG "MCE: Choose one CPU "
+ "to clear error finding flag\n ");
atomic_set(&found_error, 0);
}
+ mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+ if ((gstatus & MCG_STATUS_MCIP) != 0) {
+ printk(KERN_DEBUG "MCE: Clear MCIP@ last step");
+ mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
+ }
mce_barrier_exit(&mce_trap_bar);
-
- /*
- * Clear MCIP if it wasn't already. There is a small
- * chance that more than 1 CPU will end up doing this,
- * but that's OK.
- */
- if (bs.errcnt) {
- mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
- if ((gstatus & MCG_STATUS_MCIP) != 0)
- mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
- /* Print MCE error */
- x86_mcinfo_dump(mctelem_dataptr(mctc));
- }
raise_softirq(MACHINE_CHECK_SOFTIRQ);
}
_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog
|