WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] [xen-unstable] x86 mce: Clean-up the mc_panic handler.

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] x86 mce: Clean-up the mc_panic handler.
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Thu, 10 Jun 2010 02:25:18 -0700
Delivery-date: Thu, 10 Jun 2010 02:26:34 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1276154351 -3600
# Node ID 8a2486514f1adda5b35acc39ccc71c7aa0348420
# Parent  704bcd622dc2ee4acb799d1bbd08ca1b28af0552
x86 mce: Clean-up the mc_panic handler.

Firstly, mc_panic should only mc_panic in one CPU to avoid printk
output be mixed.
Secondly, call urgent handler in MCE panic to get error code specific
hander be involved.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>
---
 xen/arch/x86/cpu/mcheck/mce_intel.c |   94 +++++++++++++++++++++++++++++-------
 1 files changed, 76 insertions(+), 18 deletions(-)

diff -r 704bcd622dc2 -r 8a2486514f1a xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Thu Jun 10 08:18:46 2010 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Thu Jun 10 08:19:11 2010 +0100
@@ -150,6 +150,7 @@ static DEFINE_SPINLOCK(mce_logout_lock);
 
 static atomic_t severity_cpu = ATOMIC_INIT(-1);
 static atomic_t found_error = ATOMIC_INIT(0);
+static cpumask_t mce_fatal_cpus;
 
 static void mce_barrier_enter(struct mce_softirq_barrier *);
 static void mce_barrier_exit(struct mce_softirq_barrier *);
@@ -320,6 +321,27 @@ static void mce_softirq(void)
     }
 
     mce_barrier_exit(&mce_inside_bar);
+}
+
+/*
+ * Return:
+ * -1: if system can't be recoved
+ * 0: Continoue to next step
+ */
+static int mce_urgent_action(struct cpu_user_regs *regs,
+                              mctelem_cookie_t mctc)
+{
+    uint64_t gstatus;
+
+    if ( mctc == NULL)
+        return 0;
+
+    mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+    /* Xen is not pre-emptible */
+    if ( !(gstatus & MCG_STATUS_RIPV) && !guest_mode(regs))
+        return 0;
+
+    return mce_action(regs, mctc) == MCER_RESET ? -1 : 0;
 }
 
 /* Machine Check owner judge algorithm:
@@ -693,6 +715,31 @@ struct mca_error_handler intel_mce_dhand
 struct mca_error_handler intel_mce_dhandlers[] =
             {{is_async_memerr, intel_memerr_dhandler}, {default_check, 
intel_default_dhandler}};
 
+static void intel_default_uhandler(int bnum,
+             struct mca_binfo *binfo,
+             struct mca_handle_result *result)
+{
+    uint64_t status = binfo->mib->mc_status;
+    enum intel_mce_type type;
+
+    type = intel_check_mce_type(status);
+
+    switch (type)
+    {
+    /* Panic if no handler for SRAR error */
+    case intel_mce_ucr_srar:
+    case intel_mce_fatal:
+        result->result = MCA_RESET;
+        break;
+    default:
+        result->result = MCA_NO_ACTION;
+        break;
+    }
+}
+
+struct mca_error_handler intel_mce_uhandlers[] =
+            {{default_check, intel_default_uhandler}};
+
 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
 {
     uint64_t gstatus;
@@ -724,17 +771,16 @@ static void intel_machine_check(struct c
              * clearing  the banks, and deal with the telemetry after reboot
              * (the MSRs are sticky)
              */
-            if (bs.pcc)
-                mc_panic("State lost due to machine check exception.\n");
-            if (!bs.ripv)
-                mc_panic("RIPV =0 can't resume execution!\n");
-            if (!bs.recoverable)
-                mc_panic("Machine check exception software recovery fail.\n");
+            if (bs.pcc || !bs.recoverable)
+                cpu_set(smp_processor_id(), mce_fatal_cpus);
         } else {
             if (mctc != NULL)
                 mctelem_commit(mctc);
         }
         atomic_set(&found_error, 1);
+
+        /* The last CPU will be take check/clean-up etc */
+        atomic_set(&severity_cpu, smp_processor_id());
 
         mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
                 *((unsigned long*)clear_bank), smp_processor_id());
@@ -745,25 +791,35 @@ static void intel_machine_check(struct c
     }
     mce_spin_unlock(&mce_logout_lock);
 
+    mce_barrier_enter(&mce_trap_bar);
+    if ( mctc != NULL && mce_urgent_action(regs, mctc))
+        cpu_set(smp_processor_id(), mce_fatal_cpus);
+    mce_barrier_exit(&mce_trap_bar);
     /*
      * Wait until everybody has processed the trap.
      */
     mce_barrier_enter(&mce_trap_bar);
-    /* According to latest MCA OS writer guide, if no error bank found
-     * on all cpus, something unexpected happening, we can't do any
-     * recovery job but to reset the system.
-     */
-    if (atomic_read(&found_error) == 0)
-        mc_panic("Unexpected condition for the MCE handler, need reset\n");
+    if (atomic_read(&severity_cpu) == smp_processor_id())
+    {
+        /* According to SDM, if no error bank found on any cpus,
+         * something unexpected happening, we can't do any
+         * recovery job but to reset the system.
+         */
+        if (atomic_read(&found_error) == 0)
+            mc_panic("MCE: No CPU found valid MCE, need reset\n");
+        if (!cpus_empty(mce_fatal_cpus))
+        {
+            char *ebufp, ebuf[96] = "MCE: Fatal error happened on CPUs ";
+            ebufp = ebuf + strlen(ebuf);
+            cpumask_scnprintf(ebufp, 95 - strlen(ebuf), mce_fatal_cpus);
+            mc_panic(ebuf);
+        }
+        atomic_set(&found_error, 0);
+    }
     mce_barrier_exit(&mce_trap_bar);
 
-    /* Clear error finding flags after all cpus finishes above judgement */
+    /* Clear flags after above fatal check */
     mce_barrier_enter(&mce_trap_bar);
-    if (atomic_read(&found_error)) {
-        mce_printk(MCE_CRITICAL, "MCE: Choose one CPU "
-                   "to clear error finding flag\n ");
-        atomic_set(&found_error, 0);
-    }
     mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
     if ((gstatus & MCG_STATUS_MCIP) != 0) {
         mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
@@ -1158,6 +1214,8 @@ static void intel_init_mce(void)
 
     mce_dhandlers = intel_mce_dhandlers;
     mce_dhandler_num = sizeof(intel_mce_dhandlers)/sizeof(struct 
mca_error_handler);
+    mce_uhandlers = intel_mce_uhandlers;
+    mce_uhandler_num = sizeof(intel_mce_uhandlers)/sizeof(struct 
mca_error_handler);
 }
 
 static int intel_init_mca_banks(void)

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] [xen-unstable] x86 mce: Clean-up the mc_panic handler., Xen patchbot-unstable <=