WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] [xen-unstable] x86 mca: Support MCA recovery actions for

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] x86 mca: Support MCA recovery actions for latest Intel platforms
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Tue, 16 Jun 2009 06:05:14 -0700
Delivery-date: Tue, 16 Jun 2009 06:06:13 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1245146336 -3600
# Node ID c23aeb37b17fca00ba8aa39539266efa1b502024
# Parent  376c0749586e7004e8091500a320bd1e96c79c7f
x86 mca: Support MCA recovery actions for latest Intel platforms

When a UC = 1, PCC = 0 MCE happens, there are more types of software-recoverable
errors defined on the latest Intel platforms. To identify these new errors,
new bits (the S and AR bits) are defined in the MCi_STATUS
register. The MCACOD field is also needed to judge the detailed
error. Combined with the OVER bit, different recovery policies are
required for containing these new errors.

An SRAO error is a software-recoverable MCA error for which no recovery
action is required, while an SRAR error is a software-recoverable MCA
error for which a recovery action is required.

Signed-off-by: Liping Ke <liping.ke@xxxxxxxxx>
Signed-off-by: Yunhong Jiang <yunhong.jiang@xxxxxxxxx>
---
 xen/arch/x86/cpu/mcheck/amd_nonfatal.c |    2 
 xen/arch/x86/cpu/mcheck/mce.c          |   59 +++++++-
 xen/arch/x86/cpu/mcheck/mce.h          |   14 +-
 xen/arch/x86/cpu/mcheck/mce_intel.c    |  228 +++++++++++++++++++++++++++++++--
 xen/arch/x86/cpu/mcheck/non-fatal.c    |    2 
 xen/arch/x86/cpu/mcheck/x86_mca.h      |    9 +
 6 files changed, 292 insertions(+), 22 deletions(-)

diff -r 376c0749586e -r c23aeb37b17f xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Tue Jun 16 10:55:36 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Tue Jun 16 10:58:56 2009 +0100
@@ -86,7 +86,7 @@ void mce_amd_checkregs(void *info)
        struct mca_summary bs;
        unsigned int event_enabled;
 
-       mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs);
+       mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs, NULL);
 
        event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
 
diff -r 376c0749586e -r c23aeb37b17f xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c     Tue Jun 16 10:55:36 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.c     Tue Jun 16 10:58:56 2009 +0100
@@ -82,14 +82,41 @@ void x86_mce_callback_register(x86_mce_c
        mc_callback_bank_extended = cbfunc;
 }
 
+/* Machine check recoverable judgement callback handler 
+ * It is used to judge whether an UC error is recoverable by software
+ */
+static mce_recoverable_t mc_recoverable_scan = NULL;
+
+void mce_recoverable_register(mce_recoverable_t cbfunc)
+{
+    mc_recoverable_scan = cbfunc;
+}
+
+/* Judging whether to Clear Machine Check error bank callback handler
+ * According to Intel latest MCA OS Recovery Writer's Guide, 
+ * whether the error MCA bank needs to be cleared is decided by the mca_source
+ * and MCi_status bit value. 
+ */
+static mce_need_clearbank_t mc_need_clearbank_scan = NULL;
+
+void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
+{
+    mc_need_clearbank_scan = cbfunc;
+}
+
 /* Utility function to perform MCA bank telemetry readout and to push that
  * telemetry towards an interested dom0 for logging and diagnosis.
  * The caller - #MC handler or MCA poll function - must arrange that we
  * do not migrate cpus. */
 
 /* XXFM Could add overflow counting? */
+
+/* Add out_param clear_bank for Machine Check Handler Caller.
+ * For Intel latest CPU, whether to clear the error bank status needs to
+ * be judged by the callback function defined above.
+ */
 mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
-    struct mca_summary *sp)
+    struct mca_summary *sp, cpu_banks_t* clear_bank)
 {
        struct vcpu *v = current;
        struct domain *d;
@@ -98,7 +125,7 @@ mctelem_cookie_t mcheck_mca_logout(enum 
        struct mcinfo_common *mic;
        struct mcinfo_global *mig;      /* on stack */
        mctelem_cookie_t mctc = NULL;
-       uint32_t uc = 0, pcc = 0;
+       uint32_t uc = 0, pcc = 0, recover, need_clear = 1 ;
        struct mc_info *mci = NULL;
        mctelem_class_t which = MC_URGENT;      /* XXXgcc */
        unsigned int cpu_nr;
@@ -150,6 +177,11 @@ mctelem_cookie_t mcheck_mca_logout(enum 
            &mcg.mc_coreid, &mcg.mc_core_threadid,
            &mcg.mc_apicid, NULL, NULL, NULL);
 
+       /* If no mc_recovery_scan callback handler registered,
+        * this error is not recoverable
+        */
+       recover = (mc_recoverable_scan)? 1: 0;
+
        for (i = 0; i < 32 && i < nr_mce_banks; i++) {
                struct mcinfo_bank mcb;         /* on stack */
 
@@ -160,6 +192,13 @@ mctelem_cookie_t mcheck_mca_logout(enum 
                mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
                if (!(status & MCi_STATUS_VAL))
                        continue;       /* this bank has no valid telemetry */
+
+               /* For Intel Latest CPU CMCI/MCE Handler caller, we need to
+                * decide whether to clear bank by MCi_STATUS bit value such as
+                * OVER/UC/EN/PCC/S/AR
+                */
+               if ( mc_need_clearbank_scan )
+                       need_clear = mc_need_clearbank_scan(who, status);
 
                /* If this is the first bank with valid MCA DATA, then
                 * try to reserve an entry from the urgent/nonurgent queue
@@ -187,6 +226,11 @@ mctelem_cookie_t mcheck_mca_logout(enum 
                if ((status & MCi_STATUS_PCC) != 0)
                        pcc |= (1 << i);
 
+               if (recover && uc)
+                /* uc = 1, recover = 1, we need not panic.
+                 */
+                       recover = mc_recoverable_scan(status);
+
                addr = misc = 0;
 
                if (status & MCi_STATUS_ADDRV) {
@@ -221,9 +265,13 @@ mctelem_cookie_t mcheck_mca_logout(enum 
                        cbret = mc_callback_bank_extended(mci, i, status);
                }
 
-               if (who != MCA_MCE_SCAN)
+               /* By default, need_clear = 1 */
+               if (who != MCA_MCE_SCAN && need_clear)
                        /* Clear status */
                        mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+               else if ( who == MCA_MCE_SCAN && need_clear)
+                       set_bit(i, clear_bank);
+
                wmb();
        }
 
@@ -245,6 +293,7 @@ mctelem_cookie_t mcheck_mca_logout(enum 
                sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
                sp->uc = uc;
                sp->pcc = pcc;
+               sp->recoverable = recover;
        }
 
        return mci != NULL ? mctc : NULL;       /* may be NULL */
@@ -296,7 +345,7 @@ void mcheck_cmn_handler(struct cpu_user_
         * for logging or dismiss the cookie that is returned, and must not
         * reference the cookie after that action.
         */
-       mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs);
+       mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
        if (mctc != NULL)
                mci = (struct mc_info *)mctelem_dataptr(mctc);
 
@@ -606,7 +655,7 @@ static void __init mcheck_disable(char *
 
 static void __init mcheck_enable(char *str)
 {
-       mce_disabled = -1;
+       mce_disabled = 0;
 }
 
 custom_param("nomce", mcheck_disable);
diff -r 376c0749586e -r c23aeb37b17f xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h     Tue Jun 16 10:55:36 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.h     Tue Jun 16 10:58:56 2009 +0100
@@ -42,6 +42,10 @@ extern void x86_mce_vector_register(x86_
  * via x86_mce_vector_register. */
 extern void mcheck_cmn_handler(struct cpu_user_regs *, long, cpu_banks_t);
 
+/* Register a handler for judging whether mce is recoverable. */
+typedef int (*mce_recoverable_t)(u64 status);
+extern void mce_recoverable_register(mce_recoverable_t);
+
 /* Read an MSR, checking for an interposed value first */
 extern struct intpose_ent *intpose_lookup(unsigned int, uint64_t,
     uint64_t *);
@@ -86,6 +90,8 @@ struct mca_summary {
        int             eipv;   /* meaningful on #MC */
        uint32_t        uc;     /* bitmask of banks with UC */
        uint32_t        pcc;    /* bitmask of banks with PCC */
+       /* bitmask of banks with software error recovery ability*/
+       uint32_t        recoverable; 
 };
 
 extern cpu_banks_t mca_allbanks;
@@ -93,11 +99,12 @@ DECLARE_PER_CPU(cpu_banks_t, poll_bankma
 DECLARE_PER_CPU(cpu_banks_t, poll_bankmask);
 DECLARE_PER_CPU(cpu_banks_t, no_cmci_banks);
 extern int cmci_support;
+extern int ser_support;
 extern int is_mc_panic;
 extern void mcheck_mca_clearbanks(cpu_banks_t);
 
 extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t,
-    struct mca_summary *);
+    struct mca_summary *, cpu_banks_t*);
 
 /* Register a callback to be made during bank telemetry logout.
  * This callback is only available to those machine check handlers
@@ -112,6 +119,11 @@ extern mctelem_cookie_t mcheck_mca_logou
  * the current MCA bank number we are reading telemetry from, and the
  * MCi_STATUS value for that bank.
  */
+
+/* Register a handler for judging whether the bank need to be cleared */
+typedef int (*mce_need_clearbank_t)(enum mca_source who, u64 status);
+extern void mce_need_clearbank_register(mce_need_clearbank_t);
+
 typedef enum mca_extinfo (*x86_mce_callback_t)
     (struct mc_info *, uint16_t, uint64_t);
 extern void x86_mce_callback_register(x86_mce_callback_t);
diff -r 376c0749586e -r c23aeb37b17f xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Jun 16 10:55:36 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Jun 16 10:58:56 2009 +0100
@@ -5,7 +5,9 @@
 #include <xen/kernel.h>
 #include <xen/delay.h>
 #include <xen/smp.h>
+#include <xen/mm.h>
 #include <asm/processor.h> 
+#include <public/sysctl.h>
 #include <asm/system.h>
 #include <asm/msr.h>
 #include "mce.h"
@@ -14,6 +16,7 @@ DEFINE_PER_CPU(cpu_banks_t, mce_banks_ow
 DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
 DEFINE_PER_CPU(cpu_banks_t, no_cmci_banks);
 int cmci_support = 0;
+int ser_support = 0;
 
 static int nr_intel_ext_msrs = 0;
 static int firstbank;
@@ -37,9 +40,11 @@ static DEFINE_SPINLOCK(mce_logout_lock);
 static DEFINE_SPINLOCK(mce_logout_lock);
 
 static atomic_t severity_cpu = ATOMIC_INIT(-1);
+static atomic_t found_error = ATOMIC_INIT(0);
 
 static void mce_barrier_enter(struct mce_softirq_barrier *);
 static void mce_barrier_exit(struct mce_softirq_barrier *);
+static int mce_barrier_last(struct mce_softirq_barrier *);
 
 #ifdef CONFIG_X86_MCE_THERMAL
 static void unexpected_thermal_interrupt(struct cpu_user_regs *regs)
@@ -261,6 +266,44 @@ static int fill_vmsr_data(int cpu, struc
     return 0;
 }
 
+void intel_UCR_handler(struct mcinfo_bank *bank,
+             struct mcinfo_global *global,
+             struct mcinfo_extended *extension,
+             struct mca_handle_result *result)
+{
+    struct domain *d;
+    unsigned long mfn;
+    uint32_t status;
+
+    printk(KERN_DEBUG "MCE: Enter EWB UCR recovery action\n");
+    result->result = MCA_NEED_RESET;
+    if (bank->mc_addr != 0) {
+         mfn = bank->mc_addr >> PAGE_SHIFT;
+         if (!offline_page(mfn, 1, &status)) {
+              if (status & PG_OFFLINE_OFFLINED)
+                  result->result = MCA_RECOVERED;
+              else if (status & PG_OFFLINE_PENDING) {
+                 /* This page has owner */
+                  if (status & PG_OFFLINE_OWNED) {
+                      result->result |= MCA_OWNER;
+                      result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
+                      printk(KERN_DEBUG "MCE: This error page is ownded"
+                                  " by DOM %d\n", result->owner);
+                      if (result->owner != 0 && result->owner != DOMID_XEN) {
+                          d = get_domain_by_id(result->owner);
+                          domain_crash(d);
+                          result->result = MCA_RECOVERED;
+                      }
+                  }
+              }
+         }
+    }
+}
+
+#define INTEL_MAX_RECOVERY 2
+struct mca_error_handler intel_recovery_handler[INTEL_MAX_RECOVERY] =
+            {{0x017A, intel_UCR_handler}, {0x00C0, intel_UCR_handler}};
+
 /*
  * Called from mctelem_process_deferred. Return 1 if the telemetry
  * should be committed for dom0 consumption, 0 if it should be
@@ -269,9 +312,11 @@ static int mce_action(unsigned int cpu, 
 static int mce_action(unsigned int cpu, mctelem_cookie_t mctc)
 {
     struct mc_info *local_mi;
+    uint32_t i;
     struct mcinfo_common *mic = NULL;
     struct mcinfo_global *mc_global;
     struct mcinfo_bank *mc_bank;
+    struct mca_handle_result mca_res;
 
     local_mi = (struct mc_info*)mctelem_dataptr(mctc);
     x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
@@ -294,9 +339,45 @@ static int mce_action(unsigned int cpu, 
         if (fill_vmsr_data(cpu, mc_bank, mc_global->mc_gstatus) == -1)
              break;
 
-       /* TODO: Add recovery actions here, such as page-offline, etc */
-    }
-
+        /* TODO: Add recovery actions here, such as page-offline, etc */
+        memset(&mca_res, 0x0f, sizeof(mca_res));
+        for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
+            if ( (mc_bank->mc_status & 0xffff) == 
+                        intel_recovery_handler[i].mca_code ) {
+                /* For SRAR, OVER = 1 should have caused reset
+                 * For SRAO, OVER = 1 skip recovery action, continue execution
+                 */
+                if (!(mc_bank->mc_status & MCi_STATUS_OVER))
+                    intel_recovery_handler[i].recovery_handler
+                                (mc_bank, mc_global, NULL, &mca_res);
+                else {
+                   if (!mc_global->mc_gstatus & MCG_STATUS_RIPV)
+                       mca_res.result = MCA_NEED_RESET;
+                   else
+                       mca_res.result = MCA_NO_ACTION; 
+                }
+                if (mca_res.result & MCA_OWNER)
+                    mc_bank->mc_domid = mca_res.owner;
+                if (mca_res.result == MCA_NEED_RESET)
+                    /* DOMID_XEN*/
+                    mc_panic("MCE: Software recovery failed for the UCR "
+                                "error\n");
+                else if (mca_res.result == MCA_RECOVERED)
+                    printk(KERN_DEBUG "MCE: The UCR error is succesfully "
+                                "recovered by software!\n");
+                else if (mca_res.result == MCA_NO_ACTION)
+                    printk(KERN_DEBUG "MCE: Overwrite SRAO error can't execute 
"
+                                "recover action, RIPV=1, let it be.\n");
+                break;
+            }
+            /* For SRAR, no defined recovery action should have caused reset
+             * in MCA Handler
+             */
+            if ( i >= INTEL_MAX_RECOVERY )
+                printk(KERN_DEBUG "MCE: No software recovery action found for "
+                                "this SRAO error\n");
+        }
+    }
     return 1;
 }
 
@@ -468,21 +549,35 @@ static void mce_barrier_exit(struct mce_
       }
 }
 
+static int mce_barrier_last(struct mce_softirq_barrier *bar)
+{
+    int gen = atomic_read(&bar->ingen);
+    if ( atomic_read(&bar->ingen) == gen &&
+        atomic_read(&bar->val) == 1 ) {
+        return 1;
+    }
+    return 0;
+}
+
+#if 0
 static void mce_barrier(struct mce_softirq_barrier *bar)
 {
       mce_barrier_enter(bar);
       mce_barrier_exit(bar);
 }
+#endif
 
 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
 {
     uint64_t gstatus;
     mctelem_cookie_t mctc = NULL;
     struct mca_summary bs;
+    cpu_banks_t clear_bank; 
 
     mce_spin_lock(&mce_logout_lock);
 
-    mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs);
+    memset( &clear_bank, 0x0, sizeof(cpu_banks_t));
+    mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs, &clear_bank);
 
     if (bs.errcnt) {
         /*
@@ -493,28 +588,47 @@ static void intel_machine_check(struct c
             if (mctc != NULL)
                 mctelem_defer(mctc);
             /*
-             * For PCC=1, context is lost, so reboot now without clearing
-             * the banks, and deal with the telemetry after reboot
+             * For PCC=1 and can't be recovered, context is lost, so reboot 
now without
+             * clearing  the banks, and deal with the telemetry after reboot
              * (the MSRs are sticky)
              */
             if (bs.pcc)
                 mc_panic("State lost due to machine check exception.\n");
+            if (!bs.ripv)
+                mc_panic("RIPV =0 can't resume execution!\n");
+            if (!bs.recoverable)
+                mc_panic("Machine check exception software recovery fail.\n");
         } else {
             if (mctc != NULL)
                 mctelem_commit(mctc);
         }
-        mcheck_mca_clearbanks(mca_allbanks);
+        atomic_set(&found_error, 1);
+
+        printk(KERN_DEBUG "MCE: clear_bank map %lx\n", 
+                *((unsigned long*)clear_bank));
+        mcheck_mca_clearbanks(clear_bank);
+
     } else {
         if (mctc != NULL)
             mctelem_dismiss(mctc);
     }
-
     mce_spin_unlock(&mce_logout_lock);
 
     /*
      * Wait until everybody has processed the trap.
      */
-    mce_barrier(&mce_trap_bar);
+    mce_barrier_enter(&mce_trap_bar);
+    /* According to latest MCA OS writer guide, if no error bank found
+     * on all cpus, something unexpected happening, we can't do any 
+     * recovery job but to reset the system.
+     */
+    if (atomic_read(&found_error) == 0)
+        mc_panic("Unexpected condition for the MCE handler, need reset\n");
+    if (mce_barrier_last(&mce_trap_bar)) {
+        printk(KERN_DEBUG "Choose one CPU to clear error finding flag\n ");
+        atomic_set(&found_error, 0);
+    }
+    mce_barrier_exit(&mce_trap_bar);
 
     /*
      * Clear MCIP if it wasn't already. There is a small
@@ -532,6 +646,90 @@ static void intel_machine_check(struct c
     raise_softirq(MACHINE_CHECK_SOFTIRQ);
 }
 
+/* According to MCA OS writer guide, CMCI handler need to clear bank when
+ * 1) CE (UC = 0)
+ * 2) ser_support = 1, Superious error, OVER = 0, EN = 0, [UC = 1]
+ * 3) ser_support = 1, UCNA, OVER = 0, S = 1, AR = 0, PCC = 0, [UC = 1, EN = 1]
+ * MCA handler need to clear bank when
+ * 1) ser_support = 1, Superious error, OVER = 0, EN = 0, UC = 1
+ * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
+ * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
+*/
+
+static int intel_need_clearbank_scan(enum mca_source who, u64 status)
+{
+    if ( who == MCA_CMCI_HANDLER) {
+        /* CMCI need clear bank */
+        if ( !(status & MCi_STATUS_UC) )
+            return 1;
+        /* Spurious need clear bank */
+        else if ( ser_support && !(status & MCi_STATUS_OVER)
+                    && !(status & MCi_STATUS_EN) )
+            return 1;
+        /* UCNA OVER = 0 need clear bank */
+        else if ( ser_support && !(status & MCi_STATUS_OVER) 
+                    && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S) 
+                    && !(status & MCi_STATUS_AR))
+            return 1;
+        /* Only Log, no clear */
+        else return 0;
+    }
+    else if ( who == MCA_MCE_SCAN) {
+        /* Spurious need clear bank */
+        if ( ser_support && !(status & MCi_STATUS_OVER)
+                    && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN))
+            return 1;
+        /* SRAR OVER=0 clear bank. OVER = 1 have caused reset */
+        else if ( ser_support && (status & MCi_STATUS_UC)
+                    && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR )
+                    && (status & MCi_STATUS_OVER) )
+            return 1;
+        /* SRAO need clear bank */
+        else if ( ser_support && !(status & MCi_STATUS_AR) 
+                    && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC))
+            return 1; 
+        else
+            return 0;
+    }
+
+    return 1;
+}
+
+/* MCE continues/is recoverable when 
+ * 1) CE UC = 0
+ * 2) Supious ser_support = 1, OVER = 0, En = 0 [UC = 1]
+ * 3) SRAR ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC =1, EN = 1]
+ * 4) SRAO ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
+ * 5) UCNA ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0, [UC = 1]
+ */
+static int intel_recoverable_scan(u64 status)
+{
+
+    if ( !(status & MCi_STATUS_UC ) )
+        return 1;
+    else if ( ser_support && !(status & MCi_STATUS_EN) 
+                && !(status & MCi_STATUS_OVER) )
+        return 1;
+    /* SRAR error */
+    else if ( ser_support && !(status & MCi_STATUS_OVER) 
+                && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
+                && (status & MCi_STATUS_AR) ) {
+        printk(KERN_DEBUG "MCE: No SRAR error defined currently.\n");
+        return 0;
+    }
+    /* SRAO error */
+    else if (ser_support && !(status & MCi_STATUS_PCC)
+                && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
+                && (status & MCi_STATUS_EN))
+        return 1;
+    /* UCNA error */
+    else if (ser_support && !(status & MCi_STATUS_OVER)
+                && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
+                && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR))
+        return 1;
+    return 0;
+}
+
 static DEFINE_SPINLOCK(cmci_discover_lock);
 
 /*
@@ -586,7 +784,7 @@ static void cmci_discover(void)
      */
 
     mctc = mcheck_mca_logout(
-        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
 
     if (bs.errcnt && mctc != NULL) {
         if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
@@ -700,7 +898,7 @@ fastcall void smp_cmci_interrupt(struct 
     irq_enter();
 
     mctc = mcheck_mca_logout(
-        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs);
+        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
 
     if (bs.errcnt && mctc != NULL) {
         if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
@@ -738,6 +936,10 @@ static void mce_cap_init(struct cpuinfo_
     if ((l & MCG_CMCI_P) && cpu_has_apic)
         cmci_support = 1;
 
+    /* Support Software Error Recovery */
+    if (l & MCG_SER_P)
+        ser_support = 1;
+
     nr_mce_banks = l & MCG_CAP_COUNT;
     if (nr_mce_banks > MAX_NR_BANKS)
     {
@@ -770,7 +972,7 @@ static void mce_init(void)
     /* log the machine checks left over from the previous reset.
      * This also clears all registers*/
 
-    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs);
+    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);
 
     /* in the boot up stage, don't inject to DOM0, but print out */
     if (bs.errcnt && mctc != NULL) {
@@ -810,6 +1012,8 @@ int intel_mcheck_init(struct cpuinfo_x86
     /* machine check is available */
     x86_mce_vector_register(intel_machine_check);
     x86_mce_callback_register(intel_get_extended_msrs);
+    mce_recoverable_register(intel_recoverable_scan);
+    mce_need_clearbank_register(intel_need_clearbank_scan);
 
     mce_init();
     mce_intel_feature_init(c);
diff -r 376c0749586e -r c23aeb37b17f xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c       Tue Jun 16 10:55:36 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c       Tue Jun 16 10:58:56 2009 +0100
@@ -39,7 +39,7 @@ static void mce_checkregs (void *info)
        struct mca_summary bs;
        static uint64_t dumpcount = 0;
 
-       mctc = mcheck_mca_logout(MCA_POLLER, __get_cpu_var(poll_bankmask), &bs);
+       mctc = mcheck_mca_logout(MCA_POLLER, __get_cpu_var(poll_bankmask), &bs, 
NULL);
 
        if (bs.errcnt && mctc != NULL) {
                adjust++;
diff -r 376c0749586e -r c23aeb37b17f xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Jun 16 10:55:36 2009 +0100
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Jun 16 10:58:56 2009 +0100
@@ -30,6 +30,7 @@
 
 
 /* Bitfield of the MSR_IA32_MCG_CAP register */
+#define MCG_SER_P               (1UL<<24)
 #define MCG_CAP_COUNT           0x00000000000000ffULL
 #define MCG_CTL_P               0x0000000000000100ULL
 #define MCG_EXT_P              (1UL<<9)
@@ -50,6 +51,10 @@
 #define MCi_STATUS_MSEC         0x00000000ffff0000ULL
 /* Other information */
 #define MCi_STATUS_OTHER        0x01ffffff00000000ULL
+/* Action Required flag */
+#define MCi_STATUS_AR           0x0080000000000000ULL
+/* Signaling flag */
+#define MCi_STATUS_S            0x0100000000000000ULL
 /* processor context corrupt */
 #define MCi_STATUS_PCC          0x0200000000000000ULL
 /* MSR_K8_MCi_ADDR register valid */
@@ -105,8 +110,8 @@ DECLARE_PER_CPU(cpu_banks_t, mce_banks_o
 #define MCA_OWNER (0x1 < 1)
 /* MCA error can't be recovered and need reset */
 #define MCA_NEED_RESET (0x1 < 2)
-/* MCA error need further actions in softIRQ context for recovery */
-#define MCA_MORE_ACTION (0x1 < 3)
+/* MCA error did not have any action yet */
+#define MCA_NO_ACTION (0x1 < 3)
 
 struct mca_handle_result
 {

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] [xen-unstable] x86 mca: Support MCA recovery actions for latest Intel platforms, Xen patchbot-unstable <=