[Xen-devel] [RFC] [PATCH 2/2] Clean-up the MCA handling code

To: "Frank.Vanderlinden@xxxxxxx" <Frank.Vanderlinden@xxxxxxx>, Christoph Egger <Christoph.Egger@xxxxxxx>, Keir Fraser <keir.fraser@xxxxxxxxxxxxx>
Subject: [Xen-devel] [RFC] [PATCH 2/2] Clean-up the MCA handling code
From: "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx>
Date: Mon, 19 Apr 2010 16:59:46 +0800
Accept-language: en-US
Cc: "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Delivery-date: Mon, 19 Apr 2010 02:06:43 -0700
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: AcrfnqkOcHUD+NjNQUa9FfTX+aQ/LA==
Thread-topic: [RFC] [PATCH 2/2] Clean-up the MCA handling code
 arch/x86/cpu/mcheck/amd_f10.c      |   15
 arch/x86/cpu/mcheck/amd_k8.c       |   20
 arch/x86/cpu/mcheck/amd_nonfatal.c |   21
 arch/x86/cpu/mcheck/mce.c          |  246 ++++-------
 arch/x86/cpu/mcheck/mce.h          |  125 ++++-
 arch/x86/cpu/mcheck/mce_intel.c    |  818 ++++++++++++++++++++++++-------------
 arch/x86/cpu/mcheck/non-fatal.c    |   19
 arch/x86/cpu/mcheck/x86_mca.h      |   40 -
 include/public/arch-x86/xen-mca.h  |   16
 include/xlat.lst                   |    1
 10 files changed, 830 insertions(+), 491 deletions(-)

Clean-up the MCA handling code

The key idea of this patch is to introduce an mca_ops structure shared by the
MCE/polling/CMCI handlers.

a) Rename mcheck_mca_logout to mcheck_mca_handler, as it is now the main entry
   point for all MCA handling, not simply MCA information logout.
b) mcheck_mca_handler checks the error information bank by bank.
   For each bank, it first checks whether the bank needs handling; if yes, it
   fetches the extended bank information and calls the corresponding bank
   handler. The global information is checked once the first bank that needs
   handling is found.
c) Define an mca_binfo structure to pass information through the handling flow.
d) Define the handler result as MCA_NO_ERROR/MCA_CORRECTED/MCA_NEXT_STEP etc.,
   to abstract what the handler found and what should happen next.
e) Add some default handlers for the AMD side.
f) Remove several domain_id/vcpu_id logging steps from mcheck_mca_handler;
   instead, keep them in the bank_handler or global_handler if needed in the
   future.
g) It also includes some Intel-specific changes, mainly intel_mce_uhandlers
   and mce_delay_handlers.
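
To make the intended flow concrete, here is a simplified sketch (not part of
the patch) of how the per-source mca_ops hooks drive the common handler. It is
distilled from mcheck_mca_handler() below; the telemetry reservation, mcinfo
logging and MSR reads are omitted, and the helper name dispatch_banks is made
up for illustration:

/* Sketch only: each MCA source (MCE trap, poller, CMCI, reset scan)
 * supplies one mca_ops instance; the common code just walks the banks
 * and invokes the hooks. */
static int dispatch_banks(struct mca_ops *ops, struct cpu_user_regs *regs,
                          long error_code, struct mca_binfo *bi)
{
    int i, ret, result = MCA_NO_ERROR;

    for (i = 0; i < nr_mce_banks; i++) {
        /* Hook 1: does this source care about this bank at all? */
        if (!ops->need_handle(i, bi))
            continue;

        /* Hook 2 (optional): collect extended, non-architectural bank
         * information, e.g. amd_f10_get_bext for the northbridge bank. */
        if (ops->get_bext)
            bi->mibext = ops->get_bext(i, bi);

        /* Hook 3: handle the bank.  The return value is one of the
         * severities MCA_NO_ERROR..MCA_FATAL; the overall result is
         * the worst severity seen across all banks. */
        if (ops->bank_handler) {
            ret = ops->bank_handler(i, regs, error_code, bi, NULL);
            if (ret > result)
                result = ret;
        }
    }
    return result;
}

In the real function, ops->get_gext and ops->global_handler additionally run
once, when the first bank that needs handling is found, and ops->queue_type /
ops->flags select the telemetry queue and the MC_FLAG_* value logged in
mcinfo_global.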

Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>

diff -r da1165144ac4 -r 5223a8944e2d xen/arch/x86/cpu/mcheck/amd_f10.c
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c Mon Apr 19 15:31:29 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c Mon Apr 19 16:41:28 2010 +0800
@@ -49,15 +49,18 @@
 #include "x86_mca.h"


-static struct mcinfo_extended *
-amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
+struct mcinfo_extended *
+amd_f10_get_bext(int bank, struct mca_binfo *bi)
 {
        struct mcinfo_extended *mc_ext;
+    uint64_t status;

        /* Family 0x10 introduced additional MSR that belong to the
         * northbridge bank (4). */
-       if (mi == NULL || bank != 4)
+       if (!bi || !bi->mi || bank != 4)
                return NULL;
+
+    status = mca_binfo_status(bank, bi);

        if (!(status & MCi_STATUS_VAL))
                return NULL;
@@ -65,11 +68,11 @@ amd_f10_handler(struct mc_info *mi, uint
        if (!(status & MCi_STATUS_MISCV))
                return NULL;

-    mc_ext = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_extended) +
+    mc_ext = x86_mcinfo_reserve(bi->mi, sizeof(struct mcinfo_extended) +
             3 * sizeof(struct mcinfo_msr));
     if (!mc_ext)
     {
-        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
+        bi->mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
         return NULL;
     }

@@ -95,7 +98,7 @@ enum mcheck_type amd_f10_mcheck_init(str
        if (amd_k8_mcheck_init(c) == mcheck_none)
                return mcheck_none;

-       x86_mce_callback_register(amd_f10_handler);
+    amd_mce_ops->get_bext = amd_f10_get_bext;

        return mcheck_amd_famXX;
 }
diff -r da1165144ac4 -r 5223a8944e2d xen/arch/x86/cpu/mcheck/amd_k8.c
--- a/xen/arch/x86/cpu/mcheck/amd_k8.c  Mon Apr 19 15:31:29 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c  Mon Apr 19 16:41:28 2010 +0800
@@ -69,10 +69,24 @@
 #include "mce.h"
 #include "mce_quirks.h"

+struct mca_ops *amd_mce_ops;
 /* Machine Check Handler for AMD K8 family series */
 static void k8_machine_check(struct cpu_user_regs *regs, long error_code)
 {
-       mcheck_cmn_handler(regs, error_code, mca_allbanks);
+       mcheck_cmn_handler(regs, error_code, amd_mce_ops);
+}
+
+static int amd_k8_init_ops(struct mca_ops *ops)
+{
+    if (!ops)
+        return -1;
+
+    memset(ops, 0, sizeof(*ops));
+    ops->queue_type = MC_URGENT;
+    ops->flags = MC_FLAG_MCE;
+    ops->need_handle = default_valid_bank;
+    ops->global_handler = mce_global_handler;
+    ops->bank_handler = default_mca_bhandler;
+    return 0;
 }

 /* AMD K8 machine check */
@@ -85,10 +99,14 @@ enum mcheck_type amd_k8_mcheck_init(stru
        if (!cpu_has(c, X86_FEATURE_MCA))
                return mcheck_none;

+    if (!(amd_mce_ops = xmalloc(struct mca_ops)))
+        return mcheck_none;
+
        quirkflag = mcequirk_lookup_amd_quirkdata(c);

        mce_cap_init();
        x86_mce_vector_register(k8_machine_check);
+    amd_k8_init_ops(amd_mce_ops);

        for (i = 0; i < nr_mce_banks; i++) {
                if (quirkflag == MCEQUIRK_K8_GART && i == 4) {
diff -r da1165144ac4 -r 5223a8944e2d xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Mon Apr 19 15:31:29 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Mon Apr 19 16:41:28 2010 +0800
@@ -75,6 +75,7 @@ static int hw_threshold = 0;
 static int hw_threshold = 0;
 static int adjust = 0;
 static int variable_period = 1;
+static struct mca_ops *amd_nonfatal_ops;

 /* The polling service routine:
  * Collects information of correctable errors and notifies
@@ -85,7 +86,7 @@ static void mce_amd_checkregs(void *info
        mctelem_cookie_t mctc;
        struct mca_summary bs;

-       mctc = mcheck_mca_logout(MCA_POLLER, mca_allbanks, &bs, NULL);
+       mctc = mcheck_mca_handler(NULL, 0, amd_nonfatal_ops, &bs);

        if (bs.errcnt && mctc != NULL) {
                static uint64_t dumpcount = 0;
@@ -198,11 +199,29 @@ static void mce_amd_work_fn(void *data)
        adjust = 0;
 }

+static int amd_init_nonfatal_ops(void)
+{
+    amd_nonfatal_ops = (struct mca_ops *)xmalloc(struct mca_ops);
+    if (!amd_nonfatal_ops)
+        return -ENOMEM;
+
+    memset(amd_nonfatal_ops, 0, sizeof(struct mca_ops));
+    amd_nonfatal_ops->queue_type = MC_NONURGENT;
+    amd_nonfatal_ops->flags = MC_FLAG_POLLED;
+    amd_nonfatal_ops->need_handle = default_valid_bank;
+    amd_nonfatal_ops->bank_handler = default_mca_bhandler;
+    if ( boot_cpu_data.x86 == 0x10 )
+        amd_nonfatal_ops->get_bext = amd_f10_get_bext;
+    return 0;
+}
+
 void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
 {
        if (c->x86_vendor != X86_VENDOR_AMD)
                return;

+    if (amd_init_nonfatal_ops())
+        return;
        /* Assume we are on K8 or newer AMD CPU here */

        /* The threshold bitfields in MSR_IA32_MC4_MISC has
diff -r da1165144ac4 -r 5223a8944e2d xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c     Mon Apr 19 15:31:29 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce.c     Mon Apr 19 16:41:28 2010 +0800
@@ -91,42 +91,7 @@ void machine_check_vector(struct cpu_use
        _machine_check_vector(regs, error_code);
 }

-/* Init machine check callback handler
- * It is used to collect additional information provided by newer
- * CPU families/models without the need to duplicate the whole handler.
- * This avoids having many handlers doing almost nearly the same and each
- * with its own tweaks ands bugs. */
-static x86_mce_callback_t mc_callback_bank_extended = NULL;
-
-void x86_mce_callback_register(x86_mce_callback_t cbfunc)
-{
-       mc_callback_bank_extended = cbfunc;
-}
-
-/* Machine check recoverable judgement callback handler
- * It is used to judge whether an UC error is recoverable by software
- */
-static mce_recoverable_t mc_recoverable_scan = NULL;
-
-void mce_recoverable_register(mce_recoverable_t cbfunc)
-{
-    mc_recoverable_scan = cbfunc;
-}
-
-/* Judging whether to Clear Machine Check error bank callback handler
- * According to Intel latest MCA OS Recovery Writer's Guide,
- * whether the error MCA bank needs to be cleared is decided by the mca_source
- * and MCi_status bit value.
- */
-static mce_need_clearbank_t mc_need_clearbank_scan = NULL;
-
-void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
-{
-    mc_need_clearbank_scan = cbfunc;
-}
-
-static struct mcinfo_bank *mca_init_bank(enum mca_source who,
-                                         struct mc_info *mi, int bank)
+static struct mcinfo_bank *mca_init_bank(struct mc_info *mi, int bank)
 {
        struct mcinfo_bank *mib;
        uint64_t addr=0, misc = 0;
@@ -153,23 +118,7 @@ static struct mcinfo_bank *mca_init_bank
                mca_rdmsrl(MSR_IA32_MC0_MISC + 4 * bank, mib->mc_misc);

        if (mib->mc_status & MCi_STATUS_ADDRV)
-       {
                mca_rdmsrl(MSR_IA32_MC0_ADDR + 4 * bank, mib->mc_addr);
-
-               if (mfn_valid(paddr_to_pfn(mib->mc_addr))) {
-                       struct domain *d;
-
-                       d = maddr_get_owner(mib->mc_addr);
-                       if (d != NULL && (who == MCA_POLLER ||
-                                 who == MCA_CMCI_HANDLER))
-                               mib->mc_domid = d->domain_id;
-               }
-       }
-
-       if (who == MCA_CMCI_HANDLER) {
-               mca_rdmsrl(MSR_IA32_MC0_CTL2 + bank, mib->mc_ctrl2);
-               rdtscll(mib->mc_tsc);
-       }

        return mib;
 }
@@ -218,113 +167,80 @@ static int mca_init_global(uint32_t flag
  * For Intel latest CPU, whether to clear the error bank status needs to
  * be judged by the callback function defined above.
  */
-mctelem_cookie_t mcheck_mca_logout(enum mca_source who, cpu_banks_t bankmask,
-    struct mca_summary *sp, cpu_banks_t* clear_bank)
-{
-       uint64_t gstatus, status;
-       struct mcinfo_global *mig = NULL;       /* on stack */
+mctelem_cookie_t mcheck_mca_handler(struct cpu_user_regs *regs,
+                                       unsigned long error_code,
+                                       struct mca_ops *ops,
+                                       struct mca_summary *sp)
+{
+       struct mca_binfo bi;
+       struct mcinfo_global *mig = NULL;
        mctelem_cookie_t mctc = NULL;
-       uint32_t uc = 0, pcc = 0, recover, need_clear = 1, mc_flags = 0;
+       uint32_t uc = 0, pcc = 0;
        struct mc_info *mci = NULL;
        mctelem_class_t which = MC_URGENT;      /* XXXgcc */
        int errcnt = 0;
-       int i;
-
-       mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
-       switch (who) {
-       case MCA_MCE_HANDLER:
-       case MCA_MCE_SCAN:
-               mc_flags = MC_FLAG_MCE;
-               which = MC_URGENT;
-               break;
-
-       case MCA_POLLER:
-       case MCA_RESET:
-               mc_flags = MC_FLAG_POLLED;
-               which = MC_NONURGENT;
-               break;
-
-       case MCA_CMCI_HANDLER:
-               mc_flags = MC_FLAG_CMCI;
-               which = MC_NONURGENT;
-               break;
-
-       default:
-               BUG();
-       }
-
-       /* If no mc_recovery_scan callback handler registered,
-        * this error is not recoverable
-        */
-       recover = (mc_recoverable_scan)? 1: 0;
+       int i, ret = 0, recover = 0, result = MCA_NO_ERROR;
+
+       which = ops->queue_type;
+       memset(&bi, 0, sizeof(bi));

        for (i = 0; i < 32 && i < nr_mce_banks; i++) {
-               struct mcinfo_bank *mib;                /* on stack */
-
-               /* Skip bank if corresponding bit in bankmask is clear */
-               if (!test_bit(i, bankmask))
-                       continue;
+               uint64_t status;
+
+               bi.mib = NULL;
+               bi.mibext = NULL;

                mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
                if (!(status & MCi_STATUS_VAL))
                        continue;       /* this bank has no valid telemetry */
-
-               /* For Intel Latest CPU CMCI/MCE Handler caller, we need to
-                * decide whether to clear bank by MCi_STATUS bit value such as
-                * OVER/UC/EN/PCC/S/AR
-                */
-               if ( mc_need_clearbank_scan )
-                       need_clear = mc_need_clearbank_scan(who, status);
+               bi.status = status;
+
+               if ( !ops->need_handle(i, &bi) )
+                   continue;
+
+               if (status & MCi_STATUS_PCC)
+                   pcc |= 1UL << i;
+               if (status & MCi_STATUS_UC)
+                   uc |= 1UL << i;

                /* If this is the first bank with valid MCA DATA, then
                 * try to reserve an entry from the urgent/nonurgent queue
                 * depending on whethere we are called from an exception or
                 * a poller;  this can fail (for example dom0 may not
                 * yet have consumed past telemetry). */
-               if (errcnt == 0) {
+               if (errcnt++ == 0) {
                        if ( (mctc = mctelem_reserve(which)) != NULL ) {
                                mci = mctelem_dataptr(mctc);
                                mcinfo_clear(mci);
                                mig = (struct mcinfo_global*)x86_mcinfo_reserve
                                  (mci, sizeof(struct mcinfo_global));
                                /* mc_info should at least hold up the global information */
+                               bi.mig = mig;
                                ASSERT(mig);
-                               mca_init_global(mc_flags, mig);
-                /* A hook here to get global extended msrs */
-                {
-                    struct mcinfo_extended *intel_get_extended_msrs(
-                              struct mcinfo_global *mig, struct mc_info *mi);
-                    struct cpuinfo_x86 *c;
-                    c = &cpu_data[smp_processor_id()];
-                    intel_get_extended_msrs(mig, mci);
-                }
+                               mca_init_global(ops->flags, mig);
+                               if (ops->get_gext)
+                                       bi.migext = ops->get_gext(&bi);
+
+                               if (ops->global_handler)
+                                       result = ops->global_handler(regs,
+                                                       error_code, &bi, sp);
                        }
                }

-               /* form a mask of which banks have logged uncorrected errors */
-               if ((status & MCi_STATUS_UC) != 0)
-                       uc |= (1 << i);
-
-               /* likewise for those with processor context corrupt */
-               if ((status & MCi_STATUS_PCC) != 0)
-                       pcc |= (1 << i);
-
-               if (recover && uc)
-                /* uc = 1, recover = 1, we need not panic.
-                 */
-                       recover = mc_recoverable_scan(status);
-
-               mib = mca_init_bank(who, mci, i);
-
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mci, i, status);
-
-               /* By default, need_clear = 1 */
-               if (who != MCA_MCE_SCAN && need_clear)
-                       /* Clear status */
-                       mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
-               else if ( who == MCA_MCE_SCAN && need_clear)
-                       set_bit(i, clear_bank);
+               bi.mib = mca_init_bank(mci, i);
+
+               /* Get extended bank information */
+               if (ops->get_bext)
+                       bi.mibext = ops->get_bext(i, &bi);
+
+               if (ops->bank_handler)
+               {
+                       ret = ops->bank_handler(i, regs, error_code, &bi, sp);
+                       if (ret > MCA_CORRECTED && ret < MCA_FATAL)
+                               recover++;
+                       if (ret > result)
+                               result = ret;
+               }

                wmb();
        }
@@ -340,12 +256,16 @@ mctelem_cookie_t mcheck_mca_logout(enum


        if (sp) {
+        uint64_t gstatus;
+
                sp->errcnt = errcnt;
+        mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
                sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
                sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
                sp->uc = uc;
                sp->pcc = pcc;
                sp->recoverable = recover;
+        sp->severity = result;
        }

        return mci != NULL ? mctc : NULL;       /* may be NULL */
@@ -356,9 +276,61 @@ mctelem_cookie_t mcheck_mca_logout(enum
 #define DOMU_TRAP      2
 #define DOMU_KILLED    4

+/* These are currently mainly for AMD, can be deleted later */
+int mce_global_handler(struct cpu_user_regs *regs,
+                        long error_code,
+                        struct mca_binfo *bi,
+                        struct mca_summary *result)
+{
+    uint64_t gstatus;
+
+    mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+
+    /* What on earth is the relationship between EIPV and RIPV? */
+    if ( ( !(gstatus & MCG_STATUS_RIPV) || gstatus & MCG_STATUS_EIPV ) &&
+            !guest_mode(regs))
+        /* Maybe we should not crash if Xen is in vCPU context? */
+        return MCA_FATAL;
+
+    return MCA_NO_ERROR;
+}
+
+/* By default, always log out all banks */
+int default_valid_bank(int bank, struct mca_binfo *bi)
+{
+    return 1;
+}
+
+static int default_mce_type(uint64_t status)
+{
+    if (!(status & MCi_STATUS_VAL))
+        return MCA_NO_ERROR;
+
+    if (status & MCi_STATUS_PCC)
+        return MCA_FATAL;
+
+    /* Corrected error? */
+    if (!(status & MCi_STATUS_UC))
+        return MCA_CORRECTED;
+
+    return MCA_FATAL;
+}
+
+int default_mca_bhandler(int bank,
+                         struct cpu_user_regs *regs,
+                         long error_code,
+                         struct mca_binfo *bi,
+                         struct mca_summary *result)
+{
+    uint64_t status;
+
+    status = mca_binfo_status(bank, bi);
+    mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * bank, 0x0ULL);
+    return default_mce_type(status);
+}
+
 /* Shared #MC handler. */
-void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
-    cpu_banks_t bankmask)
+void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
+    struct mca_ops *ops)
 {
        int xen_state_lost, dom0_state_lost, domU_state_lost;
        struct vcpu *v = current;
@@ -397,7 +369,7 @@ void mcheck_cmn_handler(struct cpu_user_
         * for logging or dismiss the cookie that is returned, and must not
         * reference the cookie after that action.
         */
-       mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
+       mctc = mcheck_mca_handler(regs, error_code, ops, &bs);
        if (mctc != NULL)
                mci = (struct mc_info *)mctelem_dataptr(mctc);

@@ -1247,9 +1219,11 @@ CHECK_mcinfo_extended;
 # define xen_mcinfo_recovery         mcinfo_recovery
 # define xen_cpu_offline_action      cpu_offline_action
 # define xen_page_offline_action     page_offline_action
+# define xen_vmce_inject_action     vmce_inject_action
 CHECK_mcinfo_recovery;
 # undef xen_cpu_offline_action
 # undef xen_page_offline_action
+# undef xen_vmce_inject_action
 # undef xen_mcinfo_recovery
 #else
 # define compat_mc_fetch xen_mc_fetch
diff -r da1165144ac4 -r 5223a8944e2d xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h     Mon Apr 19 15:31:29 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce.h     Mon Apr 19 16:41:28 2010 +0800
@@ -70,14 +70,6 @@ typedef void (*x86_mce_vector_t)(struct
 typedef void (*x86_mce_vector_t)(struct cpu_user_regs *, long);
 extern void x86_mce_vector_register(x86_mce_vector_t);

-/* Common generic MCE handler that implementations may nominate
- * via x86_mce_vector_register. */
-extern void mcheck_cmn_handler(struct cpu_user_regs *, long, cpu_banks_t);
-
-/* Register a handler for judging whether mce is recoverable. */
-typedef int (*mce_recoverable_t)(u64 status);
-extern void mce_recoverable_register(mce_recoverable_t);
-
 /* Read an MSR, checking for an interposed value first */
 extern struct intpose_ent *intpose_lookup(unsigned int, uint64_t,
     uint64_t *);
@@ -94,21 +86,19 @@ extern void intpose_inval(unsigned int,
        wrmsrl(msr, val); \
 } while (0)

-
-/* Utility function to "logout" all architectural MCA telemetry from the MCA
- * banks of the current processor.  A cookie is returned which may be
- * uses to reference the data so logged (the cookie can be NULL if
- * no logout structures were available).  The caller can also pass a pointer
- * to a structure which will be completed with some summary information
- * of the MCA data observed in the logout operation. */
-
-enum mca_source {
-       MCA_MCE_HANDLER,
-       MCA_POLLER,
-       MCA_CMCI_HANDLER,
-       MCA_RESET,
-       MCA_MCE_SCAN
-};
+#define MCA_NO_ERROR    0x0
+/* Hardware has corrected the errors */
+#define MCA_CORRECTED   0x1
+/* Software has recovered the error successfully */
+#define MCA_RECOVERED   0x2
+/* The error is recovered, but the system is tainted, as for Intel's SRAO */
+#define MCA_PARTIAL_RECOVERED   0x3
+/*
+ * The error is not corrected and still needs to go to the next softirq step.
+ * Same as fatal if this is the last step.
+ */
+#define MCA_NEXT_STEP   0x8
+#define MCA_FATAL       0xF

 struct mca_summary {
        uint32_t        errcnt; /* number of banks with valid errors */
@@ -117,7 +107,58 @@ struct mca_summary {
        uint32_t        uc;     /* bitmask of banks with UC */
        uint32_t        pcc;    /* bitmask of banks with PCC */
        /* bitmask of banks with software error recovery ability*/
-       uint32_t        recoverable;
+       uint32_t        recoverable;
+    int         severity;
+};
+
+struct mca_binfo {
+    int bank;
+    uint64_t status;
+    struct mcinfo_global *mig;
+    struct mcinfo_extended *migext;
+    struct mcinfo_bank *mib;
+    struct mcinfo_extended *mibext;
+    struct mc_info *mi;
+};
+
+static inline uint64_t mca_binfo_status(int bank, struct mca_binfo *bi)
+{
+    uint64_t status;
+
+    if (bi)
+        return bi->status;
+    else
+        mca_rdmsrl(MSR_IA32_MC0_STATUS + 4 * bank, status);
+    return status;
+}
+
+struct mca_ops {
+    mctelem_class_t queue_type;
+
+    /* flags passing in mcinfo_global, mainly for compatibility */
+    uint32_t flags;
+
+    struct mcinfo_extended* (*get_gext)(struct mca_binfo *bi);
+
+    int (*global_handler)(struct cpu_user_regs *regs,
+                        long error_code,
+                        struct mca_binfo *bi,
+                        struct mca_summary *result);
+
+    /*
+     * We pass bank here because bi may be NULL (e.g. out of memory).
+     * The same holds for the bank-related handlers below.
+     */
+    int (*need_handle)(int bank, struct mca_binfo *bi);
+
+    struct mcinfo_extended *(*get_bext)(int bank, struct mca_binfo *bi);
+
+    /* Return value same as global_handler */
+    int (*bank_handler)(int bank,
+                        struct cpu_user_regs *regs,
+                        long error_code,
+                        struct mca_binfo *bi,
+                        struct mca_summary *result);
 };

 extern cpu_banks_t mca_allbanks;
@@ -129,14 +170,34 @@ extern int is_mc_panic;
 extern int is_mc_panic;
 extern int mce_broadcast;
 extern void mcheck_mca_clearbanks(cpu_banks_t);
-
-extern mctelem_cookie_t mcheck_mca_logout(enum mca_source, cpu_banks_t,
-    struct mca_summary *, cpu_banks_t*);
+extern int default_mca_bhandler(int bank, struct cpu_user_regs *regs,
+                             long error_code,
+                             struct mca_binfo *bi,
+                             struct mca_summary *result);
+extern int default_valid_bank(int bank,
+                              struct mca_binfo *bi);
+extern struct mcinfo_extended *
+amd_f10_get_bext(int bank, struct mca_binfo *bi);
+int mce_global_handler(struct cpu_user_regs *regs,
+                        long error_code,
+                        struct mca_binfo *bi,
+                        struct mca_summary *result);
+extern struct mca_ops *amd_mce_ops;
+
+extern mctelem_cookie_t mcheck_mca_handler(struct cpu_user_regs *regs,
+                                    unsigned long error_code,
+                                    struct mca_ops *ops,
+                                    struct mca_summary *sp);
+
+/* Common generic MCE handler that implementations may nominate
+ * via x86_mce_vector_register. */
+extern void mcheck_cmn_handler(struct cpu_user_regs *regs,
+                       long error_code, struct mca_ops *ops);

 /* Register a callback to be made during bank telemetry logout.
  * This callback is only available to those machine check handlers
  * that call to the common mcheck_cmn_handler or who use the common
- * telemetry logout function mcheck_mca_logout in error polling.
+ * telemetry logout function mcheck_mca_handler in error polling.
  *
  * This can be used to collect additional information (typically non-
  * architectural) provided by newer CPU families/models without the need
@@ -147,14 +208,6 @@ extern mctelem_cookie_t mcheck_mca_logou
  * MCi_STATUS value for that bank.
  */

-/* Register a handler for judging whether the bank need to be cleared */
-typedef int (*mce_need_clearbank_t)(enum mca_source who, u64 status);
-extern void mce_need_clearbank_register(mce_need_clearbank_t);
-
-typedef struct mcinfo_extended *(*x86_mce_callback_t)
-    (struct mc_info *, uint16_t, uint64_t);
-extern void x86_mce_callback_register(x86_mce_callback_t);
-
 void *x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
 void *x86_mcinfo_reserve(struct mc_info *mi, int size);
 void x86_mcinfo_dump(struct mc_info *mi);
diff -r da1165144ac4 -r 5223a8944e2d xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Mon Apr 19 15:31:29 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Mon Apr 19 16:41:28 2010 +0800
@@ -18,7 +18,7 @@ DEFINE_PER_CPU(cpu_banks_t, mce_banks_ow
 DEFINE_PER_CPU(cpu_banks_t, mce_banks_owned);
 DEFINE_PER_CPU(cpu_banks_t, no_cmci_banks);
 int cmci_support = 0;
-int ser_support = 0;
+int ser_support = 1;

 static int nr_intel_ext_msrs = 0;

@@ -40,8 +40,14 @@ static struct mce_softirq_barrier mce_tr
  */
 static DEFINE_SPINLOCK(mce_logout_lock);

-static atomic_t severity_cpu = ATOMIC_INIT(-1);
+/* used in MCE trap context */
+static atomic_t trap_severity_cpu = ATOMIC_INIT(-1);
+/* Initialized to MCA_NO_ERROR so that at least one CPU will set it */
+static atomic_t trap_severity = ATOMIC_INIT(0);
 static atomic_t found_error = ATOMIC_INIT(0);
+
+/* used in softirq context */
+static atomic_t si_severity_cpu = ATOMIC_INIT(-1);

 static void mce_barrier_enter(struct mce_softirq_barrier *);
 static void mce_barrier_exit(struct mce_softirq_barrier *);
@@ -150,13 +156,20 @@ static void intel_init_thermal(struct cp
 #endif /* CONFIG_X86_MCE_THERMAL */

 #define INTEL_EXTENDED_MCA_MSRs (MSR_IA32_MCG_R15 - MSR_IA32_MCG_EAX + 1)
-struct mcinfo_extended *
-intel_get_extended_msrs(struct mcinfo_global *mig, struct mc_info *mi)
+struct mcinfo_extended*
+intel_get_extended_msrs(struct mca_binfo *bi)
 {
     struct mcinfo_extended *mc_ext;
     struct mcinfo_msr *msr;
     int num = nr_intel_ext_msrs, i, length;
-
+    struct mc_info *mi;
+    struct mcinfo_global *mig;
+
+    if (!bi)
+        return NULL;
+
+    mi = bi->mi;
+    mig = bi->mig;
     /*
      * According to spec, processor _support_ 64 bit will always
      * have MSR beyond IA32_MCG_MISC
@@ -190,102 +203,239 @@ intel_get_extended_msrs(struct mcinfo_gl
     return mc_ext;
 }

-static void intel_UCR_handler(struct mcinfo_bank *bank,
-             struct mcinfo_global *global,
-             struct mcinfo_extended *extension,
-             struct mca_handle_result *result)
+#define mce_invalid 0x0
+#define mce_fatal 0x1
+#define mce_corrected 0x2
+#define mce_ucr_ucna 0x3
+#define mce_ucr_srao 0x4
+#define mce_ucr_srar 0x5
+static int mce_type(uint64_t status)
+{
+    if (!(status & MCi_STATUS_VAL))
+        return mce_invalid;
+
+    if (status & MCi_STATUS_PCC)
+        return mce_fatal;
+
+    /* Corrected error? */
+    if (!(status & MCi_STATUS_UC))
+        return mce_corrected;
+
+    if (!ser_support)
+        return mce_fatal;
+
+    /* XXX Is this check really right? */
+    if (!(status & MCi_STATUS_EN))
+        return mce_invalid;
+
+    if (status & MCi_STATUS_S)
+    {
+        if (status & MCi_STATUS_AR)
+        {
+            if (status & MCi_STATUS_OVER)
+                return mce_fatal;
+            else
+                return mce_ucr_srar;
+        } else
+            return mce_ucr_srao;
+    }
+    else
+        return mce_ucr_ucna;
+
+    /* Any type not included above? */
+    return mce_fatal;
+}
+
+struct mcinfo_recovery *mci_add_pageoff_action(int bank, struct mc_info *mi,
+                              uint64_t mfn, uint32_t status)
+{
+    struct mcinfo_recovery *rec;
+
+    if (!mi)
+        return NULL;
+
+    rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
+    if (!rec)
+    {
+        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
+        return NULL;
+    }
+
+    memset(rec, 0, sizeof(struct mcinfo_recovery));
+
+    rec->mc_bank = bank;
+    rec->action_types = MC_ACTION_PAGE_OFFLINE;
+    rec->action_info.page_retire.mfn = mfn;
+    rec->action_info.page_retire.status = status;
+    return rec;
+}
+
+struct mcinfo_recovery *mci_add_vmce_action(int bank, struct mc_info *mi,
+                              int domain)
+{
+    struct mcinfo_recovery *rec;
+
+    if (!mi)
+        return NULL;
+
+    rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
+    if (!rec)
+    {
+        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
+        return NULL;
+    }
+
+    memset(rec, 0, sizeof(struct mcinfo_recovery));
+
+    rec->mc_bank = bank;
+    rec->action_types = MC_ACTION_VMCE_INJECTION;
+    rec->action_info.vmce.dom = domain;
+    return rec;
+}
+
+static int intel_srao_handler(int bank, struct mcinfo_bank *mib,
+             struct mcinfo_global *mig,
+             struct mc_info *mi)
 {
     struct domain *d;
     unsigned long mfn, gfn;
     uint32_t status;
+    int ret = MCA_PARTIAL_RECOVERED;

     mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
-    result->result = MCA_NEED_RESET;
-    if (bank->mc_addr != 0) {
-         mfn = bank->mc_addr >> PAGE_SHIFT;
-         if (!offline_page(mfn, 1, &status)) {
-              /* This is free page */
-              if (status & PG_OFFLINE_OFFLINED)
-                  result->result = MCA_RECOVERED;
-              else if (status & PG_OFFLINE_PENDING) {
-                 /* This page has owner */
-                  if (status & PG_OFFLINE_OWNED) {
-                      result->result |= MCA_OWNER;
-                      result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
-                      mce_printk(MCE_QUIET, "MCE: This error page is ownded"
-                                  " by DOM %d\n", result->owner);
-                      /* Fill vMCE# injection and vMCE# MSR virtualization "
-                       * "related data */
-                      bank->mc_domid = result->owner;
-                      /* XXX: Cannot handle shared pages yet
-                       * (this should identify all domains and gfn mapping to
-                       *  the mfn in question) */
-                      BUG_ON( result->owner == DOMID_COW );
-                      if ( result->owner != DOMID_XEN ) {
-
-                          d = get_domain_by_id(result->owner);
-                          if ( mca_ctl_conflict(bank, d) )
-                          {
-                              /* Guest has different MCE ctl with hypervisor */
-                              if ( d )
-                                  put_domain(d);
-                              return;
-                          }
-
-                          ASSERT(d);
-                          gfn =
-                              get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
-                          bank->mc_addr =  gfn << PAGE_SHIFT |
-                                        (bank->mc_addr & (PAGE_SIZE -1 ));
-                          if ( fill_vmsr_data(bank, d,
-                                              global->mc_gstatus) == -1 )
-                          {
-                          mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
-                                      "failed\n", result->owner);
-                              put_domain(d);
-                              domain_crash(d);
-                              return;
-                          }
-                          /* We will inject vMCE to DOMU*/
-                          if ( inject_vmce(d) < 0 )
-                          {
-                              mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
-                                          " failed\n", d->domain_id);
-                              put_domain(d);
-                              domain_crash(d);
-                              return;
-                          }
-                          /* Impacted domain go on with domain's recovery job
-                           * if the domain has its own MCA handler.
-                           * For xen, it has contained the error and finished
-                           * its own recovery job.
-                           */
-                          result->result = MCA_RECOVERED;
-                          put_domain(d);
-                      }
-                  }
-              }
-         }
-    }
-}
-
-#define INTEL_MAX_RECOVERY 2
-struct mca_error_handler intel_recovery_handler[INTEL_MAX_RECOVERY] =
-            {{0x017A, intel_UCR_handler}, {0x00C0, intel_UCR_handler}};
-
+
+    /* We can always continue even if recover failed here */
+    if (!mib || !mib->mc_addr)
+        return ret;
+
+    mfn = mib->mc_addr >> PAGE_SHIFT;
+    if (!offline_page(mfn, 1, &status)) {
+        struct mcinfo_recovery *po;
+        uint64_t tmp;
+        domid_t owner;
+
+        po = mci_add_pageoff_action(mib->mc_bank, mi, mfn, status);
+
+        if (status & PG_OFFLINE_OFFLINED)
+            return MCA_RECOVERED;
+        else if (status & PG_OFFLINE_PENDING)
+        {
+            if (status & PG_OFFLINE_XENPAGE)
+                return ret;
+
+            ASSERT(status & PG_OFFLINE_OWNED);
+
+            owner = status >> PG_OFFLINE_OWNER_SHIFT;
+            if (owner == DOMID_XEN)
+                return ret;
+
+            mce_printk(MCE_QUIET, "MCE: The error page is owned"
+              " by DOM %d\n", owner);
+            /* XXX: Cannot handle shared pages yet
+             * (this should identify all domains and gfn mapping to
+             *  the mfn in question) */
+            BUG_ON(owner == DOMID_COW);
+            d = get_domain_by_id(owner);
+            if (!d)
+                return ret;
+            /*
+             * No vMCE injection to this guest
+             */
+            if (mca_ctl_conflict(mib, d))
+            {
+                put_domain(d);
+                return ret;
+            }
+
+            /* We need keep physical address for dom0's log */
+            tmp = mib->mc_addr;
+
+            gfn = get_gpfn_from_mfn((mib->mc_addr) >> PAGE_SHIFT);
+            mib->mc_addr = gfn << PAGE_SHIFT |
+              (mib->mc_addr & (PAGE_SIZE - 1));
+
+            if (vmce_domain_inject(mib, d, mig) < 0)
+            {
+                mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
+                  "failed\n", owner);
+                put_domain(d);
+            } else
+            {
+                mci_add_vmce_action(mib->mc_bank, mi, d->domain_id);
+                put_domain(d);
+            }
+            mib->mc_addr = tmp;
+        } else
+            /* status & PG_OFFLINE_FAILED */
+            return ret;
+    }
+
+    return ret;
+}
+
+/*
+ * According to the SDM, there are currently 2 SRAO UCR errors defined.
+ * Luckily, these two errors can be handled in the same way.
+ */
+static int intel_srao_check(int bank, struct mcinfo_bank *mib,
+            struct mcinfo_global *mig,
+            struct mc_info *mi)
+{
+    int mca_code;
+
+    if (!mib)
+        return 0;
+
+    mca_code = mib->mc_status & 0xFFFF;
+    if (mca_code == 0x17A ||
+         (mca_code & 0xFFF0) == 0xC0)
+         return 1;
+
+     return 0;
+}
+
+static int mce_default_check(int bank, struct mcinfo_bank *mib,
+             struct mcinfo_global *mig,
+             struct mc_info *mi)
+{
+    return 1;
+}
+
+static int mce_default_dhandler(int bank, struct mcinfo_bank *mib,
+            struct mcinfo_global *mig,
+            struct mc_info *mi)
+{
+    int type;
+
+    if (!mib)
+        return MCA_NO_ERROR;
+
+    type = mce_type(mib->mc_status);
+
+    if (type == mce_fatal || type == mce_ucr_srar)
+        return MCA_FATAL;
+
+    return (type == mce_ucr_srao)? MCA_PARTIAL_RECOVERED : MCA_RECOVERED;
+}
+
+/* Handlers called in softirq context */
+static struct mca_error_handler mce_delay_handlers[] =
+            { {intel_srao_check, intel_srao_handler},
+              {mce_default_check, mce_default_dhandler}, };
+
+#define mce_dhandler_size (sizeof(mce_delay_handlers)/sizeof(struct mca_error_handler))
 /*
  * Called from mctelem_process_deferred. Return 1 if the telemetry
  * should be committed for dom0 consumption, 0 if it should be
  * dismissed.
  */
-static int mce_action(mctelem_cookie_t mctc)
+static int intel_mce_action(mctelem_cookie_t mctc)
 {
     struct mc_info *local_mi;
-    uint32_t i;
+    uint32_t i, valid_num = 0;
     struct mcinfo_common *mic = NULL;
     struct mcinfo_global *mc_global;
     struct mcinfo_bank *mc_bank;
-    struct mca_handle_result mca_res;

     local_mi = (struct mc_info*)mctelem_dataptr(mctc);
     x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
@@ -304,50 +454,25 @@ static int mce_action(mctelem_cookie_t m
             continue;
         }
         mc_bank = (struct mcinfo_bank*)mic;
-
-        /* TODO: Add recovery actions here, such as page-offline, etc */
-        memset(&mca_res, 0x0f, sizeof(mca_res));
-        for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
-            if ( ((mc_bank->mc_status & 0xffff) ==
-                        intel_recovery_handler[i].mca_code) ||
-                  ((mc_bank->mc_status & 0xfff0) ==
-                        intel_recovery_handler[i].mca_code)) {
-                /* For SRAR, OVER = 1 should have caused reset
-                 * For SRAO, OVER = 1 skip recovery action, continue execution
-                 */
-                if (!(mc_bank->mc_status & MCi_STATUS_OVER))
-                    intel_recovery_handler[i].recovery_handler
-                                (mc_bank, mc_global, NULL, &mca_res);
-                else {
-                   if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
-                       mca_res.result = MCA_NEED_RESET;
-                   else
-                       mca_res.result = MCA_NO_ACTION;
+        valid_num++;
+
+        for ( i = 0; i < mce_dhandler_size; i++ ) {
+            if (mce_delay_handlers[i].check_error(mc_bank->mc_bank,
+                                mc_bank, mc_global, local_mi))
+            {
+                int ret;
+
+                ret = mce_delay_handlers[i].handler(mc_bank->mc_bank,
+                            mc_bank, mc_global, local_mi);
+                if (ret == MCA_FATAL)
+                {
+                    x86_mcinfo_dump(mctelem_dataptr(mctc));
+                    mc_panic("Failed to handle mce in softirq context\n");
                 }
-                if (mca_res.result & MCA_OWNER)
-                    mc_bank->mc_domid = mca_res.owner;
-                if (mca_res.result == MCA_NEED_RESET)
-                    /* DOMID_XEN*/
-                    mc_panic("MCE: Software recovery failed for the UCR "
-                                "error\n");
-                else if (mca_res.result == MCA_RECOVERED)
-                    mce_printk(MCE_VERBOSE, "MCE: The UCR error is"
-                                "successfully recovered by software!\n");
-                else if (mca_res.result == MCA_NO_ACTION)
-                    mce_printk(MCE_VERBOSE, "MCE: Overwrite SRAO error can't"
-                                "do recover action, RIPV=1, let it be.\n");
-                break;
             }
         }
-        /* For SRAR, no defined recovery action should have caused reset
-         * in MCA Handler
-         */
-        if ( i >= INTEL_MAX_RECOVERY )
-            mce_printk(MCE_VERBOSE, "MCE: No software recovery action"
-                            " found for this SRAO error\n");
-
-    }
-    return 1;
+    }
+    return valid_num ? 1 : 0;
 }

 /* Softirq Handler for this MCE# processing */
@@ -369,15 +494,15 @@ static void mce_softirq(void)
      * will overwrite the value and become the default.
      */

-    atomic_set(&severity_cpu, cpu);
+    atomic_set(&si_severity_cpu, cpu);

     mce_barrier_enter(&mce_severity_bar);
     if (!mctelem_has_deferred(cpu))
-        atomic_set(&severity_cpu, cpu);
+        atomic_set(&si_severity_cpu, cpu);
     mce_barrier_exit(&mce_severity_bar);

     /* We choose severity_cpu for further processing */
-    if (atomic_read(&severity_cpu) == cpu) {
+    if (atomic_read(&si_severity_cpu) == cpu) {

         mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);

@@ -385,7 +510,7 @@ static void mce_softirq(void)
          * vMCE MSRs virtualization buffer
          */
         for_each_online_cpu(workcpu) {
-            mctelem_process_deferred(workcpu, mce_action);
+            mctelem_process_deferred(workcpu, intel_mce_action);
         }

         /* Step2: Send Log to DOM0 through vIRQ */
@@ -393,6 +518,7 @@ static void mce_softirq(void)
             mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
             send_guest_global_virq(dom0, VIRQ_MCA);
         }
+        atomic_set(&si_severity_cpu, -1);
     }

     mce_barrier_exit(&mce_inside_bar);
@@ -515,53 +641,225 @@ static void mce_barrier(struct mce_softi
 }
 #endif

+static int mce_default_ucheck(int bank, struct mcinfo_bank *mib,
+             struct mcinfo_global *mig,
+             struct mc_info *mi)
+{
+    return 1;
+}
+
+static int mce_default_uhandler(int bank, struct mcinfo_bank *mib,
+            struct mcinfo_global *mig,
+            struct mc_info *mi)
+{
+    int type;
+    uint64_t status;
+
+    mca_rdmsrl(MSR_IA32_MC0_STATUS + 4 * bank, status);
+    type = mce_type(status);
+
+    if (type != mce_ucr_srar && type != mce_ucr_srao)
+    {
+        mce_printk(MCE_CRITICAL, "Wrong mce type in urgent handler\n");
+        return MCA_FATAL;
+    }
+
+    if (type == mce_ucr_srar)
+        return MCA_FATAL;
+    /* Known SRAO errors need further handling in the softirq step */
+    else if ((status & 0xFFFF) == 0x17A ||
+             (status & 0xFFF0) == 0xC0)
+        return MCA_NEXT_STEP;
+    else
+        return MCA_PARTIAL_RECOVERED;
+}
+
+/* Handlers called in MCE context */
+struct mca_error_handler intel_mce_uhandlers[] =
+            {{mce_default_ucheck, mce_default_uhandler},};
+#define intel_mce_uhandler_size (sizeof(intel_mce_uhandlers)/sizeof(struct mca_error_handler))
+static int intel_urgent_mce_handler(int bank,
+                        struct cpu_user_regs *regs,
+                        long error_code,
+                        struct mca_binfo *bi,
+                        struct mca_summary *result)
+{
+    int i;
+
+    if (!bi)
+        return MCA_NO_ERROR;
+
+    for (i = 0; i < intel_mce_uhandler_size; i++)
+    {
+        if (intel_mce_uhandlers[i].check_error(bank, bi->mib, bi->mig, bi->mi))
+            return intel_mce_uhandlers[i].handler(bank,
+                    bi->mib, bi->mig, bi->mi);
+    }
+
+    /* Not reached, thanks to the default catch-all handler */
+    return MCA_FATAL;
+}
+
+static int mce_bank_handler(int bank,
+                        struct cpu_user_regs *regs,
+                        long error_code,
+                        struct mca_binfo *bi,
+                        struct mca_summary *result)
+{
+    int type, ret = MCA_NO_ERROR, cb = 1;
+    uint64_t status;
+
+    status = mca_binfo_status(bank, bi);
+    type = mce_type(status);
+
+    switch (type) {
+    case mce_invalid:
+    /* CMCI or poller will handle corrected errors */
+    case mce_corrected:
+    case mce_ucr_ucna:
+        cb = 0;
+        return MCA_NO_ERROR;
+    case mce_fatal:
+        ret = MCA_FATAL;
+        break;
+    case mce_ucr_srao:
+        ret = intel_urgent_mce_handler(bank, regs, error_code, bi, result);
+        break;
+    case mce_ucr_srar:
+        /* Fail if MCE information is not logged */
+        if (!bi || !bi->mig || !bi->mib)
+            ret = MCA_FATAL;
+        else
+            ret = intel_urgent_mce_handler(bank, regs, error_code, bi, result);
+        break;
+    }
+
+    if (cb)
+        mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * bank, 0x0ULL);
+
+    return ret;
+}
+
+static int mce_valid_bank(int bank, struct mca_binfo *bi)
+{
+    int type;
+    uint64_t status;
+
+    status = mca_binfo_status(bank, bi);
+    type = mce_type(status);
+    if ( type == mce_invalid || type == mce_corrected || type == mce_ucr_ucna )
+        return 0;
+    return 1;
+}
+
+struct mca_ops intel_mce_ops = {
+    .queue_type = MC_URGENT,
+    .flags  = MC_FLAG_MCE,
+    .get_gext = intel_get_extended_msrs,
+    .bank_handler = mce_bank_handler,
+    .global_handler = mce_global_handler,
+    .need_handle = mce_valid_bank,
+};
+
+static struct mcinfo_extended *cmci_get_bext(int bank, struct mca_binfo *bi)
+{
+    if (!bi || !bi->mib)
+        return NULL;
+
+    mca_rdmsrl(MSR_IA32_MC0_CTL2 + bank, bi->mib->mc_ctrl2);
+    return NULL;
+}
+
+static int cmci_bank_handler(int bank,
+                             struct cpu_user_regs *regs,
+                             long error_code,
+                             struct mca_binfo *bi,
+                             struct mca_summary *result)
+{
+    mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * bank, 0x0ULL);
+    return MCA_CORRECTED;
+}
+
+static int cmci_valid_bank(int bank, struct mca_binfo *bi)
+{
+    if (test_bit(bank, __get_cpu_var(mce_banks_owned)))
+    {
+        uint64_t status;
+        int type;
+
+        status = mca_binfo_status(bank, bi);
+        type = mce_type(status);
+        if (type == mce_corrected || type == mce_ucr_ucna)
+            return 1;
+        return 0;
+    }
+
+    return 0;
+}
+
+struct mca_ops intel_cmci_ops = {
+    .queue_type = MC_NONURGENT,
+    .flags  = MC_FLAG_CMCI,
+    .get_gext = intel_get_extended_msrs,
+    .get_bext = cmci_get_bext,
+    .bank_handler = cmci_bank_handler,
+    .need_handle = cmci_valid_bank,
+};
+
+/* After reset, we will fetch all MCA errors */
+static int mca_reset_valid_bank(int bank, struct mca_binfo *bi)
+{
+    uint64_t status;
+
+    status = mca_binfo_status(bank, bi);
+    return !!(status & MCi_STATUS_VAL);
+}
+
+struct mca_ops mca_reset_ops = {
+    .queue_type = MC_NONURGENT,
+    .flags = MC_FLAG_POLLED,
+    .get_gext = intel_get_extended_msrs,
+    .get_bext = cmci_get_bext,
+    .need_handle = mca_reset_valid_bank,
+};
+
 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
 {
     uint64_t gstatus;
     mctelem_cookie_t mctc = NULL;
     struct mca_summary bs;
-    cpu_banks_t clear_bank;
+    int cur_severity;
+    int panic = 0, si = 0;

     mce_spin_lock(&mce_logout_lock);

-    memset( &clear_bank, 0x0, sizeof(cpu_banks_t));
-    mctc = mcheck_mca_logout(MCA_MCE_SCAN, mca_allbanks, &bs, &clear_bank);
-
-    if (bs.errcnt) {
-        /* dump MCE error */
-        if (mctc != NULL)
-            x86_mcinfo_dump(mctelem_dataptr(mctc));
-
-        /*
-         * Uncorrected errors must be dealth with in softirq context.
-         */
-        if (bs.uc || bs.pcc) {
-            add_taint(TAINT_MACHINE_CHECK);
-            if (mctc != NULL)
-                mctelem_defer(mctc);
-            /*
-            * For PCC=1 and can't be recovered, context is lost, so reboot now without
-             * clearing  the banks, and deal with the telemetry after reboot
-             * (the MSRs are sticky)
-             */
-            if (bs.pcc)
-                mc_panic("State lost due to machine check exception.\n");
-            if (!bs.ripv)
-                mc_panic("RIPV =0 can't resume execution!\n");
-            if (!bs.recoverable)
-                mc_panic("Machine check exception software recovery fail.\n");
-        } else {
-            if (mctc != NULL)
-                mctelem_commit(mctc);
-        }
-        atomic_set(&found_error, 1);
-
-        mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
-                *((unsigned long*)clear_bank), smp_processor_id());
-        mcheck_mca_clearbanks(clear_bank);
-    } else {
-        if (mctc != NULL)
-            mctelem_dismiss(mctc);
+    mctc = mcheck_mca_handler(regs, error_code, &intel_mce_ops, &bs);
+
+    cur_severity = atomic_read(&trap_severity);
+    if (bs.severity >= cur_severity)
+    {
+        /* Mark the last CPU reporting the highest severity */
+        atomic_set(&trap_severity, bs.severity);
+        atomic_set(&trap_severity_cpu, smp_processor_id());
+    }
+
+    switch (bs.severity)
+    {
+    case MCA_FATAL:
+    /* Do we handle partially recovered errors in this step too? */
+    case MCA_PARTIAL_RECOVERED:
+    case MCA_NEXT_STEP:
+    case MCA_RECOVERED:
+        add_taint(TAINT_MACHINE_CHECK);
+        atomic_inc(&found_error);
+        break;
+    default:
+        break;
     }
     mce_spin_unlock(&mce_logout_lock);

@@ -569,113 +867,92 @@ static void intel_machine_check(struct c
      * Wait until everybody has processed the trap.
      */
     mce_barrier_enter(&mce_trap_bar);
-    /* According to latest MCA OS writer guide, if no error bank found
-     * on all cpus, something unexpected happening, we can't do any
-     * recovery job but to reset the system.
-     */
-    if (atomic_read(&found_error) == 0)
-        mc_panic("Unexpected condition for the MCE handler, need reset\n");
+    switch (atomic_read(&trap_severity))
+    {
+    case MCA_FATAL:
+        panic = 1;
+        break;
+    case MCA_NEXT_STEP:
+    case MCA_PARTIAL_RECOVERED:
+        si = 1;
+        break;
+    case MCA_NO_ERROR:
+    case MCA_CORRECTED:
+        /* According to latest MCA OS writer guide, if no error bank found
+         * on all CPUs, something unexpected is happening; we can't do any
+         * recovery work except resetting the system.
+         */
+        if (atomic_read(&found_error) == 0)
+            panic = 1;
+        break;
+    default:
+        break;
+    }
+
+    if (panic)
+    {
+        if (mctc)
+        {
+            x86_mcinfo_dump(mctelem_dataptr(mctc));
+            /* So that dom0 can fetch the log even when Xen panics, commit it */
+            mctelem_commit(mctc);
+        }
+        if (atomic_read(&trap_severity_cpu) == smp_processor_id())
+            mc_panic("Fatal MCE error, reset the system\n");
+        else
+        {
+            local_irq_enable();
+            for ( ; ; )
+                halt();
+        }
+    } else {
+        switch (bs.severity)
+        {
+            case MCA_NEXT_STEP:
+            case MCA_PARTIAL_RECOVERED:
+                if (mctc)
+                {
+                    x86_mcinfo_dump(mctelem_dataptr(mctc));
+                    mctelem_defer(mctc);
+                }
+                break;
+            case MCA_RECOVERED:
+                if (mctc)
+                {
+                    x86_mcinfo_dump(mctelem_dataptr(mctc));
+                    mctelem_commit(mctc);
+                }
+                break;
+            default:
+                break;
+        }
+    }
     mce_barrier_exit(&mce_trap_bar);

     /* Clear error finding flags after all cpus finishes above judgement */
     mce_barrier_enter(&mce_trap_bar);
-    if (atomic_read(&found_error)) {
-        mce_printk(MCE_CRITICAL, "MCE: Choose one CPU "
-                       "to clear error finding flag\n ");
+
+    if (atomic_read(&trap_severity_cpu) == smp_processor_id())
+    {
+        atomic_set(&trap_severity_cpu, -1);
+        atomic_set(&trap_severity, 0);
         atomic_set(&found_error, 0);
     }
+
+    mce_barrier_exit(&mce_trap_bar);
+
+    /*
+     * N.B. We need to clear MCIP after the barrier, to avoid re-entering the
+     * barrier again before the exit because of the next MCE
+     */
     mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
     if ((gstatus & MCG_STATUS_MCIP) != 0) {
         mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
         mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
     }
-    mce_barrier_exit(&mce_trap_bar);
-
-    raise_softirq(MACHINE_CHECK_SOFTIRQ);
-}
-
-/* According to MCA OS writer guide, CMCI handler need to clear bank when
- * 1) CE (UC = 0)
- * 2) ser_support = 1, Superious error, OVER = 0, EN = 0, [UC = 1]
- * 3) ser_support = 1, UCNA, OVER = 0, S = 1, AR = 0, PCC = 0, [UC = 1, EN = 1]
- * MCA handler need to clear bank when
- * 1) ser_support = 1, Superious error, OVER = 0, EN = 0, UC = 1
- * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
- * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
-*/
-
-static int intel_need_clearbank_scan(enum mca_source who, u64 status)
-{
-    if ( who == MCA_CMCI_HANDLER) {
-        /* CMCI need clear bank */
-        if ( !(status & MCi_STATUS_UC) )
-            return 1;
-        /* Spurious need clear bank */
-        else if ( ser_support && !(status & MCi_STATUS_OVER)
-                    && !(status & MCi_STATUS_EN) )
-            return 1;
-        /* UCNA OVER = 0 need clear bank */
-        else if ( ser_support && !(status & MCi_STATUS_OVER)
-                    && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
-                    && !(status & MCi_STATUS_AR))
-            return 1;
-        /* Only Log, no clear */
-        else return 0;
-    }
-    else if ( who == MCA_MCE_SCAN) {
-        /* Spurious need clear bank */
-        if ( ser_support && !(status & MCi_STATUS_OVER)
-                    && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN))
-            return 1;
-        /* SRAR OVER=0 clear bank. OVER = 1 have caused reset */
-        else if ( ser_support && (status & MCi_STATUS_UC)
-                    && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR )
-                    && (status & MCi_STATUS_OVER) )
-            return 1;
-        /* SRAO need clear bank */
-        else if ( ser_support && !(status & MCi_STATUS_AR)
-                    && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC))
-            return 1;
-        else
-            return 0;
-    }
-
-    return 1;
-}
-
-/* MCE continues/is recoverable when
- * 1) CE UC = 0
- * 2) Supious ser_support = 1, OVER = 0, En = 0 [UC = 1]
- * 3) SRAR ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC =1, EN = 1]
- * 4) SRAO ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
- * 5) UCNA ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0, [UC = 1]
- */
-static int intel_recoverable_scan(u64 status)
-{
-
-    if ( !(status & MCi_STATUS_UC ) )
-        return 1;
-    else if ( ser_support && !(status & MCi_STATUS_EN)
-                && !(status & MCi_STATUS_OVER) )
-        return 1;
-    /* SRAR error */
-    else if ( ser_support && !(status & MCi_STATUS_OVER)
-                && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
-                && (status & MCi_STATUS_AR) ) {
-        mce_printk(MCE_VERBOSE, "MCE: No SRAR error defined currently.\n");
-        return 0;
-    }
-    /* SRAO error */
-    else if (ser_support && !(status & MCi_STATUS_PCC)
-                && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
-                && (status & MCi_STATUS_EN))
-        return 1;
-    /* UCNA error */
-    else if (ser_support && !(status & MCi_STATUS_OVER)
-                && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
-                && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR))
-        return 1;
-    return 0;
+
+    if (si)
+        raise_softirq(MACHINE_CHECK_SOFTIRQ);
 }

 static DEFINE_SPINLOCK(cmci_discover_lock);
@@ -689,8 +966,7 @@ static int do_cmci_discover(int i)
     u64 val;

     rdmsrl(msr, val);
-    /* Some other CPU already owns this bank. */
-    if (val & CMCI_EN) {
+    if (val & CMCI_EN) {
         clear_bit(i, __get_cpu_var(mce_banks_owned));
         goto out;
     }
@@ -733,8 +1009,7 @@ static void cmci_discover(void)
      * the CMCI interrupt will never be triggered again.
      */

-    mctc = mcheck_mca_logout(
-        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
+    mctc = mcheck_mca_handler(NULL, 0, &intel_cmci_ops, &bs);

     if (bs.errcnt && mctc != NULL) {
         if (dom0_vmce_enabled()) {
@@ -849,8 +1124,7 @@ fastcall void smp_cmci_interrupt(struct
     ack_APIC_irq();
     irq_enter();

-    mctc = mcheck_mca_logout(
-        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);
+    mctc = mcheck_mca_handler(regs, 0, &intel_cmci_ops, &bs);

     if (bs.errcnt && mctc != NULL) {
         if (dom0_vmce_enabled()) {
@@ -914,7 +1188,7 @@ static void mce_init(void)
     /* log the machine checks left over from the previous reset.
      * This also clears all registers*/

-    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);
+    mctc = mcheck_mca_handler(NULL, 0, &mca_reset_ops, &bs);

     /* in the boot up stage, print out and also log in DOM0 boot process */
     if (bs.errcnt && mctc != NULL) {
@@ -948,8 +1222,6 @@ enum mcheck_type intel_mcheck_init(struc

     /* machine check is available */
     x86_mce_vector_register(intel_machine_check);
-    mce_recoverable_register(intel_recoverable_scan);
-    mce_need_clearbank_register(intel_need_clearbank_scan);

     mce_init();
     mce_intel_feature_init(c);
@@ -994,5 +1266,3 @@ int intel_mce_rdmsr(uint32_t msr, uint64

     return ret;
 }
-
-
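
For reviewers, a rough sketch of where the deleted intel_need_clearbank_scan()
logic is expected to live under the ops scheme. This is illustrative only and
not part of the attached patch: cmci_need_handle() and its single coarse SER
test are assumptions modelled on the nonfatal_ops example in non-fatal.c
below, and MC_FLAG_CMCI is assumed to be the existing flag from xen-mca.h.

static int cmci_need_handle(int bank, struct mca_binfo *bi)
{
    uint64_t status = mca_binfo_status(bank, bi);

    if ( !(status & MCi_STATUS_VAL) ||
         !test_bit(bank, __get_cpu_var(mce_banks_owned)) )
        return 0;
    /* Corrected errors (UC = 0) are always taken by the CMCI handler. */
    if ( !(status & MCi_STATUS_UC) )
        return 1;
    /* With SER support, also take spurious and UCNA logs; both require
     * OVER = 0, so the old per-case table collapses to one test here. */
    return ser_support && !(status & MCi_STATUS_OVER);
}

static struct mca_ops intel_cmci_ops =
{
    .queue_type  = MC_NONURGENT,
    .flags       = MC_FLAG_CMCI,
    .need_handle = cmci_need_handle,
};
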
diff -r da1165144ac4 -r 5223a8944e2d xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c       Mon Apr 19 15:31:29 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c       Mon Apr 19 16:41:28 2010 +0800
@@ -33,13 +33,30 @@ static int adjust = 0;
 static int adjust = 0;
 static int variable_period = 1;

+static int poller_valid_bank(int bank, struct mca_binfo *bi)
+{
+    uint64_t status;
+
+    status = mca_binfo_status(bank, bi);
+    if (!(status & MCi_STATUS_VAL))
+        return 0;
+    return test_bit(bank, __get_cpu_var(poll_bankmask));
+}
+
+static struct mca_ops nonfatal_ops =
+{
+    .queue_type = MC_NONURGENT,
+    .flags = MC_FLAG_POLLED,
+    .need_handle = poller_valid_bank,
+};
+
 static void mce_checkregs (void *info)
 {
        mctelem_cookie_t mctc;
        struct mca_summary bs;
        static uint64_t dumpcount = 0;

-       mctc = mcheck_mca_logout(MCA_POLLER, __get_cpu_var(poll_bankmask), &bs, NULL);
+       mctc = mcheck_mca_handler(NULL, 0, &nonfatal_ops, &bs);

        if (bs.errcnt && mctc != NULL) {
                adjust++;
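
The calls above hand the bank-mask duties over to ops->need_handle, so the
per-bank dispatch inside mcheck_mca_handler() (in mce.c, not shown in this
mail) is expected to be shaped roughly like the sketch below. Hand-waved
illustration only: nr_mce_banks, struct mca_summary and struct mca_binfo are
the existing symbols, everything else is made up for the example.

static void example_bank_walk(struct mca_ops *ops, struct mca_summary *sum)
{
    struct mca_binfo bi;
    int bank, seen = 0;

    memset(&bi, 0, sizeof(bi));
    memset(sum, 0, sizeof(*sum));

    for ( bank = 0; bank < nr_mce_banks; bank++ )
    {
        if ( !ops->need_handle(bank, &bi) )
            continue;
        if ( !seen++ )
        {
            /* First interesting bank: record the global state
             * (MCG_STATUS and friends) exactly once. */
        }
        sum->errcnt++;
        /* Gather extended bank info here, then run the bank handler;
         * its result decides whether handling continues. */
    }
}
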
diff -r da1165144ac4 -r 5223a8944e2d xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Mon Apr 19 15:31:29 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Mon Apr 19 16:41:28 2010 +0800
@@ -104,39 +104,17 @@ DECLARE_PER_CPU(cpu_banks_t, mce_banks_o
  *    recovery_handler must figure it out.
 */

-/* MCA error has been recovered successfully by the recovery action*/
-#define MCA_RECOVERED (0x1 << 0)
-/* MCA error impact the specified DOMAIN in owner field below */
-#define MCA_OWNER (0x1 << 1)
-/* MCA error can't be recovered and need reset */
-#define MCA_NEED_RESET (0x1 << 2)
-/* MCA error did not have any action yet */
-#define MCA_NO_ACTION (0x1 << 3)
-
-struct mca_handle_result
-{
-    uint32_t result;
-    /* Used one result & MCA_OWNER */
-    domid_t owner;
-    /* Used by mca_error_handler, result & MCA_RECOVRED */
-    struct recovery_action *action;
-};
-
-extern void (*mca_prehandler)( struct cpu_user_regs *regs,
-                        struct mca_handle_result *result);
-
 struct mca_error_handler
 {
-    /* Assume corresponding recovery action could be uniquely
-     * identified by mca_code. Otherwise, we might need to have
-     * a seperate function to decode the corresponding actions
-     * for the particular mca error later.
-    */
-    uint16_t mca_code;
-    void (*recovery_handler)( struct mcinfo_bank *bank,
-                    struct mcinfo_global *global,
-                    struct mcinfo_extended *extension,
-                    struct mca_handle_result *result);
+    int (*check_error)(int bank,
+                       struct mcinfo_bank *mib,
+                       struct mcinfo_global *mig,
+                       struct mc_info *mi);
+    /* return 0 if error handling can continue */
+    int (*handler)(int bank,
+                   struct mcinfo_bank *mib,
+                   struct mcinfo_global *mig,
+                   struct mc_info *mi);
 };

 /* Global variables */
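
Since the reworked struct is easier to discuss with a concrete user, here is
an illustrative (made-up, not part of the patch) handler pair in the new
style; the memory-scrub MCA code 0xc0 is only an example value.

static int check_scrub_error(int bank, struct mcinfo_bank *mib,
                             struct mcinfo_global *mig, struct mc_info *mi)
{
    /* Match the architectural MCA error code in MCi_STATUS[15:0]. */
    return (mib->mc_status & 0xffff) == 0xc0;
}

static int handle_scrub_error(int bank, struct mcinfo_bank *mib,
                              struct mcinfo_global *mig, struct mc_info *mi)
{
    /* e.g. queue the affected page for offlining here. */
    return 0;   /* 0 => error handling can continue */
}

static struct mca_error_handler scrub_error_handler =
{
    .check_error = check_scrub_error,
    .handler     = handle_scrub_error,
};
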
diff -r da1165144ac4 -r 5223a8944e2d xen/include/public/arch-x86/xen-mca.h
--- a/xen/include/public/arch-x86/xen-mca.h     Mon Apr 19 15:31:29 2010 +0800
+++ b/xen/include/public/arch-x86/xen-mca.h     Mon Apr 19 16:41:28 2010 +0800
@@ -181,12 +181,12 @@ struct mcinfo_extended {
  * REC_ACTION_RECOVERED flag will be returned.
  */

-/* Page Offline Action */
-#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
-/* CPU offline Action */
-#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
+#define MC_ACTION_PAGE_OFFLINE (0x1 << 0)
+#define MC_ACTION_CPU_OFFLINE (0x1 << 1)
+#define MC_ACTION_VMCE_INJECTION (0x1 << 3)
 /* L3 cache disable Action */
-#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
+#define MC_ACTION_CACHE_SHRINK (0x1 << 2)
+

 /* Below interface used between XEN/DOM0 for passing XEN's recovery action
  * information to DOM0.
@@ -210,6 +210,11 @@ struct cpu_offline_action
     uint16_t mc_core_threadid;
 };

+struct vmce_inject_action
+{
+    domid_t dom;
+};
+
 #define MAX_UNION_SIZE 16
 struct mcinfo_recovery
 {
@@ -220,6 +225,7 @@ struct mcinfo_recovery
     union {
         struct page_offline_action page_retire;
         struct cpu_offline_action cpu_offline;
+        struct vmce_inject_action vmce;
         uint8_t pad[MAX_UNION_SIZE];
     } action_info;
 };
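
A dom0 consumer of the new action could then look like the hypothetical
fragment below (assuming the mc_bank and action_types fields from the
unchanged part of struct mcinfo_recovery, and the flag encoding above):

void example_show_recovery(const struct mcinfo_recovery *rec)
{
    if ( rec->action_types & MC_ACTION_VMCE_INJECTION )
        printk("bank %u: vMCE injected into dom%u\n",
               rec->mc_bank, rec->action_info.vmce.dom);
}
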
diff -r da1165144ac4 -r 5223a8944e2d xen/include/xlat.lst
--- a/xen/include/xlat.lst      Mon Apr 19 15:31:29 2010 +0800
+++ b/xen/include/xlat.lst      Mon Apr 19 16:41:28 2010 +0800
@@ -26,6 +26,7 @@
 ?      mc_notifydomain                 arch-x86/xen-mca.h
 !      mc_physcpuinfo                  arch-x86/xen-mca.h
 ?      page_offline_action             arch-x86/xen-mca.h
+?      vmce_inject_action              arch-x86/xen-mca.h
 ?      evtchn_alloc_unbound            event_channel.h
 ?      evtchn_bind_interdomain         event_channel.h
 ?      evtchn_bind_ipi                 event_channel.h
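
Note on the xlat.lst entry: a '?' line requests a layout check rather than a
translation, so -- if I read the compat machinery right -- the compat MCE
code should only need the generated assertion, e.g.:

    CHECK_vmce_inject_action;

No XLAT_* translation routine should be necessary, since the struct only
carries a domid_t.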


Attachment: mce_ops.patch
Description: mce_ops.patch
