WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 3/6 ] Make mce_action action be usable for both delay

To: Keir Fraser <keir.fraser@xxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH 3/6 ] Make mce_action action be usable for both delayed handler and urgent handler
From: "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx>
Date: Wed, 9 Jun 2010 22:32:22 +0800
Accept-language: en-US
Acceptlanguage: en-US
Cc: xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxx>
Delivery-date: Wed, 09 Jun 2010 07:41:53 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: AcsH4JLPCinaKtkNS9OlS5qm3SP2rQ==
Thread-topic: [PATCH 3/6 ] Make mce_action action be usable for both delayed handler and urgent handler
Make mce_action action be usable for both delayed handler and urgent handler

Originally mce_action was called only from the delayed handler. Change it so
that it can be used by both the delayed handler and the urgent handler. Wrap it
with mce_delayed_action for the delayed handler.

Change the return value to an enum (mce_result) so that its meaning is clearer.

Change the MCA handler match from a fixed mca_code to a function (owned_error)
to be more flexible, and change the handler interface to take a struct
mca_binfo so that more information can be passed.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>

mce_intel.c |  210 ++++++++++++++++++++++++++++++++++++------------------------
 x86_mca.h   |   15 +++-
 2 files changed, 140 insertions(+), 85 deletions(-)

diff -r fb1adcc25acb xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Wed Jun 09 09:01:49 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Wed Jun 09 14:39:37 2010 +0800
@@ -154,89 +154,124 @@ static void mce_barrier_enter(struct mce
 static void mce_barrier_enter(struct mce_softirq_barrier *);
 static void mce_barrier_exit(struct mce_softirq_barrier *);
 
-static void intel_UCR_handler(struct mcinfo_bank *bank,
-             struct mcinfo_global *global,
-             struct mcinfo_extended *extension,
-             struct mca_handle_result *result);
-#define INTEL_MAX_RECOVERY 2
-struct mca_error_handler intel_recovery_handler[INTEL_MAX_RECOVERY] =
-            {{0x017A, intel_UCR_handler}, {0x00C0, intel_UCR_handler}};
+struct mca_error_handler *mce_dhandlers, *mce_uhandlers;
+int mce_dhandler_num, mce_uhandler_num;
+
+enum mce_result
+{
+    MCER_NOERROR,
+    MCER_RECOVERED,
+    /* Not recoverd, but can continue */
+    MCER_CONTINUE,
+    MCER_RESET,
+};
+
+/* Maybe called in MCE context, no lock, no printk */
+static enum mce_result mce_action(struct cpu_user_regs *regs,
+                      mctelem_cookie_t mctc)
+{
+    struct mc_info *local_mi;
+    enum mce_result ret = MCER_NOERROR;
+    uint32_t i;
+    struct mcinfo_common *mic = NULL;
+    struct mca_handle_result mca_res;
+    struct mca_binfo binfo;
+    struct mca_error_handler *handlers = mce_dhandlers;
+    int handler_num = mce_dhandler_num;
+
+    /* When in mce context, regs is valid */
+    if (regs)
+    {
+        handler_num = mce_uhandler_num;
+        handlers = mce_uhandlers;
+    }
+
+    /* At least a default handler should be registerd */
+    ASSERT(handler_num);
+
+    local_mi = (struct mc_info*)mctelem_dataptr(mctc);
+    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
+    if (mic == NULL) {
+        printk(KERN_ERR "MCE: get local buffer entry failed\n ");
+        return MCER_CONTINUE;
+    }
+
+    memset(&binfo, 0, sizeof(binfo));
+    binfo.mig = (struct mcinfo_global *)mic;
+    binfo.mi = local_mi;
+
+    /* Processing bank information */
+    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
+
+    for ( ; ret != MCER_RESET && mic && mic->size;
+          mic = x86_mcinfo_next(mic) )
+    {
+        if (mic->type != MC_TYPE_BANK) {
+            continue;
+        }
+        binfo.mib = (struct mcinfo_bank*)mic;
+        binfo.bank = binfo.mib->mc_bank;
+        memset(&mca_res, 0x0f, sizeof(mca_res));
+        for ( i = 0; i < handler_num; i++ ) {
+            if (handlers[i].owned_error(binfo.mib->mc_status))
+            {
+                handlers[i].recovery_handler(binfo.bank, &binfo, &mca_res);
+
+                if (mca_res.result & MCA_OWNER)
+                    binfo.mib->mc_domid = mca_res.owner;
+
+                if (mca_res.result == MCA_NEED_RESET)
+                    ret = MCER_RESET;
+                else if (mca_res.result == MCA_RECOVERED)
+                {
+                    if (ret < MCER_RECOVERED)
+                        ret = MCER_RECOVERED;
+                }
+                else if (mca_res.result == MCA_NO_ACTION)
+                {
+                    if (ret < MCER_CONTINUE)
+                        ret = MCER_CONTINUE;
+                }
+                break;
+            }
+        }
+        ASSERT(i != handler_num);
+    }
+
+    return ret;
+}
 
 /*
  * Called from mctelem_process_deferred. Return 1 if the telemetry
  * should be committed for dom0 consumption, 0 if it should be
  * dismissed.
  */
-static int mce_action(mctelem_cookie_t mctc)
-{
-    struct mc_info *local_mi;
-    uint32_t i;
-    struct mcinfo_common *mic = NULL;
-    struct mcinfo_global *mc_global;
-    struct mcinfo_bank *mc_bank;
-    struct mca_handle_result mca_res;
-
-    local_mi = (struct mc_info*)mctelem_dataptr(mctc);
-    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
-    if (mic == NULL) {
-        printk(KERN_ERR "MCE: get local buffer entry failed\n ");
-        return 0;
-    }
-
-    mc_global = (struct mcinfo_global *)mic;
-
-    /* Processing bank information */
-    x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
-
-    for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
-        if (mic->type != MC_TYPE_BANK) {
-            continue;
-        }
-        mc_bank = (struct mcinfo_bank*)mic;
-
-        /* TODO: Add recovery actions here, such as page-offline, etc */
-        memset(&mca_res, 0x0f, sizeof(mca_res));
-        for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
-            if ( ((mc_bank->mc_status & 0xffff) ==
-                        intel_recovery_handler[i].mca_code) ||
-                  ((mc_bank->mc_status & 0xfff0) ==
-                        intel_recovery_handler[i].mca_code)) {
-                /* For SRAR, OVER = 1 should have caused reset
-                 * For SRAO, OVER = 1 skip recovery action, continue execution
-                 */
-                if (!(mc_bank->mc_status & MCi_STATUS_OVER))
-                    intel_recovery_handler[i].recovery_handler
-                                (mc_bank, mc_global, NULL, &mca_res);
-                else {
-                   if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
-                       mca_res.result = MCA_NEED_RESET;
-                   else
-                       mca_res.result = MCA_NO_ACTION;
-                }
-                if (mca_res.result & MCA_OWNER)
-                    mc_bank->mc_domid = mca_res.owner;
-                if (mca_res.result == MCA_NEED_RESET)
-                    /* DOMID_XEN*/
-                    mc_panic("MCE: Software recovery failed for the UCR "
-                                "error\n");
-                else if (mca_res.result == MCA_RECOVERED)
-                    mce_printk(MCE_VERBOSE, "MCE: The UCR error is"
-                                "successfully recovered by software!\n");
-                else if (mca_res.result == MCA_NO_ACTION)
-                    mce_printk(MCE_VERBOSE, "MCE: Overwrite SRAO error can't"
-                                "do recover action, RIPV=1, let it be.\n");
-                break;
-            }
-        }
-        /* For SRAR, no defined recovery action should have caused reset
-         * in MCA Handler
-         */
-        if ( i >= INTEL_MAX_RECOVERY )
-            mce_printk(MCE_VERBOSE, "MCE: No software recovery action"
-                            " found for this SRAO error\n");
-
-    }
-    return 1;
+static int mce_delayed_action(mctelem_cookie_t mctc)
+{
+    enum mce_result result;
+    int ret = 0;
+
+    result = mce_action(NULL, mctc);
+
+    switch (result)
+    {
+    case MCER_RESET:
+        panic("MCE: Software recovery failed for the UCR\n");
+        break;
+    case MCER_RECOVERED:
+        dprintk(XENLOG_INFO, "MCE: Error is successfully recovered\n");
+        ret  = 1;
+        break;
+    case MCER_CONTINUE:
+        dprintk(XENLOG_INFO, "MCE: Error can't be recovered, "
+            "system is tainted\n");
+        ret = 1;
+        break;
+    default:
+        ret = 0;
+        break;
+    }
+    return ret;
 }
 
 /* Softirq Handler for this MCE# processing */
@@ -274,7 +309,7 @@ static void mce_softirq(void)
          * vMCE MSRs virtualization buffer
          */
         for_each_online_cpu(workcpu) {
-            mctelem_process_deferred(workcpu, mce_action);
+            mctelem_process_deferred(workcpu, mce_delayed_action);
         }
 
         /* Step2: Send Log to DOM0 through vIRQ */
@@ -466,11 +501,18 @@ intel_get_extended_msrs(struct mcinfo_gl
     return mc_ext;
 }
 
-static void intel_UCR_handler(struct mcinfo_bank *bank,
-             struct mcinfo_global *global,
-             struct mcinfo_extended *extension,
+#define INTEL_MAX_RECOVERY 2
+static int is_async_memerr(uint64_t status)
+{
+    return (status & 0xFFFF) == 0x17A || (status & 0xFFF0) == 0xC0;
+}
+
+static void intel_memerr_dhandler(int bnum,
+             struct mca_binfo *binfo,
              struct mca_handle_result *result)
 {
+    struct mcinfo_bank *bank = binfo->mib;
+    struct mcinfo_global *global = binfo->mig;
     struct domain *d;
     unsigned long mfn, gfn;
     uint32_t status;
@@ -545,6 +587,9 @@ static void intel_UCR_handler(struct mci
     }
 }
 
+struct mca_error_handler intel_mce_dhandlers[] =
+            {{is_async_memerr, intel_memerr_dhandler}};
+
 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
 {
     uint64_t gstatus;
@@ -1007,6 +1052,9 @@ static void intel_init_mce(void)
     x86_mce_vector_register(intel_machine_check);
     mce_recoverable_register(intel_recoverable_scan);
     mce_need_clearbank_register(intel_need_clearbank_scan);
+
+    mce_dhandlers = intel_mce_dhandlers;
+    mce_dhandler_num = sizeof(intel_mce_dhandlers)/sizeof(struct 
mca_error_handler);
 }
 
 static int intel_init_mca_banks(void)
diff -r fb1adcc25acb xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Wed Jun 09 09:01:49 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Wed Jun 09 14:39:26 2010 +0800
@@ -151,6 +151,15 @@ struct mca_handle_result
     struct recovery_action *action;
 };
 
+/*Keep bank so that we can get staus even if mib is NULL */
+struct mca_binfo {
+    int bank;
+    struct mcinfo_global *mig;
+    struct mcinfo_bank *mib;
+    struct mc_info *mi;
+    struct cpu_user_regs *regs;
+};
+
 extern void (*mca_prehandler)( struct cpu_user_regs *regs,
                         struct mca_handle_result *result);
 
@@ -161,10 +170,8 @@ struct mca_error_handler
      * a seperate function to decode the corresponding actions
      * for the particular mca error later.
     */
-    uint16_t mca_code;
-    void (*recovery_handler)( struct mcinfo_bank *bank,
-                    struct mcinfo_global *global,
-                    struct mcinfo_extended *extension,
+    int (*owned_error)(uint64_t status);
+    void (*recovery_handler)(int bank, struct mca_binfo *binfo,
                     struct mca_handle_result *result);
 };
 


Attachment: mce_action.patch
Description: mce_action.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-devel] [PATCH 3/6 ] Make mce_action action be usable for both delayed handler and urgent handler, Jiang, Yunhong <=