WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH] X86 MCE: Add SRAR handler

To: Keir Fraser <keir.xen@xxxxxxxxx>, "xen-devel@xxxxxxxxxxxxxxxxxxx" <xen-devel@xxxxxxxxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH] X86 MCE: Add SRAR handler
From: "Liu, Jinsong" <jinsong.liu@xxxxxxxxx>
Date: Thu, 29 Sep 2011 23:20:59 +0800
Accept-language: en-US
Acceptlanguage: en-US
Cc: "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx>
Delivery-date: Thu, 29 Sep 2011 08:21:49 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: Acx+u2Q1yNlzPhj/SO+HbpwXHIBlVQ==
Thread-topic: [PATCH] X86 MCE: Add SRAR handler
X86 MCE: Add SRAR handler

Currently the Intel SDM defines 2 kinds of MCE SRAR errors:
1). Data Load error, error code = 0x134
2). Instruction Fetch error, error code = 0x150
This patch adds a handler for these new SRAR errors.
It is based on the existing MCE infrastructure and adds code to handle SRAR-specific errors.

Signed-off-by: Liu, Jinsong <jinsong.liu@xxxxxxxxx>

diff -r 8d6edc3d26d2 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Sat Aug 13 10:14:58 2011 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Sat Aug 20 23:53:02 2011 +0800
@@ -37,6 +37,14 @@ static int __read_mostly nr_intel_ext_ms
  */
 #define INTEL_SRAO_MEM_SCRUB 0xC0 ... 0xCF
 #define INTEL_SRAO_L3_EWB    0x17A
+
+/* 
+ * Currently the Intel SDM defines 2 kinds of SRAR errors:
+ * 1). Data Load error, error code = 0x134
+ * 2). Instruction Fetch error, error code = 0x150
+ */
+#define INTEL_SRAR_DATA_LOAD   0x134
+#define INTEL_SRAR_INSTR_FETCH 0x150
 
 /* Thermal Hanlding */
 #ifdef CONFIG_X86_MCE_THERMAL
@@ -255,7 +263,7 @@ static enum mce_result mce_action(struct
         for ( i = 0; i < handler_num; i++ ) {
             if (handlers[i].owned_error(binfo.mib->mc_status))
             {
-                handlers[i].recovery_handler(&binfo, &bank_result);
+                handlers[i].recovery_handler(&binfo, &bank_result, regs);
                 if (worst_result < bank_result)
                     worst_result = bank_result;
                 break;
@@ -621,7 +629,8 @@ struct mcinfo_recovery *mci_add_pageoff_
 
 static void intel_memerr_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     struct mcinfo_bank *bank = binfo->mib;
     struct mcinfo_global *global = binfo->mig;
@@ -718,6 +727,32 @@ vmce_failed:
     }
 }
 
+static int intel_srar_check(uint64_t status)
+{
+    return ( intel_check_mce_type(status) == intel_mce_ucr_srar );
+}
+
+static void intel_srar_dhandler(
+             struct mca_binfo *binfo,
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
+{
+    uint64_t status = binfo->mib->mc_status;
+
+    /* For unknown srar error code, reset system */
+    *result = MCER_RESET;
+
+    switch ( status & INTEL_MCCOD_MASK )
+    {
+    case INTEL_SRAR_DATA_LOAD:
+    case INTEL_SRAR_INSTR_FETCH:
+        intel_memerr_dhandler(binfo, result, regs);
+        break;
+    default:
+        break;
+    }
+}
+
 static int intel_srao_check(uint64_t status)
 {
     return ( intel_check_mce_type(status) == intel_mce_ucr_srao );
@@ -725,7 +760,8 @@ static int intel_srao_check(uint64_t sta
 
 static void intel_srao_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     uint64_t status = binfo->mib->mc_status;
 
@@ -738,7 +774,7 @@ static void intel_srao_dhandler(
         {
         case INTEL_SRAO_MEM_SCRUB:
         case INTEL_SRAO_L3_EWB:
-            intel_memerr_dhandler(binfo, result);
+            intel_memerr_dhandler(binfo, result, regs);
             break;
         default:
             break;
@@ -753,14 +789,15 @@ static int intel_default_check(uint64_t 
 
 static void intel_default_mce_dhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs * regs)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
 
     type = intel_check_mce_type(status);
 
-    if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
+    if (type == intel_mce_fatal)
         *result = MCER_RESET;
     else
         *result = MCER_CONTINUE;
@@ -768,12 +805,14 @@ static void intel_default_mce_dhandler(
 
 static const struct mca_error_handler intel_mce_dhandlers[] = {
     {intel_srao_check, intel_srao_dhandler},
+    {intel_srar_check, intel_srar_dhandler},
     {intel_default_check, intel_default_mce_dhandler}
 };
 
 static void intel_default_mce_uhandler(
              struct mca_binfo *binfo,
-             enum mce_result *result)
+             enum mce_result *result,
+             struct cpu_user_regs *regs)
 {
     uint64_t status = binfo->mib->mc_status;
     enum intel_mce_type type;
@@ -782,8 +821,12 @@ static void intel_default_mce_uhandler(
 
     switch (type)
     {
-    /* Panic if no handler for SRAR error */
     case intel_mce_ucr_srar:
+        if ( !guest_mode(regs) )
+            *result = MCER_RESET;
+        else
+            *result = MCER_CONTINUE;
+        break;
     case intel_mce_fatal:
         *result = MCER_RESET;
         break;
@@ -958,10 +1001,8 @@ static int intel_recoverable_scan(u64 st
     /* SRAR error */
     else if ( ser_support && !(status & MCi_STATUS_OVER) 
                 && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
-                && (status & MCi_STATUS_AR) ) {
-        mce_printk(MCE_VERBOSE, "MCE: No SRAR error defined currently.\n");
-        return 0;
-    }
+                && (status & MCi_STATUS_AR) && (status & MCi_STATUS_EN) )
+        return 1;
     /* SRAO error */
     else if (ser_support && !(status & MCi_STATUS_PCC)
                 && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
diff -r 8d6edc3d26d2 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Sat Aug 13 10:14:58 2011 +0100
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Sat Aug 20 23:53:02 2011 +0800
@@ -151,7 +151,7 @@ struct mca_error_handler
     */
     int (*owned_error)(uint64_t status);
     void (*recovery_handler)(struct mca_binfo *binfo,
-                    enum mce_result *result);
+                    enum mce_result *result, struct cpu_user_regs *regs);
 };
 
 /* Global variables */

Attachment: srar-1.patch
Description: srar-1.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel