WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-devel

[Xen-devel] [PATCH 4/6 ] Clean Intel's MCE handler code

To: Keir Fraser <keir.fraser@xxxxxxxxxxxxx>
Subject: [Xen-devel] [PATCH 4/6 ] Clean Intel's MCE handler code
From: "Jiang, Yunhong" <yunhong.jiang@xxxxxxxxx>
Date: Wed, 9 Jun 2010 22:32:14 +0800
Accept-language: en-US
Acceptlanguage: en-US
Cc: xen-devel <xen-devel@xxxxxxxxxxxxxxxxxxx>
Delivery-date: Wed, 09 Jun 2010 07:37:41 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-devel-request@lists.xensource.com?subject=help>
List-id: Xen developer discussion <xen-devel.lists.xensource.com>
List-post: <mailto:xen-devel@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-devel>, <mailto:xen-devel-request@lists.xensource.com?subject=unsubscribe>
Sender: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
Thread-index: AcsH4I3yfp+a7UXXQDysmsu9STjfQA==
Thread-topic: [PATCH 4/6 ] Clean Intel's MCE handler code
Clean Intel's MCE handler code

Add intel_mce_type check according to Intel's SDM.

Reduce intel_memerr_dhandler()'s indentation to make the code easier to read. Also
add a page_off action when we offline the page, so that dom0 can know about the
action taken by the Xen hypervisor.

Add a default delayed MCE handler, which will crash the system on an unknown SRAR
error or a fatal error; otherwise, the system continues.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>

mce_intel.c |  239 ++++++++++++++++++++++++++++++++++++++++++------------------
 x86_mca.h   |    3
 2 files changed, 174 insertions(+), 68 deletions(-)

diff -r 276e4646f9d8 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Jun 08 17:30:33 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Jun 08 17:53:04 2010 +0800
@@ -463,10 +463,76 @@ intel_get_extended_msrs(struct mcinfo_gl
     return mc_ext;
 }
 
-#define INTEL_MAX_RECOVERY 2
+enum intel_mce_type
+{
+    intel_mce_invalid,
+    intel_mce_fatal,
+    intel_mce_corrected,
+    intel_mce_ucr_ucna,
+    intel_mce_ucr_srao,
+    intel_mce_ucr_srar,
+};
+
+static enum intel_mce_type intel_check_mce_type(uint64_t status)
+{
+    if (!(status & MCi_STATUS_VAL))
+        return intel_mce_invalid;
+
+    if (status & MCi_STATUS_PCC)
+        return intel_mce_fatal;
+
+    /* Corrected error? */
+    if (!(status & MCi_STATUS_UC))
+        return intel_mce_corrected;
+
+    if (!ser_support)
+        return intel_mce_fatal;
+
+    if (status & MCi_STATUS_S)
+    {
+        if (status & MCi_STATUS_AR)
+        {
+            if (status & MCi_STATUS_OVER)
+                return intel_mce_fatal;
+            else
+                return intel_mce_ucr_srar;
+        } else
+            return intel_mce_ucr_srao;
+    }
+    else
+        return intel_mce_ucr_ucna;
+
+    /* Any type not included abovoe ? */
+    return intel_mce_fatal;
+}
+
 static int is_async_memerr(uint64_t status)
 {
     return (status & 0xFFFF) == 0x17A || (status & 0xFFF0) == 0xC0;
+}
+
+struct mcinfo_recovery *mci_add_pageoff_action(int bank, struct mc_info *mi,
+                              uint64_t mfn, uint32_t status)
+{
+    struct mcinfo_recovery *rec;
+
+    if (!mi)
+        return NULL;
+
+    rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
+    if (!rec)
+    {
+        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
+        return NULL;
+    }
+
+    memset(rec, 0, sizeof(struct mcinfo_recovery));
+
+    rec->mc_bank = bank;
+    rec->action_types = MC_ACTION_PAGE_OFFLINE;
+    rec->action_info.page_retire.mfn = mfn;
+    rec->action_info.page_retire.status = status;
+    return rec;
 }
 
 static void intel_memerr_dhandler(int bnum,
@@ -478,79 +544,116 @@ static void intel_memerr_dhandler(int bn
     struct domain *d;
     unsigned long mfn, gfn;
     uint32_t status;
+    uint64_t mc_status, mc_misc;
 
     mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
     result->result = MCA_NEED_RESET;
-    if (bank->mc_addr != 0) {
-         mfn = bank->mc_addr >> PAGE_SHIFT;
-         if (!offline_page(mfn, 1, &status)) {
-              /* This is free page */
-              if (status & PG_OFFLINE_OFFLINED)
-                  result->result = MCA_RECOVERED;
-              else if (status & PG_OFFLINE_PENDING) {
-                 /* This page has owner */
-                  if (status & PG_OFFLINE_OWNED) {
-                      result->result |= MCA_OWNER;
-                      result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
-                      mce_printk(MCE_QUIET, "MCE: This error page is ownded"
-                                  " by DOM %d\n", result->owner);
-                      /* Fill vMCE# injection and vMCE# MSR virtualization "
-                       * "related data */
-                      bank->mc_domid = result->owner;
-                      /* XXX: Cannot handle shared pages yet 
-                       * (this should identify all domains and gfn mapping to
-                       *  the mfn in question) */
-                      BUG_ON( result->owner == DOMID_COW );
-                      if ( result->owner != DOMID_XEN ) {
-
-                          d = get_domain_by_id(result->owner);
-                          if ( mca_ctl_conflict(bank, d) )
-                          {
-                              /* Guest has different MCE ctl with hypervisor */
-                              if ( d )
-                                  put_domain(d);
-                              return;
-                          }
-
-                          ASSERT(d);
-                          gfn =
-                              get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
-                          bank->mc_addr =  gfn << PAGE_SHIFT |
-                                        (bank->mc_addr & (PAGE_SIZE -1 ));
-                          if ( fill_vmsr_data(bank, d,
-                                              global->mc_gstatus) == -1 )
-                          {
-                              mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d 
"
-                                      "failed\n", result->owner);
-                              put_domain(d);
-                              domain_crash(d);
-                              return;
-                          }
-                          /* We will inject vMCE to DOMU*/
-                          if ( inject_vmce(d) < 0 )
-                          {
-                              mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
-                                          " failed\n", d->domain_id);
-                              put_domain(d);
-                              domain_crash(d);
-                              return;
-                          }
-                          /* Impacted domain go on with domain's recovery job
-                           * if the domain has its own MCA handler.
-                           * For xen, it has contained the error and finished
-                           * its own recovery job.
-                           */
-                          result->result = MCA_RECOVERED;
-                          put_domain(d);
-                      }
-                  }
-              }
-         }
-    }
+
+    mc_status = bank->mc_status;
+    mc_misc = bank->mc_misc;
+    if (!(mc_status &  MCi_STATUS_ADDRV) ||
+        !(mc_status & MCi_STATUS_MISCV) ||
+        ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
+    {
+        result->result |= MCA_NO_ACTION;
+        dprintk(XENLOG_WARNING,
+            "No physical address provided for memory error\n");
+        return;
+    }
+
+    mfn = bank->mc_addr >> PAGE_SHIFT;
+    if (offline_page(mfn, 1, &status))
+    {
+        dprintk(XENLOG_WARNING,
+                "Failed to offline page %lx for MCE error\n", mfn);
+        return;
+    }
+
+    mci_add_pageoff_action(bnum, binfo->mi, mfn, status);
+
+    /* This is free page */
+    if (status & PG_OFFLINE_OFFLINED)
+        result->result = MCA_RECOVERED;
+    else if (status & PG_OFFLINE_PENDING) {
+        /* This page has owner */
+        if (status & PG_OFFLINE_OWNED) {
+            result->result |= MCA_OWNER;
+            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
+            mce_printk(MCE_QUIET, "MCE: This error page is ownded"
+              " by DOM %d\n", result->owner);
+            /* Fill vMCE# injection and vMCE# MSR virtualization "
+             * "related data */
+            bank->mc_domid = result->owner;
+            /* XXX: Cannot handle shared pages yet 
+             * (this should identify all domains and gfn mapping to
+             *  the mfn in question) */
+            BUG_ON( result->owner == DOMID_COW );
+            if ( result->owner != DOMID_XEN ) {
+                d = get_domain_by_id(result->owner);
+                if ( mca_ctl_conflict(bank, d) )
+                {
+                    /* Guest has different MCE ctl with hypervisor */
+                    if ( d )
+                        put_domain(d);
+                    return;
+                }
+
+                ASSERT(d);
+                gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
+                bank->mc_addr =  gfn << PAGE_SHIFT |
+                  (bank->mc_addr & (PAGE_SIZE -1 ));
+                if ( fill_vmsr_data(bank, d,
+                      global->mc_gstatus) == -1 )
+                {
+                    mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
+                      "failed\n", result->owner);
+                    put_domain(d);
+                    domain_crash(d);
+                    return;
+                }
+                /* We will inject vMCE to DOMU*/
+                if ( inject_vmce(d) < 0 )
+                {
+                    mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
+                      " failed\n", d->domain_id);
+                    put_domain(d);
+                    domain_crash(d);
+                    return;
+                }
+                /* Impacted domain go on with domain's recovery job
+                 * if the domain has its own MCA handler.
+                 * For xen, it has contained the error and finished
+                 * its own recovery job.
+                 */
+                result->result = MCA_RECOVERED;
+                put_domain(d);
+            }
+        }
+    }
+}
+
+static int default_check(uint64_t status)
+{
+    return 1;
+}
+
+static void intel_default_dhandler(int bnum,
+             struct mca_binfo *binfo,
+             struct mca_handle_result *result)
+{
+    uint64_t status = binfo->mib->mc_status;
+    enum intel_mce_type type;
+
+    type = intel_check_mce_type(status);
+
+    if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
+        result->result = MCA_RESET;
+    else if (type == intel_mce_ucr_srao)
+        result->result = MCA_NO_ACTION;
 }
 
 struct mca_error_handler intel_mce_dhandlers[] =
-            {{is_async_memerr, intel_memerr_dhandler}};
+            {{is_async_memerr, intel_memerr_dhandler}, {default_check, 
intel_default_dhandler}};
 
 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
 {
diff -r 276e4646f9d8 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Jun 08 17:30:33 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Jun 08 17:49:17 2010 +0800
@@ -87,6 +87,9 @@
 
 /*Intel Specific bitfield*/
 #define CMCI_THRESHOLD                 0x2
+
+#define MCi_MISC_ADDRMOD_MASK (0x7UL << 6)
+#define MCi_MISC_PHYSMOD    (0x2UL << 6)
 
 #include <asm/domain.h>
 


Attachment: intel_mce_clean.patch
Description: intel_mce_clean.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-devel] [PATCH 4/6 ] Clean Intel's MCE handler code, Jiang, Yunhong <=