WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-changelog

[Xen-changelog] [xen-unstable] x86 mce: Clean Intel's MCE handler code

To: xen-changelog@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-changelog] [xen-unstable] x86 mce: Clean Intel's MCE handler code
From: Xen patchbot-unstable <patchbot-unstable@xxxxxxxxxxxxxxxxxxx>
Date: Thu, 10 Jun 2010 02:25:17 -0700
Delivery-date: Thu, 10 Jun 2010 02:26:23 -0700
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-changelog-request@lists.xensource.com?subject=help>
List-id: BK change log <xen-changelog.lists.xensource.com>
List-post: <mailto:xen-changelog@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/mailman/listinfo/xen-changelog>, <mailto:xen-changelog-request@lists.xensource.com?subject=unsubscribe>
Reply-to: xen-devel@xxxxxxxxxxxxxxxxxxx
Sender: xen-changelog-bounces@xxxxxxxxxxxxxxxxxxx
# HG changeset patch
# User Keir Fraser <keir.fraser@xxxxxxxxxx>
# Date 1276154326 -3600
# Node ID 704bcd622dc2ee4acb799d1bbd08ca1b28af0552
# Parent  2d2812de6792e51c722e51baf6b16e4b776f41b3
x86 mce: Clean Intel's MCE handler code

Add intel_mce_type check according to Intel's SDM.

Reduce the indentation of intel_memerr_dhandler() to make the code easier
to read. Also add a page_off action when we offline the page, so that dom0
knows about the action taken by the Xen hypervisor.

Add a default delayed MCE handler, which will crash the system on an
unknown SRAR error or a fatal error; otherwise the system continues.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>
---
 xen/arch/x86/cpu/mcheck/mce_intel.c |  239 +++++++++++++++++++++++++-----------
 xen/arch/x86/cpu/mcheck/x86_mca.h   |    3 
 2 files changed, 174 insertions(+), 68 deletions(-)

diff -r 2d2812de6792 -r 704bcd622dc2 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Thu Jun 10 08:18:11 2010 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Thu Jun 10 08:18:46 2010 +0100
@@ -501,10 +501,76 @@ intel_get_extended_msrs(struct mcinfo_gl
     return mc_ext;
 }
 
-#define INTEL_MAX_RECOVERY 2
+enum intel_mce_type
+{
+    intel_mce_invalid,
+    intel_mce_fatal,
+    intel_mce_corrected,
+    intel_mce_ucr_ucna,
+    intel_mce_ucr_srao,
+    intel_mce_ucr_srar,
+};
+
+static enum intel_mce_type intel_check_mce_type(uint64_t status)
+{
+    if (!(status & MCi_STATUS_VAL))
+        return intel_mce_invalid;
+
+    if (status & MCi_STATUS_PCC)
+        return intel_mce_fatal;
+
+    /* Corrected error? */
+    if (!(status & MCi_STATUS_UC))
+        return intel_mce_corrected;
+
+    if (!ser_support)
+        return intel_mce_fatal;
+
+    if (status & MCi_STATUS_S)
+    {
+        if (status & MCi_STATUS_AR)
+        {
+            if (status & MCi_STATUS_OVER)
+                return intel_mce_fatal;
+            else
+                return intel_mce_ucr_srar;
+        } else
+            return intel_mce_ucr_srao;
+    }
+    else
+        return intel_mce_ucr_ucna;
+
+    /* Any type not included abovoe ? */
+    return intel_mce_fatal;
+}
+
 static int is_async_memerr(uint64_t status)
 {
     return (status & 0xFFFF) == 0x17A || (status & 0xFFF0) == 0xC0;
+}
+
+struct mcinfo_recovery *mci_add_pageoff_action(int bank, struct mc_info *mi,
+                              uint64_t mfn, uint32_t status)
+{
+    struct mcinfo_recovery *rec;
+
+    if (!mi)
+        return NULL;
+
+    rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
+    if (!rec)
+    {
+        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
+        return NULL;
+    }
+
+    memset(rec, 0, sizeof(struct mcinfo_recovery));
+
+    rec->mc_bank = bank;
+    rec->action_types = MC_ACTION_PAGE_OFFLINE;
+    rec->action_info.page_retire.mfn = mfn;
+    rec->action_info.page_retire.status = status;
+    return rec;
 }
 
 static void intel_memerr_dhandler(int bnum,
@@ -516,79 +582,116 @@ static void intel_memerr_dhandler(int bn
     struct domain *d;
     unsigned long mfn, gfn;
     uint32_t status;
+    uint64_t mc_status, mc_misc;
 
     mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
     result->result = MCA_NEED_RESET;
-    if (bank->mc_addr != 0) {
-         mfn = bank->mc_addr >> PAGE_SHIFT;
-         if (!offline_page(mfn, 1, &status)) {
-              /* This is free page */
-              if (status & PG_OFFLINE_OFFLINED)
-                  result->result = MCA_RECOVERED;
-              else if (status & PG_OFFLINE_PENDING) {
-                 /* This page has owner */
-                  if (status & PG_OFFLINE_OWNED) {
-                      result->result |= MCA_OWNER;
-                      result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
-                      mce_printk(MCE_QUIET, "MCE: This error page is ownded"
-                                  " by DOM %d\n", result->owner);
-                      /* Fill vMCE# injection and vMCE# MSR virtualization "
-                       * "related data */
-                      bank->mc_domid = result->owner;
-                      /* XXX: Cannot handle shared pages yet 
-                       * (this should identify all domains and gfn mapping to
-                       *  the mfn in question) */
-                      BUG_ON( result->owner == DOMID_COW );
-                      if ( result->owner != DOMID_XEN ) {
-
-                          d = get_domain_by_id(result->owner);
-                          if ( mca_ctl_conflict(bank, d) )
-                          {
-                              /* Guest has different MCE ctl with hypervisor */
-                              if ( d )
-                                  put_domain(d);
-                              return;
-                          }
-
-                          ASSERT(d);
-                          gfn =
-                              get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
-                          bank->mc_addr =  gfn << PAGE_SHIFT |
-                                        (bank->mc_addr & (PAGE_SIZE -1 ));
-                          if ( fill_vmsr_data(bank, d,
-                                              global->mc_gstatus) == -1 )
-                          {
-                              mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
-                                      "failed\n", result->owner);
-                              put_domain(d);
-                              domain_crash(d);
-                              return;
-                          }
-                          /* We will inject vMCE to DOMU*/
-                          if ( inject_vmce(d) < 0 )
-                          {
-                              mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
-                                          " failed\n", d->domain_id);
-                              put_domain(d);
-                              domain_crash(d);
-                              return;
-                          }
-                          /* Impacted domain go on with domain's recovery job
-                           * if the domain has its own MCA handler.
-                           * For xen, it has contained the error and finished
-                           * its own recovery job.
-                           */
-                          result->result = MCA_RECOVERED;
-                          put_domain(d);
-                      }
-                  }
-              }
-         }
-    }
+
+    mc_status = bank->mc_status;
+    mc_misc = bank->mc_misc;
+    if (!(mc_status &  MCi_STATUS_ADDRV) ||
+        !(mc_status & MCi_STATUS_MISCV) ||
+        ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
+    {
+        result->result |= MCA_NO_ACTION;
+        dprintk(XENLOG_WARNING,
+            "No physical address provided for memory error\n");
+        return;
+    }
+
+    mfn = bank->mc_addr >> PAGE_SHIFT;
+    if (offline_page(mfn, 1, &status))
+    {
+        dprintk(XENLOG_WARNING,
+                "Failed to offline page %lx for MCE error\n", mfn);
+        return;
+    }
+
+    mci_add_pageoff_action(bnum, binfo->mi, mfn, status);
+
+    /* This is free page */
+    if (status & PG_OFFLINE_OFFLINED)
+        result->result = MCA_RECOVERED;
+    else if (status & PG_OFFLINE_PENDING) {
+        /* This page has owner */
+        if (status & PG_OFFLINE_OWNED) {
+            result->result |= MCA_OWNER;
+            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
+            mce_printk(MCE_QUIET, "MCE: This error page is ownded"
+              " by DOM %d\n", result->owner);
+            /* Fill vMCE# injection and vMCE# MSR virtualization "
+             * "related data */
+            bank->mc_domid = result->owner;
+            /* XXX: Cannot handle shared pages yet 
+             * (this should identify all domains and gfn mapping to
+             *  the mfn in question) */
+            BUG_ON( result->owner == DOMID_COW );
+            if ( result->owner != DOMID_XEN ) {
+                d = get_domain_by_id(result->owner);
+                if ( mca_ctl_conflict(bank, d) )
+                {
+                    /* Guest has different MCE ctl with hypervisor */
+                    if ( d )
+                        put_domain(d);
+                    return;
+                }
+
+                ASSERT(d);
+                gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
+                bank->mc_addr =  gfn << PAGE_SHIFT |
+                  (bank->mc_addr & (PAGE_SIZE -1 ));
+                if ( fill_vmsr_data(bank, d,
+                      global->mc_gstatus) == -1 )
+                {
+                    mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
+                      "failed\n", result->owner);
+                    put_domain(d);
+                    domain_crash(d);
+                    return;
+                }
+                /* We will inject vMCE to DOMU*/
+                if ( inject_vmce(d) < 0 )
+                {
+                    mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
+                      " failed\n", d->domain_id);
+                    put_domain(d);
+                    domain_crash(d);
+                    return;
+                }
+                /* Impacted domain go on with domain's recovery job
+                 * if the domain has its own MCA handler.
+                 * For xen, it has contained the error and finished
+                 * its own recovery job.
+                 */
+                result->result = MCA_RECOVERED;
+                put_domain(d);
+            }
+        }
+    }
+}
+
+static int default_check(uint64_t status)
+{
+    return 1;
+}
+
+static void intel_default_dhandler(int bnum,
+             struct mca_binfo *binfo,
+             struct mca_handle_result *result)
+{
+    uint64_t status = binfo->mib->mc_status;
+    enum intel_mce_type type;
+
+    type = intel_check_mce_type(status);
+
+    if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
+        result->result = MCA_RESET;
+    else if (type == intel_mce_ucr_srao)
+        result->result = MCA_NO_ACTION;
 }
 
 struct mca_error_handler intel_mce_dhandlers[] =
-            {{is_async_memerr, intel_memerr_dhandler}};
+            {{is_async_memerr, intel_memerr_dhandler}, {default_check, intel_default_dhandler}};
 
 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
 {
diff -r 2d2812de6792 -r 704bcd622dc2 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Thu Jun 10 08:18:11 2010 +0100
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Thu Jun 10 08:18:46 2010 +0100
@@ -87,6 +87,9 @@
 
 /*Intel Specific bitfield*/
 #define CMCI_THRESHOLD                 0x2
+
+#define MCi_MISC_ADDRMOD_MASK (0x7UL << 6)
+#define MCi_MISC_PHYSMOD    (0x2UL << 6)
 
 #include <asm/domain.h>
 

_______________________________________________
Xen-changelog mailing list
Xen-changelog@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-changelog

<Prev in Thread] Current Thread [Next in Thread>
  • [Xen-changelog] [xen-unstable] x86 mce: Clean Intel's MCE handler code, Xen patchbot-unstable <=