[Xen-devel] [PATCH 4/6 ] Clean Intel's MCE handler code

Clean Intel's MCE handler code

Add intel_mce_type check according to Intel's SDM.

Reduce intel_memerr_dhandler()'s indent to make code easily read. And add a 
page_off action when we offline the page, so that dom0 can knows about the 
action taken by xen hypervisor.

Add a default delay mce handler, which will crash if unknow SRAR error or fatal 
error, otherwise, system continue.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>

mce_intel.c |  239 ++++++++++++++++++++++++++++++++++++++++++------------------
 x86_mca.h   |    3
 2 files changed, 174 insertions(+), 68 deletions(-)

diff -r 276e4646f9d8 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Jun 08 17:30:33 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c       Tue Jun 08 17:53:04 2010 +0800
@@ -463,10 +463,76 @@ intel_get_extended_msrs(struct mcinfo_gl
     return mc_ext;
 }
 
-#define INTEL_MAX_RECOVERY 2
+enum intel_mce_type
+{
+    intel_mce_invalid,
+    intel_mce_fatal,
+    intel_mce_corrected,
+    intel_mce_ucr_ucna,
+    intel_mce_ucr_srao,
+    intel_mce_ucr_srar,
+};
+
+static enum intel_mce_type intel_check_mce_type(uint64_t status)
+{
+    if (!(status & MCi_STATUS_VAL))
+        return intel_mce_invalid;
+
+    if (status & MCi_STATUS_PCC)
+        return intel_mce_fatal;
+
+    /* Corrected error? */
+    if (!(status & MCi_STATUS_UC))
+        return intel_mce_corrected;
+
+    if (!ser_support)
+        return intel_mce_fatal;
+
+    if (status & MCi_STATUS_S)
+    {
+        if (status & MCi_STATUS_AR)
+        {
+            if (status & MCi_STATUS_OVER)
+                return intel_mce_fatal;
+            else
+                return intel_mce_ucr_srar;
+        } else
+            return intel_mce_ucr_srao;
+    }
+    else
+        return intel_mce_ucr_ucna;
+
+    /* Any type not included abovoe ? */
+    return intel_mce_fatal;
+}
+
 static int is_async_memerr(uint64_t status)
 {
     return (status & 0xFFFF) == 0x17A || (status & 0xFFF0) == 0xC0;
+}
+
+struct mcinfo_recovery *mci_add_pageoff_action(int bank, struct mc_info *mi,
+                              uint64_t mfn, uint32_t status)
+{
+    struct mcinfo_recovery *rec;
+
+    if (!mi)
+        return NULL;
+
+    rec = x86_mcinfo_reserve(mi, sizeof(struct mcinfo_recovery));
+    if (!rec)
+    {
+        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
+        return NULL;
+    }
+
+    memset(rec, 0, sizeof(struct mcinfo_recovery));
+
+    rec->mc_bank = bank;
+    rec->action_types = MC_ACTION_PAGE_OFFLINE;
+    rec->action_info.page_retire.mfn = mfn;
+    rec->action_info.page_retire.status = status;
+    return rec;
 }
 
 static void intel_memerr_dhandler(int bnum,
@@ -478,79 +544,116 @@ static void intel_memerr_dhandler(int bn
     struct domain *d;
     unsigned long mfn, gfn;
     uint32_t status;
+    uint64_t mc_status, mc_misc;
 
     mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
     result->result = MCA_NEED_RESET;
-    if (bank->mc_addr != 0) {
-         mfn = bank->mc_addr >> PAGE_SHIFT;
-         if (!offline_page(mfn, 1, &status)) {
-              /* This is free page */
-              if (status & PG_OFFLINE_OFFLINED)
-                  result->result = MCA_RECOVERED;
-              else if (status & PG_OFFLINE_PENDING) {
-                 /* This page has owner */
-                  if (status & PG_OFFLINE_OWNED) {
-                      result->result |= MCA_OWNER;
-                      result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
-                      mce_printk(MCE_QUIET, "MCE: This error page is ownded"
-                                  " by DOM %d\n", result->owner);
-                      /* Fill vMCE# injection and vMCE# MSR virtualization "
-                       * "related data */
-                      bank->mc_domid = result->owner;
-                      /* XXX: Cannot handle shared pages yet 
-                       * (this should identify all domains and gfn mapping to
-                       *  the mfn in question) */
-                      BUG_ON( result->owner == DOMID_COW );
-                      if ( result->owner != DOMID_XEN ) {
-
-                          d = get_domain_by_id(result->owner);
-                          if ( mca_ctl_conflict(bank, d) )
-                          {
-                              /* Guest has different MCE ctl with hypervisor */
-                              if ( d )
-                                  put_domain(d);
-                              return;
-                          }
-
-                          ASSERT(d);
-                          gfn =
-                              get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
-                          bank->mc_addr =  gfn << PAGE_SHIFT |
-                                        (bank->mc_addr & (PAGE_SIZE -1 ));
-                          if ( fill_vmsr_data(bank, d,
-                                              global->mc_gstatus) == -1 )
-                          {
-                              mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d 
"
-                                      "failed\n", result->owner);
-                              put_domain(d);
-                              domain_crash(d);
-                              return;
-                          }
-                          /* We will inject vMCE to DOMU*/
-                          if ( inject_vmce(d) < 0 )
-                          {
-                              mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
-                                          " failed\n", d->domain_id);
-                              put_domain(d);
-                              domain_crash(d);
-                              return;
-                          }
-                          /* Impacted domain go on with domain's recovery job
-                           * if the domain has its own MCA handler.
-                           * For xen, it has contained the error and finished
-                           * its own recovery job.
-                           */
-                          result->result = MCA_RECOVERED;
-                          put_domain(d);
-                      }
-                  }
-              }
-         }
-    }
+
+    mc_status = bank->mc_status;
+    mc_misc = bank->mc_misc;
+    if (!(mc_status &  MCi_STATUS_ADDRV) ||
+        !(mc_status & MCi_STATUS_MISCV) ||
+        ((mc_misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
+    {
+        result->result |= MCA_NO_ACTION;
+        dprintk(XENLOG_WARNING,
+            "No physical address provided for memory error\n");
+        return;
+    }
+
+    mfn = bank->mc_addr >> PAGE_SHIFT;
+    if (offline_page(mfn, 1, &status))
+    {
+        dprintk(XENLOG_WARNING,
+                "Failed to offline page %lx for MCE error\n", mfn);
+        return;
+    }
+
+    mci_add_pageoff_action(bnum, binfo->mi, mfn, status);
+
+    /* This is free page */
+    if (status & PG_OFFLINE_OFFLINED)
+        result->result = MCA_RECOVERED;
+    else if (status & PG_OFFLINE_PENDING) {
+        /* This page has owner */
+        if (status & PG_OFFLINE_OWNED) {
+            result->result |= MCA_OWNER;
+            result->owner = status >> PG_OFFLINE_OWNER_SHIFT;
+            mce_printk(MCE_QUIET, "MCE: This error page is ownded"
+              " by DOM %d\n", result->owner);
+            /* Fill vMCE# injection and vMCE# MSR virtualization "
+             * "related data */
+            bank->mc_domid = result->owner;
+            /* XXX: Cannot handle shared pages yet 
+             * (this should identify all domains and gfn mapping to
+             *  the mfn in question) */
+            BUG_ON( result->owner == DOMID_COW );
+            if ( result->owner != DOMID_XEN ) {
+                d = get_domain_by_id(result->owner);
+                if ( mca_ctl_conflict(bank, d) )
+                {
+                    /* Guest has different MCE ctl with hypervisor */
+                    if ( d )
+                        put_domain(d);
+                    return;
+                }
+
+                ASSERT(d);
+                gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
+                bank->mc_addr =  gfn << PAGE_SHIFT |
+                  (bank->mc_addr & (PAGE_SIZE -1 ));
+                if ( fill_vmsr_data(bank, d,
+                      global->mc_gstatus) == -1 )
+                {
+                    mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
+                      "failed\n", result->owner);
+                    put_domain(d);
+                    domain_crash(d);
+                    return;
+                }
+                /* We will inject vMCE to DOMU*/
+                if ( inject_vmce(d) < 0 )
+                {
+                    mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
+                      " failed\n", d->domain_id);
+                    put_domain(d);
+                    domain_crash(d);
+                    return;
+                }
+                /* Impacted domain go on with domain's recovery job
+                 * if the domain has its own MCA handler.
+                 * For xen, it has contained the error and finished
+                 * its own recovery job.
+                 */
+                result->result = MCA_RECOVERED;
+                put_domain(d);
+            }
+        }
+    }
+}
+
+static int default_check(uint64_t status)
+{
+    return 1;
+}
+
+static void intel_default_dhandler(int bnum,
+             struct mca_binfo *binfo,
+             struct mca_handle_result *result)
+{
+    uint64_t status = binfo->mib->mc_status;
+    enum intel_mce_type type;
+
+    type = intel_check_mce_type(status);
+
+    if (type == intel_mce_fatal || type == intel_mce_ucr_srar)
+        result->result = MCA_RESET;
+    else if (type == intel_mce_ucr_srao)
+        result->result = MCA_NO_ACTION;
 }
 
 struct mca_error_handler intel_mce_dhandlers[] =
-            {{is_async_memerr, intel_memerr_dhandler}};
+            {{is_async_memerr, intel_memerr_dhandler}, {default_check, 
intel_default_dhandler}};
 
 static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
 {
diff -r 276e4646f9d8 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Jun 08 17:30:33 2010 +0800
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Jun 08 17:49:17 2010 +0800
@@ -87,6 +87,9 @@
 
 /*Intel Specific bitfield*/
 #define CMCI_THRESHOLD                 0x2
+
+#define MCi_MISC_ADDRMOD_MASK (0x7UL << 6)
+#define MCi_MISC_PHYSMOD    (0x2UL << 6)
 
 #include <asm/domain.h>

intel_mce_clean.patch
Description: intel_mce_clean.patch

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] [PATCH 4/6 ] Clean Intel's MCE handler code