[Xen-devel] [PATCH] MCA injection (debug) code

The attached patch adds the capability for dom0 to inject MCA errors into Xen. The basic idea is to add specific MSR values that will be read by the polling code or the #MC handler. On AMD CPUs, the MSRs can actually be written; on Intel CPUs, interpose code is used to fake the MSR values.

Since actual hardware problems are rare, it's convenient to be able to test the code like this. We run our testsuite using this code. Others have expressed some interest in this, so that's why I'm submitting this patch.


- Frank

Provide MCA "injection" hypervisor services.

Signed-off-by: Gavin Maltby <gavin.maltby@xxxxxxx>

diff --git a/xen/arch/x86/cpu/mcheck/amd_f10.c b/xen/arch/x86/cpu/mcheck/amd_f10.c
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c
@@ -74,9 +74,9 @@ amd_f10_handler(struct mc_info *mi, uint
        mc_ext.mc_msr[1].reg = MSR_F10_MC4_MISC2;
        mc_ext.mc_msr[2].reg = MSR_F10_MC4_MISC3;
 
-       rdmsrl(MSR_F10_MC4_MISC1, mc_ext.mc_msr[0].value);
-       rdmsrl(MSR_F10_MC4_MISC2, mc_ext.mc_msr[1].value);
-       rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
+       mca_rdmsrl(MSR_F10_MC4_MISC1, mc_ext.mc_msr[0].value);
+       mca_rdmsrl(MSR_F10_MC4_MISC2, mc_ext.mc_msr[1].value);
+       mca_rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
        
        x86_mcinfo_add(mi, &mc_ext);
        return MCA_EXTINFO_LOCAL;
diff --git a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c
@@ -147,7 +147,7 @@ static void mce_amd_work_fn(void *data)
                uint64_t value;
                uint32_t counter;
 
-               rdmsrl(MSR_IA32_MC4_MISC, value);
+               mca_rdmsrl(MSR_IA32_MC4_MISC, value);
                /* Only the error counter field is of interest
                 * Bit field is described in AMD K8 BKDG chapter 6.4.5.5
                 */
@@ -172,7 +172,7 @@ static void mce_amd_work_fn(void *data)
                        value &= ~(0x60FFF00000000ULL);
                        /* Counter enable */
                        value |= (1ULL << 51);
-                       wrmsrl(MSR_IA32_MC4_MISC, value);
+                       mca_wrmsrl(MSR_IA32_MC4_MISC, value);
                        wmb();
                }
        }
diff --git a/xen/arch/x86/cpu/mcheck/mce.c b/xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c
+++ b/xen/arch/x86/cpu/mcheck/mce.c
@@ -27,9 +27,11 @@ unsigned int nr_mce_banks;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks);       /* non-fatal.o */
 
+static void intpose_init(void);
 static void mcinfo_clear(struct mc_info *);
 
-#define        SEG_PL(segsel) ((segsel) & 0x3)
+#define        SEG_PL(segsel)                  ((segsel) & 0x3)
+#define _MC_MSRINJ_F_REQ_HWCR_WREN     (1 << 16)
 
 #if 1  /* XXFM switch to 0 for putback */
 
@@ -109,7 +111,7 @@ mctelem_cookie_t mcheck_mca_logout(enum 
        cpu_nr = smp_processor_id();
        BUG_ON(cpu_nr != v->processor);
 
-       rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+       mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
 
        memset(&mcg, 0, sizeof (mcg));
        mcg.common.type = MC_TYPE_GLOBAL;
@@ -156,7 +158,7 @@ mctelem_cookie_t mcheck_mca_logout(enum 
                if (!test_bit(i, bankmask))
                        continue;
 
-               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+               mca_rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
                if (!(status & MCi_STATUS_VAL))
                        continue;       /* this bank has no valid telemetry */
 
@@ -189,7 +191,7 @@ mctelem_cookie_t mcheck_mca_logout(enum 
                addr = misc = 0;
 
                if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
+                       mca_rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addr);
                        d = maddr_get_owner(addr);
                        if (d != NULL && (who == MCA_POLLER ||
                            who == MCA_CMCI_HANDLER))
@@ -197,13 +199,13 @@ mctelem_cookie_t mcheck_mca_logout(enum 
                }
 
                if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
+                       mca_rdmsrl(MSR_IA32_MC0_MISC + 4 * i, misc);
 
                mcb.mc_addr = addr;
                mcb.mc_misc = misc;
 
                if (who == MCA_CMCI_HANDLER) {
-                       rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
+                       mca_rdmsrl(MSR_IA32_MC0_CTL2 + i, mcb.mc_ctrl2);
                        rdtscll(mcb.mc_tsc);
                }
 
@@ -221,7 +223,7 @@ mctelem_cookie_t mcheck_mca_logout(enum 
                }
 
                /* Clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+               mca_wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
                wmb();
        }
 
@@ -281,7 +283,7 @@ void mcheck_cmn_handler(struct cpu_user_
 
        /* Read global status;  if it does not indicate machine check
         * in progress then bail as long as we have a valid ip to return to. */
-       rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
+       mca_rdmsrl(MSR_IA32_MCG_STATUS, gstatus);
        ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
        if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
                add_taint(TAINT_MACHINE_CHECK); /* questionable */
@@ -300,7 +302,7 @@ void mcheck_cmn_handler(struct cpu_user_
 
        /* Clear MCIP or another #MC will enter shutdown state */
        gstatus &= ~MCG_STATUS_MCIP;
-       wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
+       mca_wrmsrl(MSR_IA32_MCG_STATUS, gstatus);
        wmb();
 
        /* If no valid errors and our stack is intact, we're done */
@@ -540,6 +542,7 @@ void mcheck_init(struct cpuinfo_x86 *c)
                return;
        }
 
+       intpose_init();
        mctelem_init(sizeof (struct mc_info));
 
        switch (c->x86_vendor) {
@@ -768,6 +771,203 @@ void x86_mc_get_cpu_info(unsigned cpu, u
        }
 }
 
+#define        INTPOSE_NENT    50      /* number of slots in intpose_arr */
+
+static struct intpose_ent {            /* one interposed (faked) MSR value */
+       unsigned  int cpu_nr;           /* cpu it applies to; -1 = free slot */
+       uint64_t msr;                   /* MSR number */
+       uint64_t val;                   /* value handed out by intpose_lookup */
+} intpose_arr[INTPOSE_NENT];
+
+/* Mark every slot of intpose_arr free (cpu_nr = -1).  Guarded by a static
+ * counter so that only the first call does the work. */
+static void intpose_init(void)
+{
+       static int done;
+       struct intpose_ent *slot;
+
+       if (done++ > 0)
+               return;
+
+       for (slot = intpose_arr; slot < intpose_arr + INTPOSE_NENT; slot++)
+               slot->cpu_nr = -1;
+}
+
+/* Search intpose_arr for an entry matching (cpu_nr, msr).  On a hit, store
+ * its value via valp (if non-NULL) and return the entry; else return NULL. */
+struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
+    uint64_t *valp)
+{
+       struct intpose_ent *ent;
+
+       for (ent = intpose_arr; ent < intpose_arr + INTPOSE_NENT; ent++) {
+               if (ent->cpu_nr != cpu_nr || ent->msr != msr)
+                       continue;
+               if (valp != NULL)
+                       *valp = ent->val;
+               return ent;
+       }
+       return NULL;
+}
+
+/* Interpose val for msr on cpu_nr: reuse the matching entry if present, else
+ * take the first free slot; if none is free, log it and drop the request. */
+static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
+{
+       struct intpose_ent *slot = intpose_lookup(cpu_nr, msr, NULL);
+       int idx;
+
+       for (idx = 0; slot == NULL && idx < INTPOSE_NENT; idx++) {
+               if (intpose_arr[idx].cpu_nr == -1)
+                       slot = &intpose_arr[idx];
+       }
+
+       if (slot != NULL) {
+               slot->cpu_nr = cpu_nr;
+               slot->msr = msr;
+               slot->val = val;
+               return;
+       }
+
+       printk("intpose_add: interpose array full - request dropped\n");
+}
+
+/* Drop any interposed value for (cpu_nr, msr) by freeing its table slot. */
+void intpose_inval(unsigned int cpu_nr, uint64_t msr)
+{
+       struct intpose_ent *found = intpose_lookup(cpu_nr, msr, NULL);
+
+       if (found != NULL)
+               found->cpu_nr = -1;
+}
+
+#define        IS_MCA_BANKREG(r) \
+    ((r) >= MSR_IA32_MC0_CTL && \
+    (r) <= MSR_IA32_MC0_MISC + (nr_mce_banks - 1) * 4 && \
+    ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* bank MSR but not MCi_CTL; r is evaluated 3 times */
+
+/* Vet a dom0 MSR injection request; returns 1 if acceptable, 0 if not.
+ * Requests with mcinj_count > MC_MSRINJ_MAXMSRS are refused since they would
+ * index past mcinj_msr[].  The internal REQ_HWCR_WREN flag (reserved bit 16)
+ * is cleared first so only this function, never the caller, can make the
+ * injection code touch the AMD-only HWCR MSR. */
+static int x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
+{
+       struct cpuinfo_x86 *c = &cpu_data[smp_processor_id()];
+       uint32_t i;
+       int errs = 0;
+
+       mci->mcinj_flags &= ~_MC_MSRINJ_F_REQ_HWCR_WREN;
+       if (mci->mcinj_count > MC_MSRINJ_MAXMSRS) {
+               printk("HV MSR INJECT ERROR: too many MSRs in request\n");
+               return 0;
+       }
+
+       for (i = 0; i < mci->mcinj_count; i++) {
+               uint64_t reg = mci->mcinj_msr[i].reg;
+               const char *reason = NULL;
+
+               if (IS_MCA_BANKREG(reg)) {
+                       if (c->x86_vendor == X86_VENDOR_AMD) {
+                               /* AMD: MCi_STATUS_WREN in HWCR lets non-zero
+                                * bank MSR writes proceed without #GP, so
+                                * ask the injection code to set it. */
+                               mci->mcinj_flags |=
+                                   _MC_MSRINJ_F_REQ_HWCR_WREN;
+                               continue;
+                       }
+                       /* Otherwise interposing is the only option, and the
+                        * injector must have asked for it. */
+                       if (!(mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE))
+                               reason = "must specify interposition";
+               } else {
+                       switch (reg) {
+                       case MSR_IA32_MCG_STATUS: /* fine on all x86 cpus */
+                               break;
+                       case MSR_K8_HWCR:       /* the HV handles this one */
+                               if (c->x86_vendor == X86_VENDOR_AMD)
+                                       reason = "HV will operate HWCR";
+                               else
+                                       reason ="only supported on AMD";
+                               break;
+                       default:
+                               reason = "not a recognized MCA MSR";
+                               break;
+                       }
+               }
+
+               if (reason != NULL) {
+                       printk("HV MSR INJECT ERROR: MSR 0x%llx %s\n",
+                           (unsigned long long)reg, reason);
+                       errs++;
+               }
+       }
+
+       return !errs;
+}
+
+/* Make sure HWCR.MCi_STATUS_WREN is set on this cpu.  Returns the HWCR value
+ * read before any change, for use with x86_mc_hwcr_wren_restore(). */
+static uint64_t x86_mc_hwcr_wren(void)
+{
+       uint64_t prev, enabled;
+
+       rdmsrl(MSR_K8_HWCR, prev);
+       if (prev & K8_HWCR_MCi_STATUS_WREN)
+               return prev;
+       enabled = prev | K8_HWCR_MCi_STATUS_WREN;
+       wrmsrl(MSR_K8_HWCR, enabled);
+       return prev;
+}
+
+static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
+{      /* hwcr: the value x86_mc_hwcr_wren() returned to x86_mc_msrinject() */
+       if (!(hwcr & K8_HWCR_MCi_STATUS_WREN)) /* saved value had WREN clear */
+               wrmsrl(MSR_K8_HWCR, hwcr);     /* so write the saved value back */
+}
+
+/* Runs on the target cpu via on_selected_cpus(): apply each MSR in the
+ * request, either by recording it in the interpose table or by writing the
+ * hardware MSR.  When the verifier set _MC_MSRINJ_F_REQ_HWCR_WREN,
+ * HWCR.MCi_STATUS_WREN is enabled around the writes and HWCR then restored. */
+static void x86_mc_msrinject(void *data)
+{
+       struct xen_mc_msrinject *mci = data;
+       struct mcinfo_msr *msr = &mci->mcinj_msr[0];
+       int wren = (mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN) != 0;
+       int intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;
+       uint64_t hwcr = 0;
+       uint32_t i;             /* unsigned, like mcinj_count */
+
+       if (wren)
+               hwcr = x86_mc_hwcr_wren();
+
+       for (i = 0; i < mci->mcinj_count; i++, msr++) {
+               printk("HV MSR INJECT (%s) target %u actual %u MSR 0x%llx "
+                   "<-- 0x%llx\n",
+                   intpose ?  "interpose" : "hardware",
+                   mci->mcinj_cpunr, smp_processor_id(),
+                   (unsigned long long)msr->reg,
+                   (unsigned long long)msr->value);
+
+               if (intpose)
+                       intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
+               else
+                       wrmsrl(msr->reg, msr->value);
+       }
+
+       /* Put HWCR back the way it was found. */
+       if (wren)
+               x86_mc_hwcr_wren_restore(hwcr);
+}
+
+/*ARGSUSED*/
+static void x86_mc_mceinject(void *data)
+{      /* invoked on the target cpu via on_selected_cpus(); data is not used */
+       printk("Simulating #MC on cpu %d\n", smp_processor_id());
+       __asm__ __volatile__("int $0x12");      /* software interrupt on vector 0x12 (18), the #MC vector */
+}
+
 #if BITS_PER_LONG == 64
 
 #define        ID2COOKIE(id)   ((mctelem_cookie_t)(id))
@@ -797,6 +997,9 @@ long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u
        xen_mc_logical_cpu_t *log_cpus = NULL;
        mctelem_cookie_t mctc;
        mctelem_class_t which;
+       unsigned int target;
+       struct xen_mc_msrinject *mc_msrinject;
+       struct xen_mc_mceinject *mc_mceinject;
 
        if ( copy_from_guest(op, u_xen_mc, 1) )
                return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
@@ -901,6 +1104,59 @@ long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u
                }
                break;
 
+       case XEN_MC_msrinject:
+               if ( !IS_PRIV(v->domain) )
+                       return x86_mcerr("do_mca inject", -EPERM);
+
+               if (nr_mce_banks == 0)
+                       return x86_mcerr("do_mca inject", -ENODEV);
+
+               mc_msrinject = &op->u.mc_msrinject;
+               target = mc_msrinject->mcinj_cpunr;
+
+               if (target >= NR_CPUS)
+                       return x86_mcerr("do_mca inject: bad target", -EINVAL);
+
+               if (!cpu_isset(target, cpu_online_map))
+                       return x86_mcerr("do_mca inject: target offline",
+                           -EINVAL);
+
+               if (mc_msrinject->mcinj_count == 0)
+                       return 0;
+
+               if (!x86_mc_msrinject_verify(mc_msrinject))
+                       return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);
+
+               add_taint(TAINT_ERROR_INJECT);
+
+               on_selected_cpus(cpumask_of_cpu(target),
+                   x86_mc_msrinject, mc_msrinject, 1, 1);
+
+               break;
+
+       case XEN_MC_mceinject:
+               if ( !IS_PRIV(v->domain) )
+                       return x86_mcerr("do_mca #MC", -EPERM);
+
+               if (nr_mce_banks == 0)
+                       return x86_mcerr("do_mca #MC", -ENODEV);
+
+               mc_mceinject = &op->u.mc_mceinject;
+               target = mc_mceinject->mceinj_cpunr;
+
+               if (target >= NR_CPUS)
+                       return x86_mcerr("do_mca #MC: bad target", -EINVAL);
+                      
+               if (!cpu_isset(target, cpu_online_map))
+                       return x86_mcerr("do_mca #MC: target offline", -EINVAL);
+
+               add_taint(TAINT_ERROR_INJECT);
+
+               on_selected_cpus(cpumask_of_cpu(target),
+                   x86_mc_mceinject, mc_mceinject, 1, 1);
+
+               break;
+
        default:
                return x86_mcerr("do_mca: bad command", -EINVAL);
        }
diff --git a/xen/arch/x86/cpu/mcheck/mce.h b/xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h
+++ b/xen/arch/x86/cpu/mcheck/mce.h
@@ -41,6 +41,23 @@ extern void x86_mce_vector_register(x86_
 /* Common generic MCE handler that implementations may nominate
  * via x86_mce_vector_register. */
 extern void mcheck_cmn_handler(struct cpu_user_regs *, long, cpu_banks_t);
+
+/* Interpose accessors (mce.c); the macros below evaluate msr twice. */
+extern struct intpose_ent *intpose_lookup(unsigned int, uint64_t,
+    uint64_t *);
+extern void intpose_inval(unsigned int, uint64_t);
+
+/* Read an MSR, checking for an interposed value first */
+#define mca_rdmsrl(msr, var) do { \
+       if (intpose_lookup(smp_processor_id(), (msr), &(var)) == NULL) \
+               rdmsrl((msr), (var)); \
+} while (0)
+/* Write an MSR, invalidating any interposed value */
+#define        mca_wrmsrl(msr, val) do { \
+       intpose_inval(smp_processor_id(), (msr)); \
+       wrmsrl((msr), (val)); \
+} while (0)
+
 
 /* Utility function to "logout" all architectural MCA telemetry from the MCA
  * banks of the current processor.  A cookie is returned which may be
diff --git a/xen/include/public/arch-x86/xen-mca.h b/xen/include/public/arch-x86/xen-mca.h
--- a/xen/include/public/arch-x86/xen-mca.h
+++ b/xen/include/public/arch-x86/xen-mca.h
@@ -324,10 +324,31 @@ struct xen_mc_physcpuinfo {
        XEN_GUEST_HANDLE(xen_mc_logical_cpu_t) info;
 };
 
+#define XEN_MC_msrinject    4          /* cmd: inject MSR values on one cpu */
+#define MC_MSRINJ_MAXMSRS       8      /* capacity of mcinj_msr[] below */
+struct xen_mc_msrinject {
+       /* IN */
+       unsigned int mcinj_cpunr;       /* target processor id */
+       uint32_t mcinj_flags;           /* see MC_MSRINJ_F_* below */
+       uint32_t mcinj_count;           /* 0 .. count-1 in array are valid; array holds MC_MSRINJ_MAXMSRS */
+       uint32_t mcinj_pad0;            /* padding */
+       struct mcinfo_msr mcinj_msr[MC_MSRINJ_MAXMSRS]; /* (reg, value) pairs to inject */
+};
+
+/* Flags for mcinj_flags above; bits 16-31 are reserved */
+#define MC_MSRINJ_F_INTERPOSE   0x1    /* record values in Xen's interpose table instead of writing the MSRs */
+
+#define XEN_MC_mceinject    5          /* cmd: simulate a #MC on one cpu */
+struct xen_mc_mceinject {
+       unsigned int mceinj_cpunr;      /* target processor id */
+};
+
 typedef union {
     struct xen_mc_fetch        mc_fetch;
     struct xen_mc_notifydomain mc_notifydomain;
     struct xen_mc_physcpuinfo  mc_physcpuinfo;
+    struct xen_mc_msrinject    mc_msrinject;
+    struct xen_mc_mceinject    mc_mceinject;
 } xen_mc_arg_t;
 
 struct xen_mc {
diff --git a/xen/include/xen/lib.h b/xen/include/xen/lib.h
--- a/xen/include/xen/lib.h
+++ b/xen/include/xen/lib.h
@@ -95,6 +95,7 @@ unsigned long long parse_size_and_unit(c
 #define TAINT_MACHINE_CHECK             (1<<1)
 #define TAINT_BAD_PAGE                  (1<<2)
 #define TAINT_SYNC_CONSOLE              (1<<3)
+#define TAINT_ERROR_INJECT             (1<<4)
 extern int tainted;
 #define TAINT_STRING_MAX_LEN            20
 extern char *print_tainted(char *str);

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] [PATCH] MCA injection (debug) code