[Xen-devel] [PATCH 1/2] MCA support with page offlining

This is the Xen part of the series.

[1/2] xen part: mca-support-with-page-offlining-xen.patch

Signed-off-by: Kazuhiro Suzuki <kaz@xxxxxxxxxxxxxx>

Thanks,
KAZ

diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/amd_f10.c
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c Mon Dec 15 14:25:07 2008 +0900
@@ -82,8 +82,6 @@
 }
 
 
-extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
-
 /* AMD Family10 machine check */
 void amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
 { 
@@ -91,7 +89,7 @@
        uint32_t i;
        int cpu_nr;
 
-       machine_check_vector = k8_machine_check;
+       machine_check_vector = x86_machine_check;
        mc_callback_bank_extended = amd_f10_handler;
        cpu_nr = smp_processor_id();
        wmb();
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/amd_k8.c
--- a/xen/arch/x86/cpu/mcheck/amd_k8.c  Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c  Mon Dec 15 14:25:07 2008 +0900
@@ -69,220 +69,8 @@
 #include "mce.h"
 #include "x86_mca.h"
 
+extern int mce_bootlog;
 
-/* Machine Check Handler for AMD K8 family series */
-void k8_machine_check(struct cpu_user_regs *regs, long error_code)
-{
-       struct vcpu *vcpu = current;
-       struct domain *curdom;
-       struct mc_info *mc_data;
-       struct mcinfo_global mc_global;
-       struct mcinfo_bank mc_info;
-       uint64_t status, addrv, miscv, uc;
-       uint32_t i;
-       unsigned int cpu_nr;
-       uint32_t xen_impacted = 0;
-#define DOM_NORMAL     0
-#define DOM0_TRAP      1
-#define DOMU_TRAP      2
-#define DOMU_KILLED    4
-       uint32_t dom_state = DOM_NORMAL;
-
-       /* This handler runs as interrupt gate. So IPIs from the
-        * polling service routine are defered until we finished.
-        */
-
-        /* Disable interrupts for the _vcpu_. It may not re-scheduled to
-        * an other physical CPU or the impacted process in the guest
-        * continues running with corrupted data, otherwise. */
-        vcpu_schedule_lock_irq(vcpu);
-
-       mc_data = x86_mcinfo_getptr();
-       cpu_nr = smp_processor_id();
-       curdom = vcpu->domain;
-
-       memset(&mc_global, 0, sizeof(mc_global));
-       mc_global.common.type = MC_TYPE_GLOBAL;
-       mc_global.common.size = sizeof(mc_global);
-
-       mc_global.mc_domid = curdom->domain_id; /* impacted domain */
-       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
-       BUG_ON(cpu_nr != vcpu->processor);
-       mc_global.mc_core_threadid = 0;
-       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-#if 0 /* TODO: on which socket is this physical core?
-         It's not clear to me how to figure this out. */
-       mc_global.mc_socketid = ???;
-#endif
-       mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
-       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
-       /* Quick check, who is impacted */
-       xen_impacted = is_idle_domain(curdom);
-
-       /* Dom0 */
-       x86_mcinfo_clear(mc_data);
-       x86_mcinfo_add(mc_data, &mc_global);
-
-       for (i = 0; i < nr_mce_banks; i++) {
-               struct domain *d;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
-
-               if (!(status & MCi_STATUS_VAL))
-                       continue;
-
-               /* An error happened in this bank.
-                * This is expected to be an uncorrectable error,
-                * since correctable errors get polled.
-                */
-               uc = status & MCi_STATUS_UC;
-
-               memset(&mc_info, 0, sizeof(mc_info));
-               mc_info.common.type = MC_TYPE_BANK;
-               mc_info.common.size = sizeof(mc_info);
-               mc_info.mc_bank = i;
-               mc_info.mc_status = status;
-
-               addrv = 0;
-               if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
-                       
-                       d = maddr_get_owner(addrv);
-                       if (d != NULL)
-                               mc_info.mc_domid = d->domain_id;
-               }
-
-               miscv = 0;
-               if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
-
-               mc_info.mc_addr = addrv;
-               mc_info.mc_misc = miscv;
-
-               x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
-
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mc_data, i, status);
-
-               /* clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
-               wmb();
-               add_taint(TAINT_MACHINE_CHECK);
-       }
-
-       status = mc_global.mc_gstatus;
-
-       /* clear MCIP or cpu enters shutdown state
-        * in case another MCE occurs. */
-       status &= ~MCG_STATUS_MCIP;
-       wrmsrl(MSR_IA32_MCG_STATUS, status);
-       wmb();
-
-       /* For the details see the discussion "MCE/MCA concept" on xen-devel.
-        * The thread started here:
-        * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
-        */
-
-       /* MCG_STATUS_RIPV: 
-        * When this bit is not set, then the instruction pointer onto the stack
-        * to resume at is not valid. If xen is interrupted, then we panic anyway
-        * right below. Otherwise it is up to the guest to figure out if 
-        * guest kernel or guest userland is affected and should kill either
-        * itself or the affected process.
-        */
-
-       /* MCG_STATUS_EIPV:
-        * Evaluation of EIPV is the job of the guest.
-        */
-
-       if (xen_impacted) {
-               /* Now we are going to panic anyway. Allow interrupts, so that
-                * printk on serial console can work. */
-               vcpu_schedule_unlock_irq(vcpu);
-
-               /* Uh, that means, machine check exception
-                * inside Xen occured. */
-               printk("Machine check exception occured in Xen.\n");
-
-               /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
-                * to the error then it makes sense to print a stack trace.
-                * That can be useful for more detailed error analysis and/or
-                * error case studies to figure out, if we can clear
-                * xen_impacted and kill a DomU instead
-                * (i.e. if a guest only control structure is affected, but then
-                * we must ensure the bad pages are not re-used again).
-                */
-               if (status & MCG_STATUS_EIPV) {
-                       printk("MCE: Instruction Pointer is related to the error. "
-                               "Therefore, print the execution state.\n");
-                       show_execution_state(regs);
-               }
-               x86_mcinfo_dump(mc_data);
-               panic("End of MCE. Use mcelog to decode above error codes.\n");
-       }
-
-       /* If Dom0 registered a machine check handler, which is only possible
-        * with a PV MCA driver, then ... */
-       if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
-               dom_state = DOM0_TRAP;
-
-               /* ... deliver machine check trap to Dom0. */
-               send_guest_trap(dom0, 0, TRAP_machine_check);
-
-               /* Xen may tell Dom0 now to notify the DomU.
-                * But this will happen through a hypercall. */
-       } else
-               /* Dom0 did not register a machine check handler, but if DomU
-                * did so, then... */
-                if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
-                       dom_state = DOMU_TRAP;
-
-                       /* ... deliver machine check trap to DomU */
-                       send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
-       } else {
-               /* hmm... noone feels responsible to handle the error.
-                * So, do a quick check if a DomU is impacted or not.
-                */
-               if (curdom == dom0) {
-                       /* Dom0 is impacted. Since noone can't handle
-                        * this error, panic! */
-                       x86_mcinfo_dump(mc_data);
-                       panic("MCE occured in Dom0, which it can't handle\n");
-
-                       /* UNREACHED */
-               } else {
-                       dom_state = DOMU_KILLED;
-
-                       /* Enable interrupts. This basically results in
-                        * calling sti on the *physical* cpu. But after
-                        * domain_crash() the vcpu pointer is invalid.
-                        * Therefore, we must unlock the irqs before killing
-                        * it. */
-                       vcpu_schedule_unlock_irq(vcpu);
-
-                       /* DomU is impacted. Kill it and continue. */
-                       domain_crash(curdom);
-               }
-       }
-
-
-       switch (dom_state) {
-       case DOM0_TRAP:
-       case DOMU_TRAP:
-               /* Enable interrupts. */
-               vcpu_schedule_unlock_irq(vcpu);
-
-               /* guest softirqs and event callbacks are scheduled
-                * immediately after this handler exits. */
-               break;
-       case DOMU_KILLED:
-               /* Nothing to do here. */
-               break;
-       default:
-               BUG();
-       }
-}
 
 
 /* AMD K8 machine check */
@@ -292,7 +80,7 @@
        uint32_t i;
        int cpu_nr;
 
-       machine_check_vector = k8_machine_check;
+       machine_check_vector = x86_machine_check;
        cpu_nr = smp_processor_id();
        wmb();
 
@@ -300,6 +88,17 @@
        if (value & MCG_CTL_P)  /* Control register present ? */
                wrmsrl (MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
        nr_mce_banks = value & MCG_CAP_COUNT;
+
+       /* Log the machine checks left over from the previous reset.
+          This also clears all registers */
+       for (i=0; i<nr_mce_banks; i++) {
+               u64 status;
+               rdmsrl(MSR_IA32_MC0_STATUS + i*4, status);
+               if (status & MCi_STATUS_VAL) {
+                       x86_machine_check(NULL, mce_bootlog ? -1 : -2);
+                       break;
+               }
+       }
 
        for (i = 0; i < nr_mce_banks; i++) {
                switch (i) {
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Mon Dec 15 14:25:07 2008 +0900
@@ -65,117 +65,12 @@
 #include "mce.h"
 #include "x86_mca.h"
 
-static struct timer mce_timer;
+static int hw_threshold = 0;
 
-#define MCE_PERIOD MILLISECS(15000)
-#define MCE_MIN    MILLISECS(2000)
-#define MCE_MAX    MILLISECS(30000)
+extern struct timer mce_timer;
 
-static s_time_t period = MCE_PERIOD;
-static int hw_threshold = 0;
-static int adjust = 0;
-
-/* The polling service routine:
- * Collects information of correctable errors and notifies
- * Dom0 via an event.
- */
-void mce_amd_checkregs(void *info)
-{
-       struct vcpu *vcpu = current;
-       struct mc_info *mc_data;
-       struct mcinfo_global mc_global;
-       struct mcinfo_bank mc_info;
-       uint64_t status, addrv, miscv;
-       unsigned int i;
-       unsigned int event_enabled;
-       unsigned int cpu_nr;
-       int error_found;
-
-       /* We don't need a slot yet. Only allocate one on error. */
-       mc_data = NULL;
-
-       cpu_nr = smp_processor_id();
-       event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
-       error_found = 0;
-
-       memset(&mc_global, 0, sizeof(mc_global));
-       mc_global.common.type = MC_TYPE_GLOBAL;
-       mc_global.common.size = sizeof(mc_global);
-
-       mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
-       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
-       BUG_ON(cpu_nr != vcpu->processor);
-       mc_global.mc_core_threadid = 0;
-       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-#if 0 /* TODO: on which socket is this physical core?
-         It's not clear to me how to figure this out. */
-       mc_global.mc_socketid = ???;
-#endif
-       mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
-       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
-       for (i = 0; i < nr_mce_banks; i++) {
-               struct domain *d;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
-
-               if (!(status & MCi_STATUS_VAL))
-                       continue;
-
-               if (mc_data == NULL) {
-                       /* Now we need a slot to fill in error telemetry. */
-                       mc_data = x86_mcinfo_getptr();
-                       BUG_ON(mc_data == NULL);
-                       x86_mcinfo_clear(mc_data);
-                       x86_mcinfo_add(mc_data, &mc_global);
-               }
-
-               memset(&mc_info, 0, sizeof(mc_info));
-               mc_info.common.type = MC_TYPE_BANK;
-               mc_info.common.size = sizeof(mc_info);
-               mc_info.mc_bank = i;
-               mc_info.mc_status = status;
-
-               /* Increase polling frequency */
-               error_found = 1;
-
-               addrv = 0;
-               if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
-
-                       d = maddr_get_owner(addrv);
-                       if (d != NULL)
-                               mc_info.mc_domid = d->domain_id;
-               }
-
-               miscv = 0;
-               if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
-
-               mc_info.mc_addr = addrv;
-               mc_info.mc_misc = miscv;
-               x86_mcinfo_add(mc_data, &mc_info);
-
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mc_data, i, status);
-
-               /* clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
-               wmb();
-       }
-
-       if (error_found > 0) {
-               /* If Dom0 enabled the VIRQ_MCA event, then ... */
-               if (event_enabled)
-                       /* ... notify it. */
-                       send_guest_global_virq(dom0, VIRQ_MCA);
-               else
-                       /* ... or dump it */
-                       x86_mcinfo_dump(mc_data);
-       }
-
-       adjust += error_found;
-}
+extern s_time_t period;
+extern int adjust;
 
 /* polling service routine invoker:
  * Adjust poll frequency at runtime. No error means slow polling frequency,
@@ -186,7 +81,7 @@
  */
 static void mce_amd_work_fn(void *data)
 {
-       on_each_cpu(mce_amd_checkregs, data, 1, 1);
+       on_each_cpu(x86_mce_checkregs, data, 1, 1);
 
        if (adjust > 0) {
                if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c     Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/mce.c     Mon Dec 15 14:25:07 2008 +0900
@@ -7,6 +7,9 @@
 #include <xen/types.h>
 #include <xen/kernel.h>
 #include <xen/config.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/paging.h>
 #include <xen/smp.h>
 #include <xen/errno.h>
 
@@ -18,6 +21,12 @@
 
 int mce_disabled = 0;
 unsigned int nr_mce_banks;
+int mce_bootlog = 1;
+
+#define MAX_PAGE_OFFLINING 1024
+
+static struct page_info *page_offlining[MAX_PAGE_OFFLINING];
+static int num_page_offlining = 0;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks);       /* non-fatal.o */
 
@@ -136,6 +145,9 @@
                        intel_p5_mcheck_init(c);
                if (c->x86==6)
                        intel_p6_mcheck_init(c);
+#else
+               if (c->x86==6)
+                       intel_p4_mcheck_init(c);
 #endif
                if (c->x86==15)
                        intel_p4_mcheck_init(c);
@@ -159,9 +171,19 @@
        mce_disabled = 1;
 }
 
+/* mce=off disables machine check.
+   mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+   mce=nobootlog Don't log MCEs from before booting. */
 static void __init mcheck_enable(char *str)
 {
-       mce_disabled = -1;
+       if (*str == '=')
+               str++;
+       if (!strcmp(str, "off"))
+               mce_disabled = 1;
+       else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
+               mce_bootlog = str[0] == 'b';
+       else
+               printk("mce= argument %s ignored.", str); 
 }
 
 custom_param("nomce", mcheck_disable);
@@ -221,6 +243,12 @@
        /* This function is called from the fetch hypercall with
         * the mc_lock spinlock held. Thus, no need for locking here.
         */
+
+       /* Return NULL if no data is available. */
+       if (mc_data.fetch_idx == mc_data.error_idx) {
+               *fetch_idx = mc_data.fetch_idx;
+               return NULL;
+       }
        mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
        if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
                /* Bogus domU command detected. */
@@ -431,6 +459,272 @@
        } while (1);
 }
 
+static int x86_page_offlining(unsigned long maddr, struct domain *d)
+{
+       int i;
+       struct page_info *pg;
+
+       if (!mfn_valid(maddr >> PAGE_SHIFT)) {
+               printk(XENLOG_ERR "Page offlining: ( %lx ) invalid.\n", maddr);
+               return -1;
+       }
+
+       /* convert physical address to physical page number */
+       pg = maddr_to_page(maddr);
+
+       if (pg == NULL) {
+               printk(XENLOG_ERR "Page offlining: ( %lx ) not found.\n",
+                      maddr);
+               return -1;
+       }
+
+       /* check whether a page number have been already registered or not */
+       for (i = 0; i < num_page_offlining; i++)
+               if (page_offlining[i] == pg)
+                       goto out;
+
+       /* limitation check and already having attribute 'reserved' */
+       if (num_page_offlining == MAX_PAGE_OFFLINING ||
+           pg->count_info & PGC_reserved) {
+               printk(XENLOG_DEBUG "Page offlining: ( %lx ) failure.\n",
+                      maddr);
+               return 1;
+       }
+
+       /* add attribute 'reserved' and register the page */
+       get_page(pg, d);
+       pg->count_info |= PGC_reserved;
+       page_offlining[num_page_offlining++] = pg;
+
+ out:
+       printk(XENLOG_DEBUG "Page offlining: ( %lx ) success.\n", maddr);
+       return 0;
+}
+
+
+/* Machine Check Handler for AMD K8 family series and Intel P4/Xeon family */
+void x86_machine_check(struct cpu_user_regs *regs, long error_code)
+{
+       struct vcpu *vcpu = current;
+       struct domain *curdom;
+       struct mc_info *mc_data;
+       struct mcinfo_global mc_global;
+       struct mcinfo_bank mc_info;
+       uint64_t status, addrv, miscv, uc;
+       uint32_t i;
+       unsigned int cpu_nr;
+       uint32_t xen_impacted = 0;
+#define DOM_NORMAL     0
+#define DOM0_TRAP      1
+#define DOMU_TRAP      2
+#define DOMU_KILLED    4
+       uint32_t dom_state = DOM_NORMAL;
+
+       /* This handler runs as interrupt gate. So IPIs from the
+        * polling service routine are defered until we finished.
+        */
+
+        /* Disable interrupts for the _vcpu_. It may not re-scheduled to
+        * an other physical CPU or the impacted process in the guest
+        * continues running with corrupted data, otherwise. */
+        vcpu_schedule_lock_irq(vcpu);
+
+       mc_data = x86_mcinfo_getptr();
+       cpu_nr = smp_processor_id();
+       curdom = vcpu->domain;
+
+       memset(&mc_global, 0, sizeof(mc_global));
+       mc_global.common.type = MC_TYPE_GLOBAL;
+       mc_global.common.size = sizeof(mc_global);
+
+       mc_global.mc_domid = curdom->domain_id; /* impacted domain */
+       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+       BUG_ON(cpu_nr != vcpu->processor);
+       mc_global.mc_core_threadid = 0;
+       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+       mc_global.mc_socketid = ???;
+#endif
+       mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
+       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+       /* Quick check, who is impacted */
+       xen_impacted = is_idle_domain(curdom);
+
+       /* Dom0 */
+       x86_mcinfo_clear(mc_data);
+       x86_mcinfo_add(mc_data, &mc_global);
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               struct domain *d;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+
+               if (!(status & MCi_STATUS_VAL))
+                       continue;
+
+               /* An error happened in this bank.
+                * This is expected to be an uncorrectable error,
+                * since correctable errors get polled.
+                */
+               uc = status & MCi_STATUS_UC;
+
+               memset(&mc_info, 0, sizeof(mc_info));
+               mc_info.common.type = MC_TYPE_BANK;
+               mc_info.common.size = sizeof(mc_info);
+               mc_info.mc_bank = i;
+               mc_info.mc_status = status;
+
+               addrv = 0;
+               if (status & MCi_STATUS_ADDRV) {
+                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
+                       
+                       d = maddr_get_owner(addrv);
+                       if (d != NULL) {
+                               mc_info.mc_domid = d->domain_id;
+
+                               /* Page offlining */
+                               x86_page_offlining(addrv, d);
+                       }
+               }
+
+               miscv = 0;
+               if (status & MCi_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
+
+               mc_info.mc_addr = addrv;
+               mc_info.mc_misc = miscv;
+
+               x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
+
+               if (mc_callback_bank_extended)
+                       mc_callback_bank_extended(mc_data, i, status);
+
+               /* clear status */
+               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+               wmb();
+               add_taint(TAINT_MACHINE_CHECK);
+       }
+
+       /* Never do anything final for the previous reset */
+       if (!regs) {
+               vcpu_schedule_unlock_irq(vcpu);
+               return;
+       }
+
+       status = mc_global.mc_gstatus;
+
+       /* clear MCIP or cpu enters shutdown state
+        * in case another MCE occurs. */
+       status &= ~MCG_STATUS_MCIP;
+       wrmsrl(MSR_IA32_MCG_STATUS, status);
+       wmb();
+
+       /* For the details see the discussion "MCE/MCA concept" on xen-devel.
+        * The thread started here:
+        * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
+        */
+
+       /* MCG_STATUS_RIPV: 
+        * When this bit is not set, then the instruction pointer onto the stack
+        * to resume at is not valid. If xen is interrupted, then we panic anyway
+        * right below. Otherwise it is up to the guest to figure out if 
+        * guest kernel or guest userland is affected and should kill either
+        * itself or the affected process.
+        */
+
+       /* MCG_STATUS_EIPV:
+        * Evaluation of EIPV is the job of the guest.
+        */
+
+       if (xen_impacted) {
+               /* Now we are going to panic anyway. Allow interrupts, so that
+                * printk on serial console can work. */
+               vcpu_schedule_unlock_irq(vcpu);
+
+               /* Uh, that means, machine check exception
+                * inside Xen occured. */
+               printk("Machine check exception occured in Xen.\n");
+
+               /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
+                * to the error then it makes sense to print a stack trace.
+                * That can be useful for more detailed error analysis and/or
+                * error case studies to figure out, if we can clear
+                * xen_impacted and kill a DomU instead
+                * (i.e. if a guest only control structure is affected, but then
+                * we must ensure the bad pages are not re-used again).
+                */
+               if (status & MCG_STATUS_EIPV) {
+                       printk("MCE: Instruction Pointer is related to the error. "
+                               "Therefore, print the execution state.\n");
+                       show_execution_state(regs);
+               }
+               x86_mcinfo_dump(mc_data);
+               panic("End of MCE. Use mcelog to decode above error codes.\n");
+       }
+
+       /* If Dom0 registered a machine check handler, which is only possible
+        * with a PV MCA driver, then ... */
+       if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
+               dom_state = DOM0_TRAP;
+
+               /* ... deliver machine check trap to Dom0. */
+               send_guest_trap(dom0, 0, TRAP_machine_check);
+
+               /* Xen may tell Dom0 now to notify the DomU.
+                * But this will happen through a hypercall. */
+       } else
+               /* Dom0 did not register a machine check handler, but if DomU
+                * did so, then... */
+                if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
+                       dom_state = DOMU_TRAP;
+
+                       /* ... deliver machine check trap to DomU */
+                       send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
+       } else {
+               /* hmm... noone feels responsible to handle the error.
+                * So, do a quick check if a DomU is impacted or not.
+                */
+               if (curdom == dom0) {
+                       /* Dom0 is impacted. Since noone can't handle
+                        * this error, panic! */
+                       x86_mcinfo_dump(mc_data);
+                       panic("MCE occured in Dom0, which it can't handle\n");
+
+                       /* UNREACHED */
+               } else {
+                       dom_state = DOMU_KILLED;
+
+                       /* Enable interrupts. This basically results in
+                        * calling sti on the *physical* cpu. But after
+                        * domain_crash() the vcpu pointer is invalid.
+                        * Therefore, we must unlock the irqs before killing
+                        * it. */
+                       vcpu_schedule_unlock_irq(vcpu);
+
+                       /* DomU is impacted. Kill it and continue. */
+                       domain_crash(curdom);
+               }
+       }
+
+
+       switch (dom_state) {
+       case DOM0_TRAP:
+       case DOMU_TRAP:
+               /* Enable interrupts. */
+               vcpu_schedule_unlock_irq(vcpu);
+
+               /* guest softirqs and event callbacks are scheduled
+                * immediately after this handler exits. */
+               break;
+       case DOMU_KILLED:
+               /* Nothing to do here. */
+               break;
+       default:
+               BUG();
+       }
+}
 
 
 /* Machine Check Architecture Hypercall */
@@ -564,7 +858,7 @@
                if ( copy_to_guest(u_xen_mc, op, 1) )
                        ret = -EFAULT;
 
-               if (ret == 0) {
+               if (ret == 0 && mc_notifydomain->flags == XEN_MC_OK) {
                        x86_mcinfo_marknotified(mc_notifydomain);
                }
 
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c       Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c       Mon Dec 15 14:25:07 2008 +0900
@@ -14,16 +14,158 @@
 #include <xen/smp.h>
 #include <xen/timer.h>
 #include <xen/errno.h>
+#include <xen/event.h>
 #include <asm/processor.h> 
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 static int firstbank;
-static struct timer mce_timer;
 
-#define MCE_PERIOD MILLISECS(15000)
+struct timer mce_timer;
+
+s_time_t period = MCE_PERIOD;
+int adjust = 0;
+
+/* The polling service routine:
+ * Collects information of correctable errors and notifies
+ * Dom0 via an event.
+ */
+void x86_mce_checkregs(void *info)
+{
+       struct vcpu *vcpu = current;
+       struct mc_info *mc_data;
+       struct mcinfo_global mc_global;
+       struct mcinfo_bank mc_info;
+       uint64_t status, addrv, miscv;
+       unsigned int i;
+       unsigned int event_enabled;
+       unsigned int cpu_nr;
+       int error_found;
+
+       /* We don't need a slot yet. Only allocate one on error. */
+       mc_data = NULL;
+
+       cpu_nr = smp_processor_id();
+       event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
+       error_found = 0;
+
+       memset(&mc_global, 0, sizeof(mc_global));
+       mc_global.common.type = MC_TYPE_GLOBAL;
+       mc_global.common.size = sizeof(mc_global);
+
+       mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
+       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+       BUG_ON(cpu_nr != vcpu->processor);
+       mc_global.mc_core_threadid = 0;
+       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+       mc_global.mc_socketid = ???;
+#endif
+       mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
+       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               struct domain *d;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+
+               if (!(status & MCi_STATUS_VAL))
+                       continue;
+
+               if (mc_data == NULL) {
+                       /* Now we need a slot to fill in error telemetry. */
+                       mc_data = x86_mcinfo_getptr();
+                       BUG_ON(mc_data == NULL);
+                       x86_mcinfo_clear(mc_data);
+                       x86_mcinfo_add(mc_data, &mc_global);
+               }
+
+               memset(&mc_info, 0, sizeof(mc_info));
+               mc_info.common.type = MC_TYPE_BANK;
+               mc_info.common.size = sizeof(mc_info);
+               mc_info.mc_bank = i;
+               mc_info.mc_status = status;
+
+               /* Increase polling frequency */
+               error_found = 1;
+
+               addrv = 0;
+               if (status & MCi_STATUS_ADDRV) {
+                       rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
+
+                       d = maddr_get_owner(addrv);
+                       if (d != NULL)
+                               mc_info.mc_domid = d->domain_id;
+               }
+
+               miscv = 0;
+               if (status & MCi_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
+
+               mc_info.mc_addr = addrv;
+               mc_info.mc_misc = miscv;
+               x86_mcinfo_add(mc_data, &mc_info);
+
+               if (mc_callback_bank_extended)
+                       mc_callback_bank_extended(mc_data, i, status);
+
+               /* clear status */
+               wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
+               wmb();
+       }
+
+       if (error_found > 0) {
+               /* If Dom0 enabled the VIRQ_MCA event, then ... */
+               if (event_enabled)
+                       /* ... notify it. */
+                       send_guest_global_virq(dom0, VIRQ_MCA);
+               else
+                       /* ... or dump it */
+                       x86_mcinfo_dump(mc_data);
+       }
+
+       adjust += error_found;
+}
+
+static void p4_mce_work_fn(void *data)
+{ 
+       on_each_cpu(x86_mce_checkregs, NULL, 1, 1);
+
+       if (adjust > 0) {
+               if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+                       /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
+                       printk("MCE: polling routine found correctable error. "
+                               " Use mcelog to parse above error output.\n");
+               }
+       }
+
+       if (adjust > 0) {
+               /* Increase polling frequency */
+               adjust++; /* adjust == 1 must have an effect */
+               period /= adjust;
+       } else {
+               /* Decrease polling frequency */
+               period *= 2;
+       }
+       if (period > MCE_MAX) {
+               /* limit: Poll at least every 30s */
+               period = MCE_MAX;
+       }
+       if (period < MCE_MIN) {
+               /* limit: Poll every 2s.
+                * When this is reached an uncorrectable error
+                * is expected to happen, if Dom0 does nothing.
+                */
+               period = MCE_MIN;
+       }
+
+       set_timer(&mce_timer, NOW() + period);
+       adjust = 0;
+}
 
 static void mce_checkregs (void *info)
 {
@@ -85,6 +227,15 @@
                break;
 
        case X86_VENDOR_INTEL:
+               if (c->x86 == 15        /* P4/Xeon */
+#ifdef CONFIG_X86_64
+                   || c->x86 == 6
+#endif
+                   ) {
+                       init_timer(&mce_timer, p4_mce_work_fn, NULL, 0);
+                       set_timer(&mce_timer, NOW() + period);
+                       break;
+               }
                init_timer(&mce_timer, mce_work_fn, NULL, 0);
                set_timer(&mce_timer, NOW() + MCE_PERIOD);
                break;
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/p4.c
--- a/xen/arch/x86/cpu/mcheck/p4.c      Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/p4.c      Mon Dec 15 14:25:07 2008 +0900
@@ -15,6 +15,7 @@
 #include <asm/apic.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 /* as supported by the P4/Xeon family */
 struct intel_mce_extended_msrs {
@@ -32,6 +33,7 @@
 };
 
 static int mce_num_extended_msrs = 0;
+extern int mce_bootlog;
 
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
@@ -158,85 +160,13 @@
        return mce_num_extended_msrs;
 }
 
-static fastcall void intel_machine_check(struct cpu_user_regs * regs, long error_code)
-{
-       int recover=1;
-       u32 alow, ahigh, high, low;
-       u32 mcgstl, mcgsth;
-       int i;
-       struct intel_mce_extended_msrs dbg;
-
-       rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-       if (mcgstl & (1<<0))    /* Recoverable ? */
-               recover=0;
-
-       printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
-               smp_processor_id(), mcgsth, mcgstl);
-
-       if (intel_get_extended_msrs(&dbg)) {
-               printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
-                       smp_processor_id(), dbg.eip, dbg.eflags);
-               printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: %08x\n",
-                       dbg.eax, dbg.ebx, dbg.ecx, dbg.edx);
-               printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: %08x\n",
-                       dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
-       }
-
-       for (i=0; i<nr_mce_banks; i++) {
-               rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
-               if (high & (1<<31)) {
-                       if (high & (1<<29))
-                               recover |= 1;
-                       if (high & (1<<25))
-                               recover |= 2;
-                       printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
-                       high &= ~(1<<31);
-                       if (high & (1<<27)) {
-                               rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
-                               printk ("[%08x%08x]", ahigh, alow);
-                       }
-                       if (high & (1<<26)) {
-                               rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
-                               printk (" at %08x%08x", ahigh, alow);
-                       }
-                       printk ("\n");
-               }
-       }
-
-       if (recover & 2)
-               panic ("CPU context corrupt");
-       if (recover & 1)
-               panic ("Unable to continue");
-
-       printk(KERN_EMERG "Attempting to continue.\n");
-       /* 
-        * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
-        * recoverable/continuable.This will allow BIOS to look at the MSRs
-        * for errors if the OS could not log the error.
-        */
-       for (i=0; i<nr_mce_banks; i++) {
-               u32 msr;
-               msr = MSR_IA32_MC0_STATUS+i*4;
-               rdmsr (msr, low, high);
-               if (high&(1<<31)) {
-                       /* Clear it */
-                       wrmsr(msr, 0UL, 0UL);
-                       /* Serialize */
-                       wmb();
-                       add_taint(TAINT_MACHINE_CHECK);
-               }
-       }
-       mcgstl &= ~(1<<2);
-       wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
-}
-
 
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 l, h;
        int i;
        
-       machine_check_vector = intel_machine_check;
+       machine_check_vector = x86_machine_check;
        wmb();
 
        printk (KERN_INFO "Intel machine check architecture supported.\n");
@@ -244,6 +174,17 @@
        if (l & (1<<8)) /* Control register present ? */
                wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
        nr_mce_banks = l & 0xff;
+
+       /* Log the machine checks left over from the previous reset.
+          This also clears all registers */
+       for (i=0; i<nr_mce_banks; i++) {
+               u64 status;
+               rdmsrl(MSR_IA32_MC0_STATUS + i*4, status);
+               if (status & MCi_STATUS_VAL) {
+                       x86_machine_check(NULL, mce_bootlog ? -1 : -2);
+                       break;
+               }
+       }
 
        for (i=0; i<nr_mce_banks; i++) {
                wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Mon Dec 15 14:25:07 2008 +0900
@@ -70,3 +70,11 @@
 /* reserved bits */
 #define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
 
+/* Polling period */
+#define MCE_PERIOD MILLISECS(15000) /* initial delay used when arming mce_timer for mce_work_fn */
+#define MCE_MIN    MILLISECS(2000) /* lower clamp applied to the period in p4_mce_work_fn */
+#define MCE_MAX    MILLISECS(30000) /* upper clamp applied to the period in p4_mce_work_fn */
+
+/* Common routines */
+void x86_machine_check(struct cpu_user_regs *regs, long error_code); /* installed as machine_check_vector */
+void x86_mce_checkregs(void *info); /* run on every CPU via on_each_cpu() by the polling timer */
diff -r 6595393a3d28 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/traps.c      Mon Dec 15 14:25:07 2008 +0900
@@ -726,8 +726,10 @@
         if ( !opt_allow_hugepage )
             __clear_bit(X86_FEATURE_PSE, &d);
         __clear_bit(X86_FEATURE_PGE, &d);
+#ifndef __x86_64__
         __clear_bit(X86_FEATURE_MCE, &d);
         __clear_bit(X86_FEATURE_MCA, &d);
+#endif
         __clear_bit(X86_FEATURE_PSE36, &d);
     }
     switch ( (uint32_t)regs->eax )
diff -r 6595393a3d28 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c   Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/common/page_alloc.c   Mon Dec 15 14:25:07 2008 +0900
@@ -338,8 +338,14 @@
 
             /* Find smallest order which can satisfy the request. */
             for ( j = order; j <= MAX_ORDER; j++ )
-                if ( !list_empty(&heap(node, zone, j)) )
-                    goto found;
+                if ( !list_empty(&heap(node, zone, j)) ) {
+                    pg = list_entry(heap(node, zone, j).next, struct page_info, list);
+                    if (!(pg->count_info & PGC_reserved))
+                        goto found;
+                    else
+                        printk(XENLOG_DEBUG "Page %p(%lx) is not to be allocated.\n",
+                               pg, page_to_maddr(pg));
+                }
         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
 
         /* Pick next node, wrapping around if needed. */
@@ -402,11 +408,22 @@
     unsigned long mask;
     unsigned int i, node = phys_to_nid(page_to_maddr(pg));
     struct domain *d;
+    int reserved = 0;
 
     ASSERT(zone < NR_ZONES);
     ASSERT(order <= MAX_ORDER);
     ASSERT(node >= 0);
     ASSERT(node < num_online_nodes());
+
+    for ( i = 0; i < (1 << order); i++) {
+        reserved += !!(pg[i].count_info & PGC_reserved);
+        if (!!(pg[i].count_info & PGC_reserved))
+            printk(XENLOG_DEBUG "Page %p(%lx) is not to be freed\n",
+                   &pg[i], page_to_maddr(&pg[i]));
+    }
+
+    if (reserved)
+        return;
 
     for ( i = 0; i < (1 << order); i++ )
     {
diff -r 6595393a3d28 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h  Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/include/asm-x86/mm.h  Mon Dec 15 14:25:07 2008 +0900
@@ -142,8 +142,11 @@
  /* 3-bit PAT/PCD/PWT cache-attribute hint. */
 #define PGC_cacheattr_base  26
 #define PGC_cacheattr_mask  (7U<<PGC_cacheattr_base)
- /* 26-bit count of references to this frame. */
-#define PGC_count_mask      ((1U<<26)-1)
+ /* Set for special pages, which can never be used */
+#define _PGC_reserved      25
+#define PGC_reserved       (1U<<_PGC_reserved)
+ /* 25-bit count of references to this frame. */
+#define PGC_count_mask      ((1U<<25)-1)
 
 #define is_xen_heap_page(page) is_xen_heap_mfn(page_to_mfn(page))
 #define is_xen_heap_mfn(mfn) ({                         \

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] [PATCH 1/2] MCA support with page offlining