[Xen-devel] [RFC][PATCH 1/2] MCA support for Intel64

Hi,

This is a xen part patch.

Signed-off-by: Kazuhiro Suzuki <kaz@xxxxxxxxxxxxxx>

Thanks,
KAZ

diff -r f4552d9f6afb xen/arch/x86/cpu/mcheck/amd_f10.c
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c Tue Sep 23 17:11:33 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c Fri Sep 26 14:30:17 2008 +0900
@@ -82,8 +82,6 @@
 }
 
 
-extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
-
 /* AMD Family10 machine check */
 void amd_f10_mcheck_init(struct cpuinfo_x86 *c) 
 { 
@@ -91,7 +89,7 @@
        uint32_t i;
        int cpu_nr;
 
-       machine_check_vector = k8_machine_check;
+       machine_check_vector = x86_machine_check;
        mc_callback_bank_extended = amd_f10_handler;
        cpu_nr = smp_processor_id();
        wmb();
diff -r f4552d9f6afb xen/arch/x86/cpu/mcheck/amd_k8.c
--- a/xen/arch/x86/cpu/mcheck/amd_k8.c  Tue Sep 23 17:11:33 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c  Fri Sep 26 14:30:17 2008 +0900
@@ -70,219 +70,6 @@
 #include "x86_mca.h"
 
 
-/* Machine Check Handler for AMD K8 family series */
-void k8_machine_check(struct cpu_user_regs *regs, long error_code)
-{
-       struct vcpu *vcpu = current;
-       struct domain *curdom;
-       struct mc_info *mc_data;
-       struct mcinfo_global mc_global;
-       struct mcinfo_bank mc_info;
-       uint64_t status, addrv, miscv, uc;
-       uint32_t i;
-       unsigned int cpu_nr;
-       uint32_t xen_impacted = 0;
-#define DOM_NORMAL     0
-#define DOM0_TRAP      1
-#define DOMU_TRAP      2
-#define DOMU_KILLED    4
-       uint32_t dom_state = DOM_NORMAL;
-
-       /* This handler runs as interrupt gate. So IPIs from the
-        * polling service routine are defered until we finished.
-        */
-
-        /* Disable interrupts for the _vcpu_. It may not re-scheduled to
-        * an other physical CPU or the impacted process in the guest
-        * continues running with corrupted data, otherwise. */
-        vcpu_schedule_lock_irq(vcpu);
-
-       mc_data = x86_mcinfo_getptr();
-       cpu_nr = smp_processor_id();
-       curdom = vcpu->domain;
-
-       memset(&mc_global, 0, sizeof(mc_global));
-       mc_global.common.type = MC_TYPE_GLOBAL;
-       mc_global.common.size = sizeof(mc_global);
-
-       mc_global.mc_domid = curdom->domain_id; /* impacted domain */
-       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
-       BUG_ON(cpu_nr != vcpu->processor);
-       mc_global.mc_core_threadid = 0;
-       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-#if 0 /* TODO: on which socket is this physical core?
-         It's not clear to me how to figure this out. */
-       mc_global.mc_socketid = ???;
-#endif
-       mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
-       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
-       /* Quick check, who is impacted */
-       xen_impacted = is_idle_domain(curdom);
-
-       /* Dom0 */
-       x86_mcinfo_clear(mc_data);
-       x86_mcinfo_add(mc_data, &mc_global);
-
-       for (i = 0; i < nr_mce_banks; i++) {
-               struct domain *d;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
-
-               if (!(status & MCi_STATUS_VAL))
-                       continue;
-
-               /* An error happened in this bank.
-                * This is expected to be an uncorrectable error,
-                * since correctable errors get polled.
-                */
-               uc = status & MCi_STATUS_UC;
-
-               memset(&mc_info, 0, sizeof(mc_info));
-               mc_info.common.type = MC_TYPE_BANK;
-               mc_info.common.size = sizeof(mc_info);
-               mc_info.mc_bank = i;
-               mc_info.mc_status = status;
-
-               addrv = 0;
-               if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
-                       
-                       d = maddr_get_owner(addrv);
-                       if (d != NULL)
-                               mc_info.mc_domid = d->domain_id;
-               }
-
-               miscv = 0;
-               if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
-
-               mc_info.mc_addr = addrv;
-               mc_info.mc_misc = miscv;
-
-               x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
-
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mc_data, i, status);
-
-               /* clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
-               wmb();
-               add_taint(TAINT_MACHINE_CHECK);
-       }
-
-       status = mc_global.mc_gstatus;
-
-       /* clear MCIP or cpu enters shutdown state
-        * in case another MCE occurs. */
-       status &= ~MCG_STATUS_MCIP;
-       wrmsrl(MSR_IA32_MCG_STATUS, status);
-       wmb();
-
-       /* For the details see the discussion "MCE/MCA concept" on xen-devel.
-        * The thread started here:
-        * 
http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
-        */
-
-       /* MCG_STATUS_RIPV: 
-        * When this bit is not set, then the instruction pointer onto the stack
-        * to resume at is not valid. If xen is interrupted, then we panic 
anyway
-        * right below. Otherwise it is up to the guest to figure out if 
-        * guest kernel or guest userland is affected and should kill either
-        * itself or the affected process.
-        */
-
-       /* MCG_STATUS_EIPV:
-        * Evaluation of EIPV is the job of the guest.
-        */
-
-       if (xen_impacted) {
-               /* Now we are going to panic anyway. Allow interrupts, so that
-                * printk on serial console can work. */
-               vcpu_schedule_unlock_irq(vcpu);
-
-               /* Uh, that means, machine check exception
-                * inside Xen occured. */
-               printk("Machine check exception occured in Xen.\n");
-
-               /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
-                * to the error then it makes sense to print a stack trace.
-                * That can be useful for more detailed error analysis and/or
-                * error case studies to figure out, if we can clear
-                * xen_impacted and kill a DomU instead
-                * (i.e. if a guest only control structure is affected, but then
-                * we must ensure the bad pages are not re-used again).
-                */
-               if (status & MCG_STATUS_EIPV) {
-                       printk("MCE: Instruction Pointer is related to the 
error. "
-                               "Therefore, print the execution state.\n");
-                       show_execution_state(regs);
-               }
-               x86_mcinfo_dump(mc_data);
-               panic("End of MCE. Use mcelog to decode above error codes.\n");
-       }
-
-       /* If Dom0 registered a machine check handler, which is only possible
-        * with a PV MCA driver, then ... */
-       if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
-               dom_state = DOM0_TRAP;
-
-               /* ... deliver machine check trap to Dom0. */
-               send_guest_trap(dom0, 0, TRAP_machine_check);
-
-               /* Xen may tell Dom0 now to notify the DomU.
-                * But this will happen through a hypercall. */
-       } else
-               /* Dom0 did not register a machine check handler, but if DomU
-                * did so, then... */
-                if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, 
TRAP_machine_check) ) {
-                       dom_state = DOMU_TRAP;
-
-                       /* ... deliver machine check trap to DomU */
-                       send_guest_trap(curdom, vcpu->vcpu_id, 
TRAP_machine_check);
-       } else {
-               /* hmm... noone feels responsible to handle the error.
-                * So, do a quick check if a DomU is impacted or not.
-                */
-               if (curdom == dom0) {
-                       /* Dom0 is impacted. Since noone can't handle
-                        * this error, panic! */
-                       x86_mcinfo_dump(mc_data);
-                       panic("MCE occured in Dom0, which it can't handle\n");
-
-                       /* UNREACHED */
-               } else {
-                       dom_state = DOMU_KILLED;
-
-                       /* Enable interrupts. This basically results in
-                        * calling sti on the *physical* cpu. But after
-                        * domain_crash() the vcpu pointer is invalid.
-                        * Therefore, we must unlock the irqs before killing
-                        * it. */
-                       vcpu_schedule_unlock_irq(vcpu);
-
-                       /* DomU is impacted. Kill it and continue. */
-                       domain_crash(curdom);
-               }
-       }
-
-
-       switch (dom_state) {
-       case DOM0_TRAP:
-       case DOMU_TRAP:
-               /* Enable interrupts. */
-               vcpu_schedule_unlock_irq(vcpu);
-
-               /* guest softirqs and event callbacks are scheduled
-                * immediately after this handler exits. */
-               break;
-       case DOMU_KILLED:
-               /* Nothing to do here. */
-               break;
-       default:
-               BUG();
-       }
-}
 
 
 /* AMD K8 machine check */
@@ -292,7 +79,7 @@
        uint32_t i;
        int cpu_nr;
 
-       machine_check_vector = k8_machine_check;
+       machine_check_vector = x86_machine_check;
        cpu_nr = smp_processor_id();
        wmb();
 
diff -r f4552d9f6afb xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Tue Sep 23 17:11:33 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c    Fri Sep 26 14:30:17 2008 +0900
@@ -65,117 +65,12 @@
 #include "mce.h"
 #include "x86_mca.h"
 
-static struct timer mce_timer;
+static int hw_threshold = 0;
 
-#define MCE_PERIOD MILLISECS(15000)
-#define MCE_MIN    MILLISECS(2000)
-#define MCE_MAX    MILLISECS(30000)
+extern struct timer mce_timer;
 
-static s_time_t period = MCE_PERIOD;
-static int hw_threshold = 0;
-static int adjust = 0;
-
-/* The polling service routine:
- * Collects information of correctable errors and notifies
- * Dom0 via an event.
- */
-void mce_amd_checkregs(void *info)
-{
-       struct vcpu *vcpu = current;
-       struct mc_info *mc_data;
-       struct mcinfo_global mc_global;
-       struct mcinfo_bank mc_info;
-       uint64_t status, addrv, miscv;
-       unsigned int i;
-       unsigned int event_enabled;
-       unsigned int cpu_nr;
-       int error_found;
-
-       /* We don't need a slot yet. Only allocate one on error. */
-       mc_data = NULL;
-
-       cpu_nr = smp_processor_id();
-       event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
-       error_found = 0;
-
-       memset(&mc_global, 0, sizeof(mc_global));
-       mc_global.common.type = MC_TYPE_GLOBAL;
-       mc_global.common.size = sizeof(mc_global);
-
-       mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
-       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
-       BUG_ON(cpu_nr != vcpu->processor);
-       mc_global.mc_core_threadid = 0;
-       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-#if 0 /* TODO: on which socket is this physical core?
-         It's not clear to me how to figure this out. */
-       mc_global.mc_socketid = ???;
-#endif
-       mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
-       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
-       for (i = 0; i < nr_mce_banks; i++) {
-               struct domain *d;
-
-               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
-
-               if (!(status & MCi_STATUS_VAL))
-                       continue;
-
-               if (mc_data == NULL) {
-                       /* Now we need a slot to fill in error telemetry. */
-                       mc_data = x86_mcinfo_getptr();
-                       BUG_ON(mc_data == NULL);
-                       x86_mcinfo_clear(mc_data);
-                       x86_mcinfo_add(mc_data, &mc_global);
-               }
-
-               memset(&mc_info, 0, sizeof(mc_info));
-               mc_info.common.type = MC_TYPE_BANK;
-               mc_info.common.size = sizeof(mc_info);
-               mc_info.mc_bank = i;
-               mc_info.mc_status = status;
-
-               /* Increase polling frequency */
-               error_found = 1;
-
-               addrv = 0;
-               if (status & MCi_STATUS_ADDRV) {
-                       rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
-
-                       d = maddr_get_owner(addrv);
-                       if (d != NULL)
-                               mc_info.mc_domid = d->domain_id;
-               }
-
-               miscv = 0;
-               if (status & MCi_STATUS_MISCV)
-                       rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
-
-               mc_info.mc_addr = addrv;
-               mc_info.mc_misc = miscv;
-               x86_mcinfo_add(mc_data, &mc_info);
-
-               if (mc_callback_bank_extended)
-                       mc_callback_bank_extended(mc_data, i, status);
-
-               /* clear status */
-               wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
-               wmb();
-       }
-
-       if (error_found > 0) {
-               /* If Dom0 enabled the VIRQ_MCA event, then ... */
-               if (event_enabled)
-                       /* ... notify it. */
-                       send_guest_global_virq(dom0, VIRQ_MCA);
-               else
-                       /* ... or dump it */
-                       x86_mcinfo_dump(mc_data);
-       }
-
-       adjust += error_found;
-}
+extern s_time_t period;
+extern int adjust;
 
 /* polling service routine invoker:
  * Adjust poll frequency at runtime. No error means slow polling frequency,
@@ -186,7 +81,7 @@
  */
 static void mce_amd_work_fn(void *data)
 {
-       on_each_cpu(mce_amd_checkregs, data, 1, 1);
+       on_each_cpu(x86_mce_checkregs, data, 1, 1);
 
        if (adjust > 0) {
                if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
diff -r f4552d9f6afb xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c     Tue Sep 23 17:11:33 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce.c     Fri Sep 26 14:30:17 2008 +0900
@@ -7,6 +7,8 @@
 #include <xen/types.h>
 #include <xen/kernel.h>
 #include <xen/config.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
 #include <xen/smp.h>
 #include <xen/errno.h>
 
@@ -431,6 +433,226 @@
        } while (1);
 }
 
+
+/* Machine Check Handler for AMD K8 family series and Intel P4/Xeon family */
+void x86_machine_check(struct cpu_user_regs *regs, long error_code)
+{
+       struct vcpu *vcpu = current;
+       struct domain *curdom;
+       struct mc_info *mc_data;
+       struct mcinfo_global mc_global;
+       struct mcinfo_bank mc_info;
+       uint64_t status, addrv, miscv, uc;
+       uint32_t i;
+       unsigned int cpu_nr;
+       uint32_t xen_impacted = 0;
+#define DOM_NORMAL     0
+#define DOM0_TRAP      1
+#define DOMU_TRAP      2
+#define DOMU_KILLED    4
+       uint32_t dom_state = DOM_NORMAL;
+
+       /* This handler runs as interrupt gate. So IPIs from the
+        * polling service routine are defered until we finished.
+        */
+
+        /* Disable interrupts for the _vcpu_. It may not re-scheduled to
+        * an other physical CPU or the impacted process in the guest
+        * continues running with corrupted data, otherwise. */
+        vcpu_schedule_lock_irq(vcpu);
+
+       mc_data = x86_mcinfo_getptr();
+       cpu_nr = smp_processor_id();
+       curdom = vcpu->domain;
+
+       memset(&mc_global, 0, sizeof(mc_global));
+       mc_global.common.type = MC_TYPE_GLOBAL;
+       mc_global.common.size = sizeof(mc_global);
+
+       mc_global.mc_domid = curdom->domain_id; /* impacted domain */
+       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+       BUG_ON(cpu_nr != vcpu->processor);
+       mc_global.mc_core_threadid = 0;
+       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+       mc_global.mc_socketid = ???;
+#endif
+       mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
+       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+       /* Quick check, who is impacted */
+       xen_impacted = is_idle_domain(curdom);
+
+       /* Dom0 */
+       x86_mcinfo_clear(mc_data);
+       x86_mcinfo_add(mc_data, &mc_global);
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               struct domain *d;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+
+               if (!(status & MCi_STATUS_VAL))
+                       continue;
+
+               /* An error happened in this bank.
+                * This is expected to be an uncorrectable error,
+                * since correctable errors get polled.
+                */
+               uc = status & MCi_STATUS_UC;
+
+               memset(&mc_info, 0, sizeof(mc_info));
+               mc_info.common.type = MC_TYPE_BANK;
+               mc_info.common.size = sizeof(mc_info);
+               mc_info.mc_bank = i;
+               mc_info.mc_status = status;
+
+               addrv = 0;
+               if (status & MCi_STATUS_ADDRV) {
+                       rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
+                       
+                       d = maddr_get_owner(addrv);
+                       if (d != NULL)
+                               mc_info.mc_domid = d->domain_id;
+               }
+
+               miscv = 0;
+               if (status & MCi_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
+
+               mc_info.mc_addr = addrv;
+               mc_info.mc_misc = miscv;
+
+               x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
+
+               if (mc_callback_bank_extended)
+                       mc_callback_bank_extended(mc_data, i, status);
+
+               /* clear status */
+               wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+               wmb();
+               add_taint(TAINT_MACHINE_CHECK);
+       }
+
+       /* Never do anything final for the previous reset */
+       if (!regs) {
+               vcpu_schedule_unlock_irq(vcpu);
+               return;
+       }
+
+       status = mc_global.mc_gstatus;
+
+       /* clear MCIP or cpu enters shutdown state
+        * in case another MCE occurs. */
+       status &= ~MCG_STATUS_MCIP;
+       wrmsrl(MSR_IA32_MCG_STATUS, status);
+       wmb();
+
+       /* For the details see the discussion "MCE/MCA concept" on xen-devel.
+        * The thread started here:
+        * 
http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
+        */
+
+       /* MCG_STATUS_RIPV: 
+        * When this bit is not set, then the instruction pointer onto the stack
+        * to resume at is not valid. If xen is interrupted, then we panic 
anyway
+        * right below. Otherwise it is up to the guest to figure out if 
+        * guest kernel or guest userland is affected and should kill either
+        * itself or the affected process.
+        */
+
+       /* MCG_STATUS_EIPV:
+        * Evaluation of EIPV is the job of the guest.
+        */
+
+       if (xen_impacted) {
+               /* Now we are going to panic anyway. Allow interrupts, so that
+                * printk on serial console can work. */
+               vcpu_schedule_unlock_irq(vcpu);
+
+               /* Uh, that means, machine check exception
+                * inside Xen occured. */
+               printk("Machine check exception occured in Xen.\n");
+
+               /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
+                * to the error then it makes sense to print a stack trace.
+                * That can be useful for more detailed error analysis and/or
+                * error case studies to figure out, if we can clear
+                * xen_impacted and kill a DomU instead
+                * (i.e. if a guest only control structure is affected, but then
+                * we must ensure the bad pages are not re-used again).
+                */
+               if (status & MCG_STATUS_EIPV) {
+                       printk("MCE: Instruction Pointer is related to the 
error. "
+                               "Therefore, print the execution state.\n");
+                       show_execution_state(regs);
+               }
+               x86_mcinfo_dump(mc_data);
+               panic("End of MCE. Use mcelog to decode above error codes.\n");
+       }
+
+       /* If Dom0 registered a machine check handler, which is only possible
+        * with a PV MCA driver, then ... */
+       if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
+               dom_state = DOM0_TRAP;
+
+               /* ... deliver machine check trap to Dom0. */
+               send_guest_trap(dom0, 0, TRAP_machine_check);
+
+               /* Xen may tell Dom0 now to notify the DomU.
+                * But this will happen through a hypercall. */
+       } else
+               /* Dom0 did not register a machine check handler, but if DomU
+                * did so, then... */
+                if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, 
TRAP_machine_check) ) {
+                       dom_state = DOMU_TRAP;
+
+                       /* ... deliver machine check trap to DomU */
+                       send_guest_trap(curdom, vcpu->vcpu_id, 
TRAP_machine_check);
+       } else {
+               /* hmm... noone feels responsible to handle the error.
+                * So, do a quick check if a DomU is impacted or not.
+                */
+               if (curdom == dom0) {
+                       /* Dom0 is impacted. Since noone can't handle
+                        * this error, panic! */
+                       x86_mcinfo_dump(mc_data);
+                       panic("MCE occured in Dom0, which it can't handle\n");
+
+                       /* UNREACHED */
+               } else {
+                       dom_state = DOMU_KILLED;
+
+                       /* Enable interrupts. This basically results in
+                        * calling sti on the *physical* cpu. But after
+                        * domain_crash() the vcpu pointer is invalid.
+                        * Therefore, we must unlock the irqs before killing
+                        * it. */
+                       vcpu_schedule_unlock_irq(vcpu);
+
+                       /* DomU is impacted. Kill it and continue. */
+                       domain_crash(curdom);
+               }
+       }
+
+
+       switch (dom_state) {
+       case DOM0_TRAP:
+       case DOMU_TRAP:
+               /* Enable interrupts. */
+               vcpu_schedule_unlock_irq(vcpu);
+
+               /* guest softirqs and event callbacks are scheduled
+                * immediately after this handler exits. */
+               break;
+       case DOMU_KILLED:
+               /* Nothing to do here. */
+               break;
+       default:
+               BUG();
+       }
+}
 
 
 /* Machine Check Architecture Hypercall */
diff -r f4552d9f6afb xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c       Tue Sep 23 17:11:33 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c       Fri Sep 26 14:30:17 2008 +0900
@@ -14,16 +14,158 @@
 #include <xen/smp.h>
 #include <xen/timer.h>
 #include <xen/errno.h>
+#include <xen/event.h>
 #include <asm/processor.h> 
 #include <asm/system.h>
 #include <asm/msr.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 static int firstbank;
-static struct timer mce_timer;
 
-#define MCE_PERIOD MILLISECS(15000)
+struct timer mce_timer;
+
+s_time_t period = MCE_PERIOD;
+int adjust = 0;
+
+/* The polling service routine:
+ * Collects information of correctable errors and notifies
+ * Dom0 via an event.
+ */
+void x86_mce_checkregs(void *info)
+{
+       struct vcpu *vcpu = current;
+       struct mc_info *mc_data;
+       struct mcinfo_global mc_global;
+       struct mcinfo_bank mc_info;
+       uint64_t status, addrv, miscv;
+       unsigned int i;
+       unsigned int event_enabled;
+       unsigned int cpu_nr;
+       int error_found;
+
+       /* We don't need a slot yet. Only allocate one on error. */
+       mc_data = NULL;
+
+       cpu_nr = smp_processor_id();
+       event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
+       error_found = 0;
+
+       memset(&mc_global, 0, sizeof(mc_global));
+       mc_global.common.type = MC_TYPE_GLOBAL;
+       mc_global.common.size = sizeof(mc_global);
+
+       mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
+       mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+       BUG_ON(cpu_nr != vcpu->processor);
+       mc_global.mc_core_threadid = 0;
+       mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+       mc_global.mc_socketid = ???;
+#endif
+       mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
+       rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+       for (i = 0; i < nr_mce_banks; i++) {
+               struct domain *d;
+
+               rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+
+               if (!(status & MCi_STATUS_VAL))
+                       continue;
+
+               if (mc_data == NULL) {
+                       /* Now we need a slot to fill in error telemetry. */
+                       mc_data = x86_mcinfo_getptr();
+                       BUG_ON(mc_data == NULL);
+                       x86_mcinfo_clear(mc_data);
+                       x86_mcinfo_add(mc_data, &mc_global);
+               }
+
+               memset(&mc_info, 0, sizeof(mc_info));
+               mc_info.common.type = MC_TYPE_BANK;
+               mc_info.common.size = sizeof(mc_info);
+               mc_info.mc_bank = i;
+               mc_info.mc_status = status;
+
+               /* Increase polling frequency */
+               error_found = 1;
+
+               addrv = 0;
+               if (status & MCi_STATUS_ADDRV) {
+                       rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
+
+                       d = maddr_get_owner(addrv);
+                       if (d != NULL)
+                               mc_info.mc_domid = d->domain_id;
+               }
+
+               miscv = 0;
+               if (status & MCi_STATUS_MISCV)
+                       rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
+
+               mc_info.mc_addr = addrv;
+               mc_info.mc_misc = miscv;
+               x86_mcinfo_add(mc_data, &mc_info);
+
+               if (mc_callback_bank_extended)
+                       mc_callback_bank_extended(mc_data, i, status);
+
+               /* clear status */
+               wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
+               wmb();
+       }
+
+       if (error_found > 0) {
+               /* If Dom0 enabled the VIRQ_MCA event, then ... */
+               if (event_enabled)
+                       /* ... notify it. */
+                       send_guest_global_virq(dom0, VIRQ_MCA);
+               else
+                       /* ... or dump it */
+                       x86_mcinfo_dump(mc_data);
+       }
+
+       adjust += error_found;
+}
+
+static void p4_mce_work_fn(void *data)
+{ 
+       on_each_cpu(x86_mce_checkregs, NULL, 1, 1);
+
+       if (adjust > 0) {
+               if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+                       /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
+                       printk("MCE: polling routine found correctable error. "
+                               " Use mcelog to parse above error output.\n");
+               }
+       }
+
+       if (adjust > 0) {
+               /* Increase polling frequency */
+               adjust++; /* adjust == 1 must have an effect */
+               period /= adjust;
+       } else {
+               /* Decrease polling frequency */
+               period *= 2;
+       }
+       if (period > MCE_MAX) {
+               /* limit: Poll at least every 30s */
+               period = MCE_MAX;
+       }
+       if (period < MCE_MIN) {
+               /* limit: Poll every 2s.
+                * When this is reached an uncorrectable error
+                * is expected to happen, if Dom0 does nothing.
+                */
+               period = MCE_MIN;
+       }
+
+       set_timer(&mce_timer, NOW() + period);
+       adjust = 0;
+}
 
 static void mce_checkregs (void *info)
 {
@@ -85,6 +227,11 @@
                break;
 
        case X86_VENDOR_INTEL:
+               if (c->x86 == 15) { /* P4/Xeon */
+                       init_timer(&mce_timer, p4_mce_work_fn, NULL, 0);
+                       set_timer(&mce_timer, NOW() + period);
+                       break;
+               }
                init_timer(&mce_timer, mce_work_fn, NULL, 0);
                set_timer(&mce_timer, NOW() + MCE_PERIOD);
                break;
diff -r f4552d9f6afb xen/arch/x86/cpu/mcheck/p4.c
--- a/xen/arch/x86/cpu/mcheck/p4.c      Tue Sep 23 17:11:33 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/p4.c      Fri Sep 26 14:30:17 2008 +0900
@@ -15,6 +15,7 @@
 #include <asm/apic.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 /* as supported by the P4/Xeon family */
 struct intel_mce_extended_msrs {
@@ -32,6 +33,7 @@
 };
 
 static int mce_num_extended_msrs = 0;
+static int mce_bootlog = 1;
 
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
@@ -158,85 +160,13 @@
        return mce_num_extended_msrs;
 }
 
-static fastcall void intel_machine_check(struct cpu_user_regs * regs, long 
error_code)
-{
-       int recover=1;
-       u32 alow, ahigh, high, low;
-       u32 mcgstl, mcgsth;
-       int i;
-       struct intel_mce_extended_msrs dbg;
-
-       rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
-       if (mcgstl & (1<<0))    /* Recoverable ? */
-               recover=0;
-
-       printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
-               smp_processor_id(), mcgsth, mcgstl);
-
-       if (intel_get_extended_msrs(&dbg)) {
-               printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
-                       smp_processor_id(), dbg.eip, dbg.eflags);
-               printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx: 
%08x\n",
-                       dbg.eax, dbg.ebx, dbg.ecx, dbg.edx);
-               printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp: 
%08x\n",
-                       dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
-       }
-
-       for (i=0; i<nr_mce_banks; i++) {
-               rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
-               if (high & (1<<31)) {
-                       if (high & (1<<29))
-                               recover |= 1;
-                       if (high & (1<<25))
-                               recover |= 2;
-                       printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
-                       high &= ~(1<<31);
-                       if (high & (1<<27)) {
-                               rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
-                               printk ("[%08x%08x]", ahigh, alow);
-                       }
-                       if (high & (1<<26)) {
-                               rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
-                               printk (" at %08x%08x", ahigh, alow);
-                       }
-                       printk ("\n");
-               }
-       }
-
-       if (recover & 2)
-               panic ("CPU context corrupt");
-       if (recover & 1)
-               panic ("Unable to continue");
-
-       printk(KERN_EMERG "Attempting to continue.\n");
-       /* 
-        * Do not clear the MSR_IA32_MCi_STATUS if the error is not 
-        * recoverable/continuable.This will allow BIOS to look at the MSRs
-        * for errors if the OS could not log the error.
-        */
-       for (i=0; i<nr_mce_banks; i++) {
-               u32 msr;
-               msr = MSR_IA32_MC0_STATUS+i*4;
-               rdmsr (msr, low, high);
-               if (high&(1<<31)) {
-                       /* Clear it */
-                       wrmsr(msr, 0UL, 0UL);
-                       /* Serialize */
-                       wmb();
-                       add_taint(TAINT_MACHINE_CHECK);
-               }
-       }
-       mcgstl &= ~(1<<2);
-       wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
-}
-
 
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
 {
        u32 l, h;
        int i;
        
-       machine_check_vector = intel_machine_check;
+       machine_check_vector = x86_machine_check;
        wmb();
 
        printk (KERN_INFO "Intel machine check architecture supported.\n");
@@ -244,6 +174,10 @@
        if (l & (1<<8)) /* Control register present ? */
                wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
        nr_mce_banks = l & 0xff;
+
+       /* Log the machine checks left over from the previous reset.
+          This also clears all registers */
+       x86_machine_check(NULL, mce_bootlog ? -1 : -2);
 
        for (i=0; i<nr_mce_banks; i++) {
                wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
diff -r f4552d9f6afb xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Sep 23 17:11:33 2008 +0100
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Fri Sep 26 14:30:17 2008 +0900
@@ -70,3 +70,11 @@
 /* reserved bits */
 #define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
 
+/* Polling period */
+#define MCE_PERIOD MILLISECS(15000)
+#define MCE_MIN    MILLISECS(2000)
+#define MCE_MAX    MILLISECS(30000)
+
+/* Common routines */
+void x86_machine_check(struct cpu_user_regs *regs, long error_code);
+void x86_mce_checkregs(void *info);
diff -r f4552d9f6afb xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c      Tue Sep 23 17:11:33 2008 +0100
+++ b/xen/arch/x86/traps.c      Fri Sep 26 14:30:17 2008 +0900
@@ -713,8 +713,10 @@
         __clear_bit(X86_FEATURE_VME, &d);
         __clear_bit(X86_FEATURE_PSE, &d);
         __clear_bit(X86_FEATURE_PGE, &d);
+#ifndef __x86_64__
         __clear_bit(X86_FEATURE_MCE, &d);
         __clear_bit(X86_FEATURE_MCA, &d);
+#endif
         __clear_bit(X86_FEATURE_PSE36, &d);
     }
     switch ( (uint32_t)regs->eax )

_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel

WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] [RFC][PATCH 1/2] MCA support for Intel64