This is xen part.
[1/2] xen part: mca-support-with-page-offlining-xen.patch
Signed-off-by: Kazuhiro Suzuki <kaz@xxxxxxxxxxxxxx>
Thanks,
KAZ
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/amd_f10.c
--- a/xen/arch/x86/cpu/mcheck/amd_f10.c Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c Mon Dec 15 14:25:07 2008 +0900
@@ -82,8 +82,6 @@
}
-extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
-
/* AMD Family10 machine check */
void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
{
@@ -91,7 +89,7 @@
uint32_t i;
int cpu_nr;
- machine_check_vector = k8_machine_check;
+ machine_check_vector = x86_machine_check;
mc_callback_bank_extended = amd_f10_handler;
cpu_nr = smp_processor_id();
wmb();
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/amd_k8.c
--- a/xen/arch/x86/cpu/mcheck/amd_k8.c Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c Mon Dec 15 14:25:07 2008 +0900
@@ -69,220 +69,8 @@
#include "mce.h"
#include "x86_mca.h"
+extern int mce_bootlog;
-/* Machine Check Handler for AMD K8 family series */
-void k8_machine_check(struct cpu_user_regs *regs, long error_code)
-{
- struct vcpu *vcpu = current;
- struct domain *curdom;
- struct mc_info *mc_data;
- struct mcinfo_global mc_global;
- struct mcinfo_bank mc_info;
- uint64_t status, addrv, miscv, uc;
- uint32_t i;
- unsigned int cpu_nr;
- uint32_t xen_impacted = 0;
-#define DOM_NORMAL 0
-#define DOM0_TRAP 1
-#define DOMU_TRAP 2
-#define DOMU_KILLED 4
- uint32_t dom_state = DOM_NORMAL;
-
- /* This handler runs as interrupt gate. So IPIs from the
- * polling service routine are defered until we finished.
- */
-
- /* Disable interrupts for the _vcpu_. It may not re-scheduled to
- * an other physical CPU or the impacted process in the guest
- * continues running with corrupted data, otherwise. */
- vcpu_schedule_lock_irq(vcpu);
-
- mc_data = x86_mcinfo_getptr();
- cpu_nr = smp_processor_id();
- curdom = vcpu->domain;
-
- memset(&mc_global, 0, sizeof(mc_global));
- mc_global.common.type = MC_TYPE_GLOBAL;
- mc_global.common.size = sizeof(mc_global);
-
- mc_global.mc_domid = curdom->domain_id; /* impacted domain */
- mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
- BUG_ON(cpu_nr != vcpu->processor);
- mc_global.mc_core_threadid = 0;
- mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-#if 0 /* TODO: on which socket is this physical core?
- It's not clear to me how to figure this out. */
- mc_global.mc_socketid = ???;
-#endif
- mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
- rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
- /* Quick check, who is impacted */
- xen_impacted = is_idle_domain(curdom);
-
- /* Dom0 */
- x86_mcinfo_clear(mc_data);
- x86_mcinfo_add(mc_data, &mc_global);
-
- for (i = 0; i < nr_mce_banks; i++) {
- struct domain *d;
-
- rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
-
- if (!(status & MCi_STATUS_VAL))
- continue;
-
- /* An error happened in this bank.
- * This is expected to be an uncorrectable error,
- * since correctable errors get polled.
- */
- uc = status & MCi_STATUS_UC;
-
- memset(&mc_info, 0, sizeof(mc_info));
- mc_info.common.type = MC_TYPE_BANK;
- mc_info.common.size = sizeof(mc_info);
- mc_info.mc_bank = i;
- mc_info.mc_status = status;
-
- addrv = 0;
- if (status & MCi_STATUS_ADDRV) {
- rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
-
- d = maddr_get_owner(addrv);
- if (d != NULL)
- mc_info.mc_domid = d->domain_id;
- }
-
- miscv = 0;
- if (status & MCi_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
-
- mc_info.mc_addr = addrv;
- mc_info.mc_misc = miscv;
-
- x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
-
- if (mc_callback_bank_extended)
- mc_callback_bank_extended(mc_data, i, status);
-
- /* clear status */
- wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
-
- status = mc_global.mc_gstatus;
-
- /* clear MCIP or cpu enters shutdown state
- * in case another MCE occurs. */
- status &= ~MCG_STATUS_MCIP;
- wrmsrl(MSR_IA32_MCG_STATUS, status);
- wmb();
-
- /* For the details see the discussion "MCE/MCA concept" on xen-devel.
- * The thread started here:
- *
http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
- */
-
- /* MCG_STATUS_RIPV:
- * When this bit is not set, then the instruction pointer onto the stack
- * to resume at is not valid. If xen is interrupted, then we panic
anyway
- * right below. Otherwise it is up to the guest to figure out if
- * guest kernel or guest userland is affected and should kill either
- * itself or the affected process.
- */
-
- /* MCG_STATUS_EIPV:
- * Evaluation of EIPV is the job of the guest.
- */
-
- if (xen_impacted) {
- /* Now we are going to panic anyway. Allow interrupts, so that
- * printk on serial console can work. */
- vcpu_schedule_unlock_irq(vcpu);
-
- /* Uh, that means, machine check exception
- * inside Xen occured. */
- printk("Machine check exception occured in Xen.\n");
-
- /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
- * to the error then it makes sense to print a stack trace.
- * That can be useful for more detailed error analysis and/or
- * error case studies to figure out, if we can clear
- * xen_impacted and kill a DomU instead
- * (i.e. if a guest only control structure is affected, but then
- * we must ensure the bad pages are not re-used again).
- */
- if (status & MCG_STATUS_EIPV) {
- printk("MCE: Instruction Pointer is related to the
error. "
- "Therefore, print the execution state.\n");
- show_execution_state(regs);
- }
- x86_mcinfo_dump(mc_data);
- panic("End of MCE. Use mcelog to decode above error codes.\n");
- }
-
- /* If Dom0 registered a machine check handler, which is only possible
- * with a PV MCA driver, then ... */
- if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
- dom_state = DOM0_TRAP;
-
- /* ... deliver machine check trap to Dom0. */
- send_guest_trap(dom0, 0, TRAP_machine_check);
-
- /* Xen may tell Dom0 now to notify the DomU.
- * But this will happen through a hypercall. */
- } else
- /* Dom0 did not register a machine check handler, but if DomU
- * did so, then... */
- if ( guest_has_trap_callback(curdom, vcpu->vcpu_id,
TRAP_machine_check) ) {
- dom_state = DOMU_TRAP;
-
- /* ... deliver machine check trap to DomU */
- send_guest_trap(curdom, vcpu->vcpu_id,
TRAP_machine_check);
- } else {
- /* hmm... noone feels responsible to handle the error.
- * So, do a quick check if a DomU is impacted or not.
- */
- if (curdom == dom0) {
- /* Dom0 is impacted. Since noone can't handle
- * this error, panic! */
- x86_mcinfo_dump(mc_data);
- panic("MCE occured in Dom0, which it can't handle\n");
-
- /* UNREACHED */
- } else {
- dom_state = DOMU_KILLED;
-
- /* Enable interrupts. This basically results in
- * calling sti on the *physical* cpu. But after
- * domain_crash() the vcpu pointer is invalid.
- * Therefore, we must unlock the irqs before killing
- * it. */
- vcpu_schedule_unlock_irq(vcpu);
-
- /* DomU is impacted. Kill it and continue. */
- domain_crash(curdom);
- }
- }
-
-
- switch (dom_state) {
- case DOM0_TRAP:
- case DOMU_TRAP:
- /* Enable interrupts. */
- vcpu_schedule_unlock_irq(vcpu);
-
- /* guest softirqs and event callbacks are scheduled
- * immediately after this handler exits. */
- break;
- case DOMU_KILLED:
- /* Nothing to do here. */
- break;
- default:
- BUG();
- }
-}
/* AMD K8 machine check */
@@ -292,7 +80,7 @@
uint32_t i;
int cpu_nr;
- machine_check_vector = k8_machine_check;
+ machine_check_vector = x86_machine_check;
cpu_nr = smp_processor_id();
wmb();
@@ -300,6 +88,17 @@
if (value & MCG_CTL_P) /* Control register present ? */
wrmsrl (MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
nr_mce_banks = value & MCG_CAP_COUNT;
+
+ /* Log the machine checks left over from the previous reset.
+ This also clears all registers */
+ for (i=0; i<nr_mce_banks; i++) {
+ u64 status;
+ rdmsrl(MSR_IA32_MC0_STATUS + i*4, status);
+ if (status & MCi_STATUS_VAL) {
+ x86_machine_check(NULL, mce_bootlog ? -1 : -2);
+ break;
+ }
+ }
for (i = 0; i < nr_mce_banks; i++) {
switch (i) {
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- a/xen/arch/x86/cpu/mcheck/amd_nonfatal.c Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c Mon Dec 15 14:25:07 2008 +0900
@@ -65,117 +65,12 @@
#include "mce.h"
#include "x86_mca.h"
-static struct timer mce_timer;
+static int hw_threshold = 0;
-#define MCE_PERIOD MILLISECS(15000)
-#define MCE_MIN MILLISECS(2000)
-#define MCE_MAX MILLISECS(30000)
+extern struct timer mce_timer;
-static s_time_t period = MCE_PERIOD;
-static int hw_threshold = 0;
-static int adjust = 0;
-
-/* The polling service routine:
- * Collects information of correctable errors and notifies
- * Dom0 via an event.
- */
-void mce_amd_checkregs(void *info)
-{
- struct vcpu *vcpu = current;
- struct mc_info *mc_data;
- struct mcinfo_global mc_global;
- struct mcinfo_bank mc_info;
- uint64_t status, addrv, miscv;
- unsigned int i;
- unsigned int event_enabled;
- unsigned int cpu_nr;
- int error_found;
-
- /* We don't need a slot yet. Only allocate one on error. */
- mc_data = NULL;
-
- cpu_nr = smp_processor_id();
- event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
- error_found = 0;
-
- memset(&mc_global, 0, sizeof(mc_global));
- mc_global.common.type = MC_TYPE_GLOBAL;
- mc_global.common.size = sizeof(mc_global);
-
- mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
- mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
- BUG_ON(cpu_nr != vcpu->processor);
- mc_global.mc_core_threadid = 0;
- mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
-#if 0 /* TODO: on which socket is this physical core?
- It's not clear to me how to figure this out. */
- mc_global.mc_socketid = ???;
-#endif
- mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
- rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
-
- for (i = 0; i < nr_mce_banks; i++) {
- struct domain *d;
-
- rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
-
- if (!(status & MCi_STATUS_VAL))
- continue;
-
- if (mc_data == NULL) {
- /* Now we need a slot to fill in error telemetry. */
- mc_data = x86_mcinfo_getptr();
- BUG_ON(mc_data == NULL);
- x86_mcinfo_clear(mc_data);
- x86_mcinfo_add(mc_data, &mc_global);
- }
-
- memset(&mc_info, 0, sizeof(mc_info));
- mc_info.common.type = MC_TYPE_BANK;
- mc_info.common.size = sizeof(mc_info);
- mc_info.mc_bank = i;
- mc_info.mc_status = status;
-
- /* Increase polling frequency */
- error_found = 1;
-
- addrv = 0;
- if (status & MCi_STATUS_ADDRV) {
- rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
-
- d = maddr_get_owner(addrv);
- if (d != NULL)
- mc_info.mc_domid = d->domain_id;
- }
-
- miscv = 0;
- if (status & MCi_STATUS_MISCV)
- rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
-
- mc_info.mc_addr = addrv;
- mc_info.mc_misc = miscv;
- x86_mcinfo_add(mc_data, &mc_info);
-
- if (mc_callback_bank_extended)
- mc_callback_bank_extended(mc_data, i, status);
-
- /* clear status */
- wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
- wmb();
- }
-
- if (error_found > 0) {
- /* If Dom0 enabled the VIRQ_MCA event, then ... */
- if (event_enabled)
- /* ... notify it. */
- send_guest_global_virq(dom0, VIRQ_MCA);
- else
- /* ... or dump it */
- x86_mcinfo_dump(mc_data);
- }
-
- adjust += error_found;
-}
+extern s_time_t period;
+extern int adjust;
/* polling service routine invoker:
* Adjust poll frequency at runtime. No error means slow polling frequency,
@@ -186,7 +81,7 @@
*/
static void mce_amd_work_fn(void *data)
{
- on_each_cpu(mce_amd_checkregs, data, 1, 1);
+ on_each_cpu(x86_mce_checkregs, data, 1, 1);
if (adjust > 0) {
if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/mce.c Mon Dec 15 14:25:07 2008 +0900
@@ -7,6 +7,9 @@
#include <xen/types.h>
#include <xen/kernel.h>
#include <xen/config.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/paging.h>
#include <xen/smp.h>
#include <xen/errno.h>
@@ -18,6 +21,12 @@
int mce_disabled = 0;
unsigned int nr_mce_banks;
+int mce_bootlog = 1;
+
+#define MAX_PAGE_OFFLINING 1024
+
+static struct page_info *page_offlining[MAX_PAGE_OFFLINING];
+static int num_page_offlining = 0;
EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
@@ -136,6 +145,9 @@
intel_p5_mcheck_init(c);
if (c->x86==6)
intel_p6_mcheck_init(c);
+#else
+ if (c->x86==6)
+ intel_p4_mcheck_init(c);
#endif
if (c->x86==15)
intel_p4_mcheck_init(c);
@@ -159,9 +171,19 @@
mce_disabled = 1;
}
+/* mce=off disables machine check.
+ mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
+ mce=nobootlog Don't log MCEs from before booting. */
static void __init mcheck_enable(char *str)
{
- mce_disabled = -1;
+ if (*str == '=')
+ str++;
+ if (!strcmp(str, "off"))
+ mce_disabled = 1;
+ else if (!strcmp(str, "bootlog") || !strcmp(str,"nobootlog"))
+ mce_bootlog = str[0] == 'b';
+ else
+ printk("mce= argument %s ignored.", str);
}
custom_param("nomce", mcheck_disable);
@@ -221,6 +243,12 @@
/* This function is called from the fetch hypercall with
* the mc_lock spinlock held. Thus, no need for locking here.
*/
+
+ /* Return NULL if no data is available. */
+ if (mc_data.fetch_idx == mc_data.error_idx) {
+ *fetch_idx = mc_data.fetch_idx;
+ return NULL;
+ }
mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
/* Bogus domU command detected. */
@@ -431,6 +459,272 @@
} while (1);
}
+static int x86_page_offlining(unsigned long maddr, struct domain *d)
+{
+ int i;
+ struct page_info *pg;
+
+ if (!mfn_valid(maddr >> PAGE_SHIFT)) {
+ printk(XENLOG_ERR "Page offlining: ( %lx ) invalid.\n", maddr);
+ return -1;
+ }
+
+ /* convert the machine address to its page_info structure */
+ pg = maddr_to_page(maddr);
+
+ if (pg == NULL) {
+ printk(XENLOG_ERR "Page offlining: ( %lx ) not found.\n",
+ maddr);
+ return -1;
+ }
+
+ /* check whether this page has already been registered */
+ for (i = 0; i < num_page_offlining; i++)
+ if (page_offlining[i] == pg)
+ goto out;
+
+ /* check the registration limit, and whether the page already has the 'reserved' attribute */
+ if (num_page_offlining == MAX_PAGE_OFFLINING ||
+ pg->count_info & PGC_reserved) {
+ printk(XENLOG_DEBUG "Page offlining: ( %lx ) failure.\n",
+ maddr);
+ return 1;
+ }
+
+ /* add attribute 'reserved' and register the page */
+ get_page(pg, d);
+ pg->count_info |= PGC_reserved;
+ page_offlining[num_page_offlining++] = pg;
+
+ out:
+ printk(XENLOG_DEBUG "Page offlining: ( %lx ) success.\n", maddr);
+ return 0;
+}
+
+
+/* Machine Check Handler for AMD K8 family series and Intel P4/Xeon family */
+void x86_machine_check(struct cpu_user_regs *regs, long error_code)
+{
+ struct vcpu *vcpu = current;
+ struct domain *curdom;
+ struct mc_info *mc_data;
+ struct mcinfo_global mc_global;
+ struct mcinfo_bank mc_info;
+ uint64_t status, addrv, miscv, uc;
+ uint32_t i;
+ unsigned int cpu_nr;
+ uint32_t xen_impacted = 0;
+#define DOM_NORMAL 0
+#define DOM0_TRAP 1
+#define DOMU_TRAP 2
+#define DOMU_KILLED 4
+ uint32_t dom_state = DOM_NORMAL;
+
+ /* This handler runs as an interrupt gate. So IPIs from the
+ * polling service routine are deferred until we have finished.
+ */
+
+ /* Disable interrupts for the _vcpu_. Otherwise it might be
+ * re-scheduled to another physical CPU, or the impacted process
+ * in the guest would continue running with corrupted data. */
+ vcpu_schedule_lock_irq(vcpu);
+
+ mc_data = x86_mcinfo_getptr();
+ cpu_nr = smp_processor_id();
+ curdom = vcpu->domain;
+
+ memset(&mc_global, 0, sizeof(mc_global));
+ mc_global.common.type = MC_TYPE_GLOBAL;
+ mc_global.common.size = sizeof(mc_global);
+
+ mc_global.mc_domid = curdom->domain_id; /* impacted domain */
+ mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+ BUG_ON(cpu_nr != vcpu->processor);
+ mc_global.mc_core_threadid = 0;
+ mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+ It's not clear to me how to figure this out. */
+ mc_global.mc_socketid = ???;
+#endif
+ mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
+ rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+ /* Quick check, who is impacted */
+ xen_impacted = is_idle_domain(curdom);
+
+ /* Dom0 */
+ x86_mcinfo_clear(mc_data);
+ x86_mcinfo_add(mc_data, &mc_global);
+
+ for (i = 0; i < nr_mce_banks; i++) {
+ struct domain *d;
+
+ rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+
+ if (!(status & MCi_STATUS_VAL))
+ continue;
+
+ /* An error happened in this bank.
+ * This is expected to be an uncorrectable error,
+ * since correctable errors get polled.
+ */
+ uc = status & MCi_STATUS_UC;
+
+ memset(&mc_info, 0, sizeof(mc_info));
+ mc_info.common.type = MC_TYPE_BANK;
+ mc_info.common.size = sizeof(mc_info);
+ mc_info.mc_bank = i;
+ mc_info.mc_status = status;
+
+ addrv = 0;
+ if (status & MCi_STATUS_ADDRV) {
+ rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
+
+ d = maddr_get_owner(addrv);
+ if (d != NULL) {
+ mc_info.mc_domid = d->domain_id;
+
+ /* Page offlining */
+ x86_page_offlining(addrv, d);
+ }
+ }
+
+ miscv = 0;
+ if (status & MCi_STATUS_MISCV)
+ rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
+
+ mc_info.mc_addr = addrv;
+ mc_info.mc_misc = miscv;
+
+ x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
+
+ if (mc_callback_bank_extended)
+ mc_callback_bank_extended(mc_data, i, status);
+
+ /* clear status */
+ wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+ wmb();
+ add_taint(TAINT_MACHINE_CHECK);
+ }
+
+ /* Never do anything final for the previous reset */
+ if (!regs) {
+ vcpu_schedule_unlock_irq(vcpu);
+ return;
+ }
+
+ status = mc_global.mc_gstatus;
+
+ /* clear MCIP or cpu enters shutdown state
+ * in case another MCE occurs. */
+ status &= ~MCG_STATUS_MCIP;
+ wrmsrl(MSR_IA32_MCG_STATUS, status);
+ wmb();
+
+ /* For the details see the discussion "MCE/MCA concept" on xen-devel.
+ * The thread started here:
+ *
http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
+ */
+
+ /* MCG_STATUS_RIPV:
+ * When this bit is not set, then the instruction pointer onto the stack
+ * to resume at is not valid. If xen is interrupted, then we panic
anyway
+ * right below. Otherwise it is up to the guest to figure out if
+ * guest kernel or guest userland is affected and should kill either
+ * itself or the affected process.
+ */
+
+ /* MCG_STATUS_EIPV:
+ * Evaluation of EIPV is the job of the guest.
+ */
+
+ if (xen_impacted) {
+ /* Now we are going to panic anyway. Allow interrupts, so that
+ * printk on serial console can work. */
+ vcpu_schedule_unlock_irq(vcpu);
+
+ /* Uh, that means, a machine check exception
+ * inside Xen occurred. */
+ printk("Machine check exception occured in Xen.\n");
+
+ /* if MCG_STATUS_EIPV indicates, the IP on the stack is related
+ * to the error then it makes sense to print a stack trace.
+ * That can be useful for more detailed error analysis and/or
+ * error case studies to figure out, if we can clear
+ * xen_impacted and kill a DomU instead
+ * (i.e. if a guest only control structure is affected, but then
+ * we must ensure the bad pages are not re-used again).
+ */
+ if (status & MCG_STATUS_EIPV) {
+ printk("MCE: Instruction Pointer is related to the
error. "
+ "Therefore, print the execution state.\n");
+ show_execution_state(regs);
+ }
+ x86_mcinfo_dump(mc_data);
+ panic("End of MCE. Use mcelog to decode above error codes.\n");
+ }
+
+ /* If Dom0 registered a machine check handler, which is only possible
+ * with a PV MCA driver, then ... */
+ if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
+ dom_state = DOM0_TRAP;
+
+ /* ... deliver machine check trap to Dom0. */
+ send_guest_trap(dom0, 0, TRAP_machine_check);
+
+ /* Xen may tell Dom0 now to notify the DomU.
+ * But this will happen through a hypercall. */
+ } else
+ /* Dom0 did not register a machine check handler, but if DomU
+ * did so, then... */
+ if ( guest_has_trap_callback(curdom, vcpu->vcpu_id,
TRAP_machine_check) ) {
+ dom_state = DOMU_TRAP;
+
+ /* ... deliver machine check trap to DomU */
+ send_guest_trap(curdom, vcpu->vcpu_id,
TRAP_machine_check);
+ } else {
+ /* hmm... no one feels responsible for handling the error.
+ * So, do a quick check if a DomU is impacted or not.
+ */
+ if (curdom == dom0) {
+ /* Dom0 is impacted. Since no one can handle
+ * this error, panic! */
+ x86_mcinfo_dump(mc_data);
+ panic("MCE occured in Dom0, which it can't handle\n");
+
+ /* UNREACHED */
+ } else {
+ dom_state = DOMU_KILLED;
+
+ /* Enable interrupts. This basically results in
+ * calling sti on the *physical* cpu. But after
+ * domain_crash() the vcpu pointer is invalid.
+ * Therefore, we must unlock the irqs before killing
+ * it. */
+ vcpu_schedule_unlock_irq(vcpu);
+
+ /* DomU is impacted. Kill it and continue. */
+ domain_crash(curdom);
+ }
+ }
+
+
+ switch (dom_state) {
+ case DOM0_TRAP:
+ case DOMU_TRAP:
+ /* Enable interrupts. */
+ vcpu_schedule_unlock_irq(vcpu);
+
+ /* guest softirqs and event callbacks are scheduled
+ * immediately after this handler exits. */
+ break;
+ case DOMU_KILLED:
+ /* Nothing to do here. */
+ break;
+ default:
+ BUG();
+ }
+}
/* Machine Check Architecture Hypercall */
@@ -564,7 +858,7 @@
if ( copy_to_guest(u_xen_mc, op, 1) )
ret = -EFAULT;
- if (ret == 0) {
+ if (ret == 0 && mc_notifydomain->flags == XEN_MC_OK) {
x86_mcinfo_marknotified(mc_notifydomain);
}
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c Mon Dec 15 14:25:07 2008 +0900
@@ -14,16 +14,158 @@
#include <xen/smp.h>
#include <xen/timer.h>
#include <xen/errno.h>
+#include <xen/event.h>
#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
+#include "x86_mca.h"
static int firstbank;
-static struct timer mce_timer;
-#define MCE_PERIOD MILLISECS(15000)
+struct timer mce_timer;
+
+s_time_t period = MCE_PERIOD;
+int adjust = 0;
+
+/* The polling service routine:
+ * Collects information of correctable errors and notifies
+ * Dom0 via an event.
+ */
+void x86_mce_checkregs(void *info)
+{
+ struct vcpu *vcpu = current;
+ struct mc_info *mc_data;
+ struct mcinfo_global mc_global;
+ struct mcinfo_bank mc_info;
+ uint64_t status, addrv, miscv;
+ unsigned int i;
+ unsigned int event_enabled;
+ unsigned int cpu_nr;
+ int error_found;
+
+ /* We don't need a slot yet. Only allocate one on error. */
+ mc_data = NULL;
+
+ cpu_nr = smp_processor_id();
+ event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
+ error_found = 0;
+
+ memset(&mc_global, 0, sizeof(mc_global));
+ mc_global.common.type = MC_TYPE_GLOBAL;
+ mc_global.common.size = sizeof(mc_global);
+
+ mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
+ mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+ BUG_ON(cpu_nr != vcpu->processor);
+ mc_global.mc_core_threadid = 0;
+ mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+ It's not clear to me how to figure this out. */
+ mc_global.mc_socketid = ???;
+#endif
+ mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
+ rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+ for (i = 0; i < nr_mce_banks; i++) {
+ struct domain *d;
+
+ rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+
+ if (!(status & MCi_STATUS_VAL))
+ continue;
+
+ if (mc_data == NULL) {
+ /* Now we need a slot to fill in error telemetry. */
+ mc_data = x86_mcinfo_getptr();
+ BUG_ON(mc_data == NULL);
+ x86_mcinfo_clear(mc_data);
+ x86_mcinfo_add(mc_data, &mc_global);
+ }
+
+ memset(&mc_info, 0, sizeof(mc_info));
+ mc_info.common.type = MC_TYPE_BANK;
+ mc_info.common.size = sizeof(mc_info);
+ mc_info.mc_bank = i;
+ mc_info.mc_status = status;
+
+ /* Increase polling frequency */
+ error_found = 1;
+
+ addrv = 0;
+ if (status & MCi_STATUS_ADDRV) {
+ rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
+
+ d = maddr_get_owner(addrv);
+ if (d != NULL)
+ mc_info.mc_domid = d->domain_id;
+ }
+
+ miscv = 0;
+ if (status & MCi_STATUS_MISCV)
+ rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
+
+ mc_info.mc_addr = addrv;
+ mc_info.mc_misc = miscv;
+ x86_mcinfo_add(mc_data, &mc_info);
+
+ if (mc_callback_bank_extended)
+ mc_callback_bank_extended(mc_data, i, status);
+
+ /* clear status */
+ wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
+ wmb();
+ }
+
+ if (error_found > 0) {
+ /* If Dom0 enabled the VIRQ_MCA event, then ... */
+ if (event_enabled)
+ /* ... notify it. */
+ send_guest_global_virq(dom0, VIRQ_MCA);
+ else
+ /* ... or dump it */
+ x86_mcinfo_dump(mc_data);
+ }
+
+ adjust += error_found;
+}
+
+static void p4_mce_work_fn(void *data)
+{
+ on_each_cpu(x86_mce_checkregs, NULL, 1, 1);
+
+ if (adjust > 0) {
+ if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+ /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
+ printk("MCE: polling routine found correctable error. "
+ " Use mcelog to parse above error output.\n");
+ }
+ }
+
+ if (adjust > 0) {
+ /* Increase polling frequency */
+ adjust++; /* adjust == 1 must have an effect */
+ period /= adjust;
+ } else {
+ /* Decrease polling frequency */
+ period *= 2;
+ }
+ if (period > MCE_MAX) {
+ /* limit: Poll at least every 30s */
+ period = MCE_MAX;
+ }
+ if (period < MCE_MIN) {
+ /* limit: Poll every 2s.
+ * When this is reached an uncorrectable error
+ * is expected to happen, if Dom0 does nothing.
+ */
+ period = MCE_MIN;
+ }
+
+ set_timer(&mce_timer, NOW() + period);
+ adjust = 0;
+}
static void mce_checkregs (void *info)
{
@@ -85,6 +227,15 @@
break;
case X86_VENDOR_INTEL:
+ if (c->x86 == 15 /* P4/Xeon */
+#ifdef CONFIG_X86_64
+ || c->x86 == 6
+#endif
+ ) {
+ init_timer(&mce_timer, p4_mce_work_fn, NULL, 0);
+ set_timer(&mce_timer, NOW() + period);
+ break;
+ }
init_timer(&mce_timer, mce_work_fn, NULL, 0);
set_timer(&mce_timer, NOW() + MCE_PERIOD);
break;
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/p4.c
--- a/xen/arch/x86/cpu/mcheck/p4.c Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/p4.c Mon Dec 15 14:25:07 2008 +0900
@@ -15,6 +15,7 @@
#include <asm/apic.h>
#include "mce.h"
+#include "x86_mca.h"
/* as supported by the P4/Xeon family */
struct intel_mce_extended_msrs {
@@ -32,6 +33,7 @@
};
static int mce_num_extended_msrs = 0;
+extern int mce_bootlog;
#ifdef CONFIG_X86_MCE_P4THERMAL
@@ -158,85 +160,13 @@
return mce_num_extended_msrs;
}
-static fastcall void intel_machine_check(struct cpu_user_regs * regs, long
error_code)
-{
- int recover=1;
- u32 alow, ahigh, high, low;
- u32 mcgstl, mcgsth;
- int i;
- struct intel_mce_extended_msrs dbg;
-
- rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
- if (mcgstl & (1<<0)) /* Recoverable ? */
- recover=0;
-
- printk (KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
- smp_processor_id(), mcgsth, mcgstl);
-
- if (intel_get_extended_msrs(&dbg)) {
- printk (KERN_DEBUG "CPU %d: EIP: %08x EFLAGS: %08x\n",
- smp_processor_id(), dbg.eip, dbg.eflags);
- printk (KERN_DEBUG "\teax: %08x ebx: %08x ecx: %08x edx:
%08x\n",
- dbg.eax, dbg.ebx, dbg.ecx, dbg.edx);
- printk (KERN_DEBUG "\tesi: %08x edi: %08x ebp: %08x esp:
%08x\n",
- dbg.esi, dbg.edi, dbg.ebp, dbg.esp);
- }
-
- for (i=0; i<nr_mce_banks; i++) {
- rdmsr (MSR_IA32_MC0_STATUS+i*4,low, high);
- if (high & (1<<31)) {
- if (high & (1<<29))
- recover |= 1;
- if (high & (1<<25))
- recover |= 2;
- printk (KERN_EMERG "Bank %d: %08x%08x", i, high, low);
- high &= ~(1<<31);
- if (high & (1<<27)) {
- rdmsr (MSR_IA32_MC0_MISC+i*4, alow, ahigh);
- printk ("[%08x%08x]", ahigh, alow);
- }
- if (high & (1<<26)) {
- rdmsr (MSR_IA32_MC0_ADDR+i*4, alow, ahigh);
- printk (" at %08x%08x", ahigh, alow);
- }
- printk ("\n");
- }
- }
-
- if (recover & 2)
- panic ("CPU context corrupt");
- if (recover & 1)
- panic ("Unable to continue");
-
- printk(KERN_EMERG "Attempting to continue.\n");
- /*
- * Do not clear the MSR_IA32_MCi_STATUS if the error is not
- * recoverable/continuable.This will allow BIOS to look at the MSRs
- * for errors if the OS could not log the error.
- */
- for (i=0; i<nr_mce_banks; i++) {
- u32 msr;
- msr = MSR_IA32_MC0_STATUS+i*4;
- rdmsr (msr, low, high);
- if (high&(1<<31)) {
- /* Clear it */
- wrmsr(msr, 0UL, 0UL);
- /* Serialize */
- wmb();
- add_taint(TAINT_MACHINE_CHECK);
- }
- }
- mcgstl &= ~(1<<2);
- wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
-}
-
void intel_p4_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
- machine_check_vector = intel_machine_check;
+ machine_check_vector = x86_machine_check;
wmb();
printk (KERN_INFO "Intel machine check architecture supported.\n");
@@ -244,6 +174,17 @@
if (l & (1<<8)) /* Control register present ? */
wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
+
+ /* Log the machine checks left over from the previous reset.
+ This also clears all registers */
+ for (i=0; i<nr_mce_banks; i++) {
+ u64 status;
+ rdmsrl(MSR_IA32_MC0_STATUS + i*4, status);
+ if (status & MCi_STATUS_VAL) {
+ x86_machine_check(NULL, mce_bootlog ? -1 : -2);
+ break;
+ }
+ }
for (i=0; i<nr_mce_banks; i++) {
wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
diff -r 6595393a3d28 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Mon Dec 15 14:25:07 2008 +0900
@@ -70,3 +70,11 @@
/* reserved bits */
#define MCi_STATUS_OTHER_RESERVED2 0x0180000000000000ULL
+/* Polling period */
+#define MCE_PERIOD MILLISECS(15000)
+#define MCE_MIN MILLISECS(2000)
+#define MCE_MAX MILLISECS(30000)
+
+/* Common routines */
+void x86_machine_check(struct cpu_user_regs *regs, long error_code);
+void x86_mce_checkregs(void *info);
diff -r 6595393a3d28 xen/arch/x86/traps.c
--- a/xen/arch/x86/traps.c Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/arch/x86/traps.c Mon Dec 15 14:25:07 2008 +0900
@@ -726,8 +726,10 @@
if ( !opt_allow_hugepage )
__clear_bit(X86_FEATURE_PSE, &d);
__clear_bit(X86_FEATURE_PGE, &d);
+#ifndef __x86_64__
__clear_bit(X86_FEATURE_MCE, &d);
__clear_bit(X86_FEATURE_MCA, &d);
+#endif
__clear_bit(X86_FEATURE_PSE36, &d);
}
switch ( (uint32_t)regs->eax )
diff -r 6595393a3d28 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/common/page_alloc.c Mon Dec 15 14:25:07 2008 +0900
@@ -338,8 +338,14 @@
/* Find smallest order which can satisfy the request. */
for ( j = order; j <= MAX_ORDER; j++ )
- if ( !list_empty(&heap(node, zone, j)) )
- goto found;
+ if ( !list_empty(&heap(node, zone, j)) ) {
+ pg = list_entry(heap(node, zone, j).next, struct
page_info, list);
+ if (!(pg->count_info & PGC_reserved))
+ goto found;
+ else
+ printk(XENLOG_DEBUG "Page %p(%lx) is not to be
allocated.\n",
+ pg, page_to_maddr(pg));
+ }
} while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
/* Pick next node, wrapping around if needed. */
@@ -402,11 +408,22 @@
unsigned long mask;
unsigned int i, node = phys_to_nid(page_to_maddr(pg));
struct domain *d;
+ int reserved = 0;
ASSERT(zone < NR_ZONES);
ASSERT(order <= MAX_ORDER);
ASSERT(node >= 0);
ASSERT(node < num_online_nodes());
+
+ for ( i = 0; i < (1 << order); i++) {
+ reserved += !!(pg[i].count_info & PGC_reserved);
+ if (!!(pg[i].count_info & PGC_reserved))
+ printk(XENLOG_DEBUG "Page %p(%lx) is not to be freed\n",
+ &pg[i], page_to_maddr(&pg[i]));
+ }
+
+ if (reserved)
+ return;
for ( i = 0; i < (1 << order); i++ )
{
diff -r 6595393a3d28 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Tue Dec 09 16:28:02 2008 +0000
+++ b/xen/include/asm-x86/mm.h Mon Dec 15 14:25:07 2008 +0900
@@ -142,8 +142,11 @@
/* 3-bit PAT/PCD/PWT cache-attribute hint. */
#define PGC_cacheattr_base 26
#define PGC_cacheattr_mask (7U<<PGC_cacheattr_base)
- /* 26-bit count of references to this frame. */
-#define PGC_count_mask ((1U<<26)-1)
+ /* Set for special pages, which can never be used */
+#define _PGC_reserved 25
+#define PGC_reserved (1U<<_PGC_reserved)
+ /* 25-bit count of references to this frame. */
+#define PGC_count_mask ((1U<<25)-1)
#define is_xen_heap_page(page) is_xen_heap_mfn(page_to_mfn(page))
#define is_xen_heap_mfn(mfn) ({ \
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|