diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile	Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/arch/x86/cpu/mcheck/Makefile	Fri Jul 04 14:48:37 2008 +0200
@@ -1,4 +1,7 @@
+obj-y += amd_nonfatal.o
 obj-y += k7.o
+obj-y += amd_k8.o
+obj-y += amd_f10.o
 obj-y += mce.o
 obj-y += non-fatal.o
 obj-y += p4.o
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/amd_f10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c	Fri Jul 04 14:48:37 2008 +0200
@@ -0,0 +1,131 @@
+/*
+ * MCA implementation for AMD Family10 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ */
+
+/* Family10 MCA documentation published at
+ *
+ * BIOS and Kernel Developer's Guide
+ * For AMD Family 10h Processors
+ * Publication # 31116 Revision: 1.08
+ * Issue Date: June 10, 2007
+ */
+
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/config.h>
+#include <xen/smp.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+
+static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
+{
+    struct mcinfo_extended mc_ext;
+
+    /* Family 0x10 introduced additional MSRs that belong to the
+     * northbridge bank (4). */
+    if (bank != 4)
+        return 0;
+
+    if (!(status & MCi_STATUS_VAL))
+        return 0;
+
+    if (!(status & MCi_STATUS_MISCV))
+        return 0;
+
+    memset(&mc_ext, 0, sizeof(mc_ext));
+    mc_ext.common.type = MC_TYPE_EXTENDED;
+    mc_ext.common.size = sizeof(mc_ext);
+    mc_ext.mc_msrs = 3;
+
+    mc_ext.mc_msr[0].reg = MSR_F10_MC4_MISC1;
+    mc_ext.mc_msr[1].reg = MSR_F10_MC4_MISC2;
+    mc_ext.mc_msr[2].reg = MSR_F10_MC4_MISC3;
+
+    rdmsrl(MSR_F10_MC4_MISC1, mc_ext.mc_msr[0].value);
+    rdmsrl(MSR_F10_MC4_MISC2, mc_ext.mc_msr[1].value);
+    rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
+
+    x86_mcinfo_add(mi, &mc_ext);
+    return 1;
+}
+
+
+extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
+
+/* AMD Family10 machine check */
+void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
+{
+    uint64_t value;
+    uint32_t i;
+    int cpu_nr;
+
+    machine_check_vector = k8_machine_check;
+    mc_callback_bank_extended = amd_f10_handler;
+    cpu_nr = smp_processor_id();
+    wmb();
+
+    rdmsrl(MSR_IA32_MCG_CAP, value);
+    if (value & MCG_CTL_P)	/* Control register present ? */
+        wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
+    nr_mce_banks = value & MCG_CAP_COUNT;
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        switch (i) {
+        case 4: /* Northbridge */
+            /* Enable error reporting of all errors,
+             * enable error checking and
+             * disable sync flooding */
+            wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+            wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
+
+            /* XXX: We should write the value 0x1087821UL into
+             * register F3x180 here, which sits in
+             * the PCI extended configuration space.
+             * Since this is not possible here, we can only hope
+             * that Dom0 is doing that.
+             */
+            break;
+
+        default:
+            /* Enable error reporting of all errors */
+            wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
+            wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+            break;
+        }
+    }
+
+    set_in_cr4(X86_CR4_MCE);
+    printk("CPU%i: AMD Family10h machine check reporting enabled.\n", cpu_nr);
+}
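The MSR_IA32_MC0_xxx + 4 * i indexing used above relies on the architectural MCA register layout: every bank owns four consecutive MSRs starting at 0x400 (CTL, STATUS, ADDR, MISC). The following standalone user-space sketch, not part of the patch, simply prints that layout; the MSR numbers are the architectural values:

    #include <stdio.h>

    #define MSR_IA32_MC0_CTL    0x400 /* architectural base: bank 0 control */
    #define MSR_IA32_MC0_STATUS 0x401
    #define MSR_IA32_MC0_ADDR   0x402
    #define MSR_IA32_MC0_MISC   0x403

    int main(void)
    {
        unsigned int bank;

        /* Print the MSR numbers the handlers access for each bank. */
        for (bank = 0; bank < 5; bank++)
            printf("bank %u: CTL=%#x STATUS=%#x ADDR=%#x MISC=%#x\n",
                   bank,
                   MSR_IA32_MC0_CTL + 4 * bank,
                   MSR_IA32_MC0_STATUS + 4 * bank,
                   MSR_IA32_MC0_ADDR + 4 * bank,
                   MSR_IA32_MC0_MISC + 4 * bank);
        return 0;
    }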
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/amd_k8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c	Fri Jul 04 14:48:37 2008 +0200
@@ -0,0 +1,324 @@
+/*
+ * MCA implementation for AMD K8 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
+ */
+
+/* The related documentation for K8 Revisions A - E is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD Athlon 64 and AMD Opteron Processors
+ * Publication # 26094 Revision: 3.30
+ * Issue Date: February 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
+ */
+
+/* The related documentation for K8 Revisions F - G is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD NPT Family 0Fh Processors
+ * Publication # 32559 Revision: 3.04
+ * Issue Date: December 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
+ */
+
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/smp.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+#include <asm/mm.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+
+/* Machine Check Handler for AMD K8 family series */
+void k8_machine_check(struct cpu_user_regs *regs, long error_code)
+{
+    struct vcpu *vcpu = current;
+    struct domain *curdom;
+    struct mc_info *mc_data;
+    struct mcinfo_global mc_global;
+    struct mcinfo_bank mc_info;
+    uint64_t status, addrv, miscv, uc;
+    uint32_t i;
+    unsigned int cpu_nr;
+    uint32_t xen_impacted = 0;
+#define DOM_NORMAL	0
+#define DOM0_TRAP	1
+#define DOMU_TRAP	2
+#define DOMU_KILLED	4
+    uint32_t dom_state = DOM_NORMAL;
+
+    /* This handler runs as an interrupt gate. So IPIs from the
+     * polling service routine are deferred until we have finished.
+     */
+
+    /* Disable interrupts for the _vcpu_. Otherwise it may be
+     * re-scheduled to another physical CPU, or the impacted process
+     * in the guest continues running with corrupted data. */
+    vcpu_schedule_lock_irq(vcpu);
+
+    mc_data = x86_mcinfo_getptr();
+    cpu_nr = smp_processor_id();
+    curdom = vcpu->domain;
+
+    memset(&mc_global, 0, sizeof(mc_global));
+    mc_global.common.type = MC_TYPE_GLOBAL;
+    mc_global.common.size = sizeof(mc_global);
+
+    mc_global.mc_domid = curdom->domain_id; /* impacted domain */
+    mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+    BUG_ON(cpu_nr != vcpu->processor);
+    mc_global.mc_core_threadid = 0;
+    mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+    mc_global.mc_socketid = ???;
+#endif
+    mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
+    rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+    /* Quick check, who is impacted */
+    xen_impacted = is_idle_domain(curdom);
+
+    /* Dom0 */
+    x86_mcinfo_clear(mc_data);
+    x86_mcinfo_add(mc_data, &mc_global);
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        struct domain *d;
+
+        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+
+        if (!(status & MCi_STATUS_VAL))
+            continue;
+
+        /* An error happened in this bank.
+         * This is expected to be an uncorrectable error,
+         * since correctable errors get polled.
+         */
+        uc = status & MCi_STATUS_UC;
+
+        memset(&mc_info, 0, sizeof(mc_info));
+        mc_info.common.type = MC_TYPE_BANK;
+        mc_info.common.size = sizeof(mc_info);
+        mc_info.mc_bank = i;
+        mc_info.mc_status = status;
+
+        addrv = 0;
+        if (status & MCi_STATUS_ADDRV) {
+            rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
+
+            d = maddr_get_owner(addrv);
+            if (d != NULL)
+                mc_info.mc_domid = d->domain_id;
+        }
+
+        miscv = 0;
+        if (status & MCi_STATUS_MISCV)
+            rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
+
+        mc_info.mc_addr = addrv;
+        mc_info.mc_misc = miscv;
+
+        x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
+
+        if (mc_callback_bank_extended)
+            mc_callback_bank_extended(mc_data, i, status);
+
+        /* clear status */
+        wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+        wmb();
+        add_taint(TAINT_MACHINE_CHECK);
+    }
+
+    status = mc_global.mc_gstatus;
+
+    /* Clear MCIP or the cpu enters shutdown state
+     * in case another MCE occurs. */
+    status &= ~MCG_STATUS_MCIP;
+    wrmsrl(MSR_IA32_MCG_STATUS, status);
+    wmb();
+
+    /* For the details see the discussion "MCE/MCA concept" on xen-devel.
+     * The thread started here:
+     * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
+     */
+
+    /* MCG_STATUS_RIPV:
+     * When this bit is not set, then the instruction pointer pushed onto
+     * the stack to resume at is not valid. If Xen is interrupted, then we
+     * panic anyway right below. Otherwise it is up to the guest to figure
+     * out whether guest kernel or guest userland is affected and should
+     * kill either itself or the affected process.
+     */
+
+    /* MCG_STATUS_EIPV:
+     * Evaluation of EIPV is the job of the guest.
+     */
+
+    if (xen_impacted) {
+        /* Now we are going to panic anyway. Allow interrupts, so that
+         * printk on serial console can work. */
+        vcpu_schedule_unlock_irq(vcpu);
+
+        /* That means a machine check exception
+         * occurred inside Xen. */
+        printk("Machine check exception occurred in Xen.\n");
+
+        /* If MCG_STATUS_EIPV indicates that the IP on the stack is related
+         * to the error, then it makes sense to print a stack trace.
+         * That can be useful for more detailed error analysis and/or
+         * error case studies to figure out whether we can clear
+         * xen_impacted and kill a DomU instead
+         * (i.e. if a guest-only control structure is affected, but then
+         * we must ensure the bad pages are not re-used again).
+         */
+        if (status & MCG_STATUS_EIPV) {
+            printk("MCE: Instruction Pointer is related to the error. "
+                   "Therefore, print the execution state.\n");
+            show_execution_state(regs);
+        }
+        x86_mcinfo_dump(mc_data);
+        panic("End of MCE. Use mcelog to decode above error codes.\n");
+    }
+
+    /* If Dom0 registered a machine check handler, which is only possible
+     * with a PV MCA driver, then ... */
+    if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
+        dom_state = DOM0_TRAP;
+
+        /* ... deliver machine check trap to Dom0. */
+        send_guest_trap(dom0, 0, TRAP_machine_check);
+
+        /* Xen may tell Dom0 now to notify the DomU.
+         * But this will happen through a hypercall. */
+    } else
+    /* Dom0 did not register a machine check handler, but if DomU
+     * did so, then... */
+    if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
+        dom_state = DOMU_TRAP;
+
+        /* ... deliver machine check trap to DomU */
+        send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
+    } else {
+        /* Hmm... no one feels responsible for handling the error.
+         * So do a quick check whether a DomU is impacted or not.
+         */
+        if (curdom == dom0) {
+            /* Dom0 is impacted. Since no one can handle
+             * this error, panic! */
+            x86_mcinfo_dump(mc_data);
+            panic("MCE occurred in Dom0, which cannot handle it\n");
+
+            /* UNREACHED */
+        } else {
+            dom_state = DOMU_KILLED;
+
+            /* Enable interrupts. This basically results in
+             * calling sti on the *physical* cpu. But after
+             * domain_crash() the vcpu pointer is invalid.
+             * Therefore, we must unlock the irqs before killing
+             * it. */
+            vcpu_schedule_unlock_irq(vcpu);
+
+            /* DomU is impacted. Kill it and continue. */
+            domain_crash(curdom);
+        }
+    }
+
+
+    switch (dom_state) {
+    case DOM0_TRAP:
+    case DOMU_TRAP:
+        /* Enable interrupts. */
+        vcpu_schedule_unlock_irq(vcpu);
+
+        /* guest softirqs and event callbacks are scheduled
+         * immediately after this handler exits. */
+        break;
+    case DOMU_KILLED:
+        /* Nothing to do here. */
+        break;
+    default:
+        BUG();
+    }
+}
+
+
+/* AMD K8 machine check */
+void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
+{
+    uint64_t value;
+    uint32_t i;
+    int cpu_nr;
+
+    machine_check_vector = k8_machine_check;
+    cpu_nr = smp_processor_id();
+    wmb();
+
+    rdmsrl(MSR_IA32_MCG_CAP, value);
+    if (value & MCG_CTL_P)	/* Control register present ? */
+        wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
+    nr_mce_banks = value & MCG_CAP_COUNT;
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        switch (i) {
+        case 4: /* Northbridge */
+            /* Enable error reporting of all errors,
+             * enable error checking and
+             * disable sync flooding */
+            wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+            wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
+            break;
+
+        default:
+            /* Enable error reporting of all errors */
+            wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
+            wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+            break;
+        }
+    }
+
+    set_in_cr4(X86_CR4_MCE);
+    printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
+}
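k8_machine_check() bases its panic/trap/kill decision partly on MSR_IA32_MCG_STATUS. A standalone sketch, not part of the patch, that decodes the three architectural flag bits with the same mask values x86_mca.h defines further below; the sample value is made up for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define MCG_STATUS_RIPV 0x0000000000000001ULL /* restart IP valid */
    #define MCG_STATUS_EIPV 0x0000000000000002ULL /* error IP valid */
    #define MCG_STATUS_MCIP 0x0000000000000004ULL /* MCE in progress */

    static void decode_gstatus(uint64_t gstatus)
    {
        printf("restart IP valid: %s\n", (gstatus & MCG_STATUS_RIPV) ? "yes" : "no");
        printf("error IP valid:   %s\n", (gstatus & MCG_STATUS_EIPV) ? "yes" : "no");
        printf("MCE in progress:  %s\n", (gstatus & MCG_STATUS_MCIP) ? "yes" : "no");
    }

    int main(void)
    {
        /* RIPV | MCIP: execution can resume, exception in progress */
        decode_gstatus(0x5);
        return 0;
    }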
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c	Fri Jul 04 14:48:37 2008 +0200
@@ -0,0 +1,303 @@
+/*
+ * MCA implementation for AMD CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
+ */
+
+/* The related documentation for K8 Revisions A - E is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD Athlon 64 and AMD Opteron Processors
+ * Publication # 26094 Revision: 3.30
+ * Issue Date: February 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
+ */
+
+/* The related documentation for K8 Revisions F - G is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD NPT Family 0Fh Processors
+ * Publication # 32559 Revision: 3.04
+ * Issue Date: December 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
+ */
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/smp.h>
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <xen/event.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+static struct timer mce_timer;
+
+#define MCE_PERIOD MILLISECS(15000)
+#define MCE_MIN    MILLISECS(2000)
+#define MCE_MAX    MILLISECS(30000)
+
+static s_time_t period = MCE_PERIOD;
+static int hw_threshold = 0;
+static int adjust = 0;
+
+/* The polling service routine:
+ * Collects information about correctable errors and notifies
+ * Dom0 via an event.
+ */
+void mce_amd_checkregs(void *info)
+{
+    struct vcpu *vcpu = current;
+    struct mc_info *mc_data;
+    struct mcinfo_global mc_global;
+    struct mcinfo_bank mc_info;
+    uint64_t status, addrv, miscv;
+    unsigned int i;
+    unsigned int event_enabled;
+    unsigned int cpu_nr;
+    int error_found;
+
+    /* We don't need a slot yet. Only allocate one on error. */
+    mc_data = NULL;
+
+    cpu_nr = smp_processor_id();
+    event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
+    error_found = 0;
+
+    memset(&mc_global, 0, sizeof(mc_global));
+    mc_global.common.type = MC_TYPE_GLOBAL;
+    mc_global.common.size = sizeof(mc_global);
+
+    mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
+    mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+    BUG_ON(cpu_nr != vcpu->processor);
+    mc_global.mc_core_threadid = 0;
+    mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+    mc_global.mc_socketid = ???;
+#endif
+    mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
+    rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        struct domain *d;
+
+        rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+
+        if (!(status & MCi_STATUS_VAL))
+            continue;
+
+        if (mc_data == NULL) {
+            /* Now we need a slot to fill in error telemetry. */
+            mc_data = x86_mcinfo_getptr();
+            BUG_ON(mc_data == NULL);
+            x86_mcinfo_clear(mc_data);
+            x86_mcinfo_add(mc_data, &mc_global);
+        }
+
+        memset(&mc_info, 0, sizeof(mc_info));
+        mc_info.common.type = MC_TYPE_BANK;
+        mc_info.common.size = sizeof(mc_info);
+        mc_info.mc_bank = i;
+        mc_info.mc_status = status;
+
+        /* Increase polling frequency */
+        error_found = 1;
+
+        addrv = 0;
+        if (status & MCi_STATUS_ADDRV) {
+            rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
+
+            d = maddr_get_owner(addrv);
+            if (d != NULL)
+                mc_info.mc_domid = d->domain_id;
+        }
+
+        miscv = 0;
+        if (status & MCi_STATUS_MISCV)
+            rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
+
+        mc_info.mc_addr = addrv;
+        mc_info.mc_misc = miscv;
+        x86_mcinfo_add(mc_data, &mc_info);
+
+        if (mc_callback_bank_extended)
+            mc_callback_bank_extended(mc_data, i, status);
+
+        /* clear status */
+        wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
+        wmb();
+    }
+
+    if (error_found > 0) {
+        /* If Dom0 enabled the VIRQ_MCA event, then ... */
+        if (event_enabled)
+            /* ... notify it. */
+            send_guest_global_virq(dom0, VIRQ_MCA);
+        else
+            /* ... or dump it */
+            x86_mcinfo_dump(mc_data);
+    }
+
+    adjust += error_found;
+}
+
+/* Polling service routine invoker:
+ * Adjusts the poll frequency at runtime. No errors mean a slower polling
+ * frequency, an error means a higher polling frequency.
+ * It uses the hw threshold register introduced in AMD K8 RevF to detect
+ * multiple correctable errors between two polls. In that case,
+ * the polling frequency is increased beyond normal.
+ */
+static void mce_amd_work_fn(void *data)
+{
+    on_each_cpu(mce_amd_checkregs, data, 1, 1);
+
+    if (adjust > 0) {
+        if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+            /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
+            printk("MCE: polling routine found correctable error. "
+                   "Use mcelog to parse above error output.\n");
+        }
+    }
+
+    if (hw_threshold) {
+        uint64_t value;
+        uint32_t counter;
+
+        rdmsrl(MSR_IA32_MC4_MISC, value);
+        /* Only the error counter field is of interest.
+         * The bit field is described in the AMD K8 BKDG chapter 6.4.5.5.
+         */
+        counter = (value & 0xFFF00000000ULL) >> 32U;
+
+        /* HW does not count *all* kinds of correctable errors.
+         * Thus it is possible that the polling routine finds a
+         * correctable error even if the HW reports nothing.
+         * However, the other way around is not possible (= BUG).
+         */
+        if (counter > 0) {
+            /* HW reported correctable errors,
+             * so the polling routine must have found them, too.
+             */
+            BUG_ON(adjust == 0);
+            /* Subtract 1 to avoid double-counting the error
+             * found by the polling service routine */
+            adjust += (counter - 1);
+
+            /* Restart counter */
+            /* No interrupt, reset counter value */
+            value &= ~(0x60FFF00000000ULL);
+            /* Counter enable */
+            value |= (1ULL << 51);
+            wrmsrl(MSR_IA32_MC4_MISC, value);
+            wmb();
+        }
+    }
+
+    if (adjust > 0) {
+        /* Increase polling frequency */
+        adjust++; /* adjust == 1 must have an effect */
+        period /= adjust;
+    } else {
+        /* Decrease polling frequency */
+        period *= 2;
+    }
+    if (period > MCE_MAX) {
+        /* limit: Poll at least every 30s */
+        period = MCE_MAX;
+    }
+    if (period < MCE_MIN) {
+        /* limit: Poll every 2s.
+         * When this limit is reached, an uncorrectable error
+         * is expected to happen if Dom0 does nothing.
+         */
+        period = MCE_MIN;
+    }
+
+    set_timer(&mce_timer, NOW() + period);
+    adjust = 0;
+}
+
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
+{
+    if (c->x86_vendor != X86_VENDOR_AMD)
+        return;
+
+    /* Assume we are on K8 or newer AMD CPU here */
+
+    /* The threshold bitfields in MSR_IA32_MC4_MISC have
+     * been introduced along with the SVME feature bit. */
+    if (cpu_has(c, X86_FEATURE_SVME)) {
+        uint64_t value;
+
+        /* hw threshold registers present */
+        hw_threshold = 1;
+        rdmsrl(MSR_IA32_MC4_MISC, value);
+
+        if (value & (1ULL << 61)) { /* Locked bit */
+            /* Locked by BIOS. Not available for use */
+            hw_threshold = 0;
+        }
+        if (!(value & (1ULL << 63))) { /* Valid bit */
+            /* No CtrP present */
+            hw_threshold = 0;
+        } else {
+            if (!(value & (1ULL << 62))) { /* Counter Bit */
+                /* No counter field present */
+                hw_threshold = 0;
+            }
+        }
+
+        if (hw_threshold) {
+            /* No interrupt, reset counter value */
+            value &= ~(0x60FFF00000000ULL);
+            /* Counter enable */
+            value |= (1ULL << 51);
+            wrmsrl(MSR_IA32_MC4_MISC, value);
+            /* serialize */
+            wmb();
+            printk(XENLOG_INFO "MCA: Using hw thresholding to adjust polling frequency\n");
+        }
+    }
+
+    init_timer(&mce_timer, mce_amd_work_fn, NULL, 0);
+    set_timer(&mce_timer, NOW() + period);
+
+    return;
+}
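The period adaptation in mce_amd_work_fn() can be hard to follow from the timer code alone. A standalone user-space sketch, not part of the patch, that replays the same arithmetic (divide by adjust+1 when errors were seen, double when idle, clamp to [MCE_MIN, MCE_MAX]); the MILLISECS() macro here mirrors Xen's nanosecond-based definition:

    #include <stdio.h>
    #include <stdint.h>

    #define MILLISECS(ms) ((int64_t)(ms) * 1000000) /* ns, as in Xen */
    #define MCE_PERIOD MILLISECS(15000)
    #define MCE_MIN    MILLISECS(2000)
    #define MCE_MAX    MILLISECS(30000)

    static int64_t next_period(int64_t period, int adjust)
    {
        if (adjust > 0)
            period /= (adjust + 1); /* errors seen: poll faster */
        else
            period *= 2;            /* quiet: poll slower */
        if (period > MCE_MAX) period = MCE_MAX; /* at least every 30s */
        if (period < MCE_MIN) period = MCE_MIN; /* at most every 2s */
        return period;
    }

    int main(void)
    {
        int64_t p = MCE_PERIOD;
        int i, errors[5] = { 0, 0, 3, 1, 0 };

        for (i = 0; i < 5; i++) {
            p = next_period(p, errors[i]);
            printf("poll %d: %d error(s) -> next poll in %lldms\n",
                   i, errors[i], (long long)(p / 1000000));
        }
        return 0;
    }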
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/k7.c
--- a/xen/arch/x86/cpu/mcheck/k7.c	Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/arch/x86/cpu/mcheck/k7.c	Fri Jul 04 14:48:37 2008 +0200
@@ -66,8 +66,8 @@ static fastcall void k7_machine_check(st
 }
 
-/* AMD K7 machine check is Intel like */
-void amd_mcheck_init(struct cpuinfo_x86 *c)
+/* AMD K7 machine check */
+void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int i;
@@ -75,7 +75,6 @@ void amd_mcheck_init(struct cpuinfo_x86 
 	machine_check_vector = k7_machine_check;
 	wmb();
 
-	printk (KERN_INFO "Intel machine check architecture supported.\n");
 	rdmsr (MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
 		wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
@@ -90,6 +89,6 @@ void amd_mcheck_init(struct cpuinfo_x86 
 	}
 
 	set_in_cr4 (X86_CR4_MCE);
-	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+	printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
 		smp_processor_id());
 }
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c	Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/arch/x86/cpu/mcheck/mce.c	Fri Jul 04 14:48:37 2008 +0200
@@ -8,73 +8,151 @@
 #include <xen/init.h>
 #include <xen/types.h>
 #include <xen/kernel.h>
+#include <xen/sched.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 int mce_disabled = 0;
-int nr_mce_banks;
+unsigned int nr_mce_banks;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
 
+/* XXX For now a fixed array is used. Later this should be changed
+ * to a dynamically allocated array with the size calculated in relation
+ * to the physical cpus present in the machine.
+ * The more physical cpus are available, the more entries you need.
+ */
+#define MAX_MCINFO	10
+
+struct mc_machine_notify {
+	struct mc_info mc;
+	uint32_t fetch_idx;
+	uint32_t valid;
+};
+
+struct mc_machine {
+
+	/* Array structure used for collecting machine check error telemetry. */
+	struct mc_info mc[MAX_MCINFO];
+
+	/* We handle multiple machine check reports lockless by
+	 * iterating through the array using the producer/consumer concept.
+	 */
+	/* Producer array index to fill with machine check error data.
+	 * Index must be increased atomically. */
+	uint32_t error_idx;
+
+	/* Consumer array index to fetch machine check error data from.
+	 * Index must be increased atomically. */
+	uint32_t fetch_idx;
+
+	/* Integer array holding the indices of the mc array that allows
+	 * a Dom0 to notify a DomU to re-fetch the same machine check error
+	 * data. The notification and refetch also uses its own
+	 * producer/consumer mechanism, because Dom0 may decide to not report
+	 * every error to the impacted DomU.
+	 */
+	struct mc_machine_notify notify[MAX_MCINFO];
+
+	/* Array index to get fetch_idx from.
+	 * Index must be increased atomically. */
+	uint32_t notifyproducer_idx;
+	uint32_t notifyconsumer_idx;
+};
+
+/* Global variable with machine check information. */
+struct mc_machine mc_data;
+
 /* Handle unconfigured int18 (should never happen) */
-static fastcall void unexpected_machine_check(struct cpu_user_regs * regs, long error_code)
+static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
 {
-	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
+	printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+		smp_processor_id());
 }
 
+
 /* Call the installed machine check handler for this CPU setup. */
-void fastcall (*machine_check_vector)(struct cpu_user_regs *, long error_code) = unexpected_machine_check;
+void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
+
+/* Init machine check callback handler
+ * It is used to collect additional information provided by newer
+ * CPU families/models without the need to duplicate the whole handler.
+ * This avoids having many handlers doing nearly the same thing, each
+ * with its own tweaks and bugs. */
+int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
+
+
+static void amd_mcheck_init(struct cpuinfo_x86 *ci)
+{
+
+	switch (ci->x86) {
+	case 6:
+		amd_k7_mcheck_init(ci);
+		break;
+
+	case 0xf:
+		amd_k8_mcheck_init(ci);
+		break;
+
+	case 0x10:
+		amd_f10_mcheck_init(ci);
+		break;
+
+	default:
+		/* Assume that machine check support is available.
+		 * The minimum provided support is at least the K8. */
+		amd_k8_mcheck_init(ci);
+	}
+}
 
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)
 {
-	if (mce_disabled==1)
+	if (mce_disabled == 1) {
+		printk(XENLOG_INFO "MCE support disabled by bootparam\n");
 		return;
+	}
+
+	if (!cpu_has(c, X86_FEATURE_MCE)) {
+		printk(XENLOG_INFO "CPU%i: No machine check support available\n",
+			smp_processor_id());
+		return;
+	}
+
+	memset(&mc_data, 0, sizeof(struct mc_machine));
 
 	switch (c->x86_vendor) {
-		case X86_VENDOR_AMD:
-			amd_mcheck_init(c);
-			break;
+	case X86_VENDOR_AMD:
+		amd_mcheck_init(c);
+		break;
 
-		case X86_VENDOR_INTEL:
+	case X86_VENDOR_INTEL:
 #ifndef CONFIG_X86_64
-			if (c->x86==5)
-				intel_p5_mcheck_init(c);
-			if (c->x86==6)
-				intel_p6_mcheck_init(c);
+		if (c->x86==5)
+			intel_p5_mcheck_init(c);
+		if (c->x86==6)
+			intel_p6_mcheck_init(c);
 #endif
-			if (c->x86==15)
-				intel_p4_mcheck_init(c);
-			break;
+		if (c->x86==15)
+			intel_p4_mcheck_init(c);
+		break;
 
 #ifndef CONFIG_X86_64
-		case X86_VENDOR_CENTAUR:
-			if (c->x86==5)
-				winchip_mcheck_init(c);
-			break;
+	case X86_VENDOR_CENTAUR:
+		if (c->x86==5)
+			winchip_mcheck_init(c);
+		break;
 #endif
 
-		default:
-			break;
+	default:
+		break;
 	}
 }
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
 
 static void __init mcheck_disable(char *str)
 {
@@ -88,3 +166,411 @@ static void __init mcheck_enable(char *s
 
 custom_param("nomce", mcheck_disable);
 custom_param("mce", mcheck_enable);
+
+
+#include <xen/guest_access.h>
+#include <xen/spinlock.h>
+
+struct mc_info *x86_mcinfo_getptr(void)
+{
+	struct mc_info *mi;
+	uint32_t entry, next;
+
+	for (;;) {
+		entry = mc_data.error_idx;
+		smp_rmb();
+		next = entry + 1;
+		if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
+			break;
+	}
+
+	mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
+	BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
+
+	return mi;
+}
+
+static int x86_mcinfo_matches_guest(const struct mc_info *mi,
+			const struct domain *d, const struct vcpu *v)
+{
+	struct mcinfo_common *mic;
+	struct mcinfo_global *mig;
+
+	x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
+	mig = (struct mcinfo_global *)mic;
+	if (mig == NULL)
+		return 0;
+
+	if (d->domain_id != mig->mc_domid)
+		return 0;
+
+	if (v->vcpu_id != mig->mc_vcpuid)
+		return 0;
+
+	return 1;
+}
+
+
+#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
+
+static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
+			const struct domain *d, const struct vcpu *v)
+{
+	struct mc_info *mi;
+
+	/* This function is called from the fetch hypercall with
+	 * the mc_lock spinlock held. Thus, no need for locking here.
+	 */
+	mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
+	if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
+		/* Bogus domU command detected. */
+		*fetch_idx = 0;
+		return NULL;
+	}
+
+	*fetch_idx = mc_data.fetch_idx;
+	mc_data.fetch_idx++;
+	BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
+
+	return mi;
+}
+
+
+static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
+{
+	struct mc_machine_notify *mn;
+	struct mcinfo_common *mic = NULL;
+	struct mcinfo_global *mig;
+	struct domain *d;
+	int i;
+
+	/* This function is called from the notifier hypercall with
+	 * the mc_notify_lock spinlock held. Thus, no need for locking here.
+	 */
+
+	/* First invalidate entries for guests that disappeared after
+	 * notification (e.g. shutdown/crash). This step prevents the
+	 * notification array from filling up with stale/leaked entries.
+	 */
+	for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
+		mn = &(mc_data.notify[(i % MAX_MCINFO)]);
+		x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
+		BUG_ON(mic == NULL);
+		mig = (struct mcinfo_global *)mic;
+		d = get_domain_by_id(mig->mc_domid);
+		if (d == NULL) {
+			/* Domain does not exist. */
+			mn->valid = 0;
+		}
+		if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
+			mc_data.notifyconsumer_idx++;
+	}
+
+	/* Now put in the error telemetry. Since all error data fetchable
+	 * by domUs are uncorrectable errors, they are very important.
+	 * So we dump them before overwriting them. When a guest takes that
+	 * long, we can assume something bad already happened (crash, hang,
+	 * etc.)
+	 */
+	mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
+
+	if (mn->valid) {
+		struct mcinfo_common *mic = NULL;
+		struct mcinfo_global *mig;
+
+		/* In order not to lose the information, we dump it. */
+		x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
+		BUG_ON(mic == NULL);
+		mig = (struct mcinfo_global *)mic;
+		printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
+			"fetch machine check error telemetry. But it "
+			"did not do that in time.\n",
+			mig->mc_domid);
+		x86_mcinfo_dump(&mn->mc);
+	}
+
+	memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
+		sizeof(struct mc_info));
+	mn->fetch_idx = mc_notifydomain->fetch_idx;
+	mn->valid = 1;
+
+	mc_data.notifyproducer_idx++;
+
+	/* By design there can never be more notifies than machine check
+	 * errors. If that ever happens, then we hit a bug. */
+	BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
+	BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
+}
+
+static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
+			const struct domain *d, const struct vcpu *v)
+{
+	struct mc_machine_notify *mn = NULL;
+	uint32_t i;
+	int found;
+
+	/* This function is called from the fetch hypercall with
+	 * the mc_notify_lock spinlock held. Thus, no need for locking here.
+	 */
+
+	/* The notifier data is filled in the order guests get notified, but
+	 * guests may fetch them in a different order. That's why we need
+	 * the game with valid/invalid entries. */
+	found = 0;
+	for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
+		mn = &(mc_data.notify[(i % MAX_MCINFO)]);
+		if (!mn->valid) {
+			if (i == mc_data.notifyconsumer_idx)
+				mc_data.notifyconsumer_idx++;
+			continue;
+		}
+		if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		/* This domain has never been notified. This must be
+		 * a bogus domU command. */
+		*fetch_idx = 0;
+		return NULL;
+	}
+
+	BUG_ON(mn == NULL);
+	*fetch_idx = mn->fetch_idx;
+	mn->valid = 0;
+
+	BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
+	return &mn->mc;
+}
+
+
+void x86_mcinfo_clear(struct mc_info *mi)
+{
+	memset(mi, 0, sizeof(struct mc_info));
+	x86_mcinfo_nentries(mi) = 0;
+}
+
+
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
+{
+	int i;
+	unsigned long end1, end2;
+	struct mcinfo_common *mic, *mic_base, *mic_index;
+
+	mic = (struct mcinfo_common *)mcinfo;
+	mic_index = mic_base = x86_mcinfo_first(mi);
+
+	/* go to first free entry */
+	for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
+		mic_index = x86_mcinfo_next(mic_index);
+	}
+
+	/* check if there is enough space */
+	end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
+	end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
+
+	if (end1 < end2)
+		return -ENOSPC; /* No space. Can't add entry. */
+
+	/* there's enough space. add entry. */
+	memcpy(mic_index, mic, mic->size);
+	x86_mcinfo_nentries(mi)++;
+
+	return 0;
+}
+
+
+/* Dump machine check information in a format
+ * mcelog can parse. This is used only when
+ * Dom0 does not take the notification. */
+void x86_mcinfo_dump(struct mc_info *mi)
+{
+	struct mcinfo_common *mic = NULL;
+	struct mcinfo_global *mc_global;
+	struct mcinfo_bank *mc_bank;
+
+	/* first print the global info */
+	x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
+	if (mic == NULL)
+		return;
+	mc_global = (struct mcinfo_global *)mic;
+	if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+		printk(XENLOG_WARNING
+			"CPU%d: Machine Check Exception: %16"PRIx64"\n",
+			mc_global->mc_coreid, mc_global->mc_gstatus);
+	} else {
+		printk(XENLOG_WARNING "MCE: The hardware reports a non-"
+			"fatal, correctable incident occurred on "
+			"CPU %d.\n",
+			mc_global->mc_coreid);
+	}
+
+	/* then the bank information */
+	x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
+	do {
+		if (mic == NULL)
+			return;
+		if (mic->type != MC_TYPE_BANK)
+			continue;
+
+		mc_bank = (struct mcinfo_bank *)mic;
+
+		printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
+			mc_bank->mc_bank,
+			mc_bank->mc_status);
+		if (mc_bank->mc_status & MCi_STATUS_MISCV)
+			printk("[%16"PRIx64"]", mc_bank->mc_misc);
+		if (mc_bank->mc_status & MCi_STATUS_ADDRV)
+			printk(" at %16"PRIx64, mc_bank->mc_addr);
+
+		printk("\n");
+		mic = x86_mcinfo_next(mic); /* next entry */
+		if ((mic == NULL) || (mic->size == 0))
+			break;
+	} while (1);
+}
+
+
+
+/* Machine Check Architecture Hypercall */
+long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
+{
+	long ret = 0;
+	struct xen_mc curop, *op = &curop;
+	struct vcpu *v = current;
+	struct domain *domU;
+	struct xen_mc_fetch *mc_fetch;
+	struct xen_mc_notifydomain *mc_notifydomain;
+	struct mc_info *mi;
+	uint32_t flags;
+	uint32_t fetch_idx;
+	uint16_t vcpuid;
+	/* Use a different lock for the notify hypercall in order to allow
+	 * a DomU to fetch mc data while Dom0 notifies another DomU. */
+	static DEFINE_SPINLOCK(mc_lock);
+	static DEFINE_SPINLOCK(mc_notify_lock);
+
+	if ( copy_from_guest(op, u_xen_mc, 1) )
+		return -EFAULT;
+
+	if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
+		return -EACCES;
+
+	switch ( op->cmd ) {
+	case XEN_MC_fetch:
+		/* This hypercall is for any domain */
+		mc_fetch = &op->u.mc_fetch;
+
+		switch (mc_fetch->flags) {
+		case XEN_MC_CORRECTABLE:
+			/* But polling mode is Dom0 only, because
+			 * correctable errors are reported to Dom0 only */
+			if ( !IS_PRIV(v->domain) )
+				return -EPERM;
+			break;
+
+		case XEN_MC_TRAP:
+			break;
+		default:
+			return -EFAULT;
+		}
+
+		flags = XEN_MC_OK;
+		spin_lock(&mc_lock);
+
+		if ( IS_PRIV(v->domain) ) {
+			/* This must be Dom0. So a notify hypercall
+			 * can't have happened before. */
+			mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
+		} else {
+			/* Hypercall comes from an unprivileged domain */
+			domU = v->domain;
+			if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
+				/* Dom0 must have notified this DomU before
+				 * via the notify hypercall. */
+				mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v);
+			} else {
+				/* Xen notified the DomU. */
+				mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v);
+			}
+		}
+
+		if (mi) {
+			memcpy(&mc_fetch->mc_info, mi,
+				sizeof(struct mc_info));
+		} else {
+			/* There is no data for a bogus DomU command. */
+			flags |= XEN_MC_NODATA;
+			memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
+		}
+
+		mc_fetch->flags = flags;
+		mc_fetch->fetch_idx = fetch_idx;
+
+		if ( copy_to_guest(u_xen_mc, op, 1) )
+			ret = -EFAULT;
+
+		spin_unlock(&mc_lock);
+		break;
+
+	case XEN_MC_notifydomain:
+		/* This hypercall is for Dom0 only */
+		if ( !IS_PRIV(v->domain) )
+			return -EPERM;
+
+		spin_lock(&mc_notify_lock);
+
+		mc_notifydomain = &op->u.mc_notifydomain;
+		domU = get_domain_by_id(mc_notifydomain->mc_domid);
+		vcpuid = mc_notifydomain->mc_vcpuid;
+
+		if ((domU == NULL) || (domU == dom0)) {
+			/* It's not possible to notify a non-existent domain
+			 * or dom0 itself. */
+			spin_unlock(&mc_notify_lock);
+			return -EACCES;
+		}
+
+		if (vcpuid >= MAX_VIRT_CPUS) {
+			/* It's not possible to notify a vcpu that Xen
+			 * can never assign to a domain. */
+			spin_unlock(&mc_notify_lock);
+			return -EACCES;
+		}
+
+		mc_notifydomain->flags = XEN_MC_OK;
+
+		mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
+		if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
+			/* The error telemetry is not for the guest
+			 * that Dom0 wants to notify. */
+			mc_notifydomain->flags |= XEN_MC_NOMATCH;
+		} else if ( guest_has_trap_callback(domU, vcpuid,
+						TRAP_machine_check) )
+		{
+			/* Send notification */
+			if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
+				mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
+		} else
+			mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
+
+#ifdef DEBUG
+		/* sanity check - these two flags are mutually exclusive */
+		if ((mc_notifydomain->flags & XEN_MC_CANNOTHANDLE) &&
+		    (mc_notifydomain->flags & XEN_MC_NOTDELIVERED))
+			BUG();
+#endif
+
+		if ( copy_to_guest(u_xen_mc, op, 1) )
+			ret = -EFAULT;
+
+		if (ret == 0) {
+			x86_mcinfo_marknotified(mc_notifydomain);
+		}
+
+		spin_unlock(&mc_notify_lock);
+		break;
+	}
+
+	return ret;
+}
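x86_mcinfo_add() packs variable-sized records back to back into a fixed buffer and walks them via the common size field. The following standalone user-space model of that scheme is not part of the patch; the struct names are local stand-ins for the mcinfo types:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    struct rec_common { uint16_t type; uint16_t size; };
    struct rec_a { struct rec_common common; uint64_t payload; };

    static uint8_t buf[128];          /* stand-in for struct mc_info */
    static unsigned int nentries;

    static int buf_add(const void *rec)
    {
        const struct rec_common *c = rec;
        uint8_t *p = buf;
        unsigned int i;

        /* skip over existing records to the first free slot */
        for (i = 0; i < nentries; i++)
            p += ((struct rec_common *)p)->size;
        if (p + c->size > buf + sizeof(buf))
            return -1;                /* no space, can't add entry */
        memcpy(p, rec, c->size);
        nentries++;
        return 0;
    }

    int main(void)
    {
        struct rec_a a = { { 1, sizeof(a) }, 0xdeadbeef };

        buf_add(&a);
        buf_add(&a);
        printf("%u records packed\n", nentries);
        return 0;
    }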
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h	Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/arch/x86/cpu/mcheck/mce.h	Fri Jul 04 14:48:37 2008 +0200
@@ -1,14 +1,30 @@
 #include <xen/init.h>
+#include <asm/traps.h>
 
-void amd_mcheck_init(struct cpuinfo_x86 *c);
+/* Init functions */
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
+void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
+void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
+void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
-/* Call the installed machine check handler for this CPU setup. */
-extern fastcall void (*machine_check_vector)(struct cpu_user_regs *, long error_code);
+/* Function pointer used in the handlers to collect additional information
+ * provided by newer CPU families/models without the need to duplicate
+ * the whole handler, resulting in various handlers each with its own
+ * tweaks and bugs */
+extern int (*mc_callback_bank_extended)(struct mc_info *mi,
+		uint16_t bank, uint64_t status);
+
+/* Helper functions used for collecting error telemetry */
+struct mc_info *x86_mcinfo_getptr(void);
+void x86_mcinfo_clear(struct mc_info *mi);
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
+void x86_mcinfo_dump(struct mc_info *mi);
+
+/* Global variables */
 extern int mce_disabled __initdata;
-extern int nr_mce_banks;
-
+extern unsigned int nr_mce_banks;
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c	Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c	Fri Jul 04 14:48:37 2008 +0200
@@ -68,19 +68,29 @@ static int __init init_nonfatal_mce_chec
 	if (!cpu_has(c, X86_FEATURE_MCA))
 		return -ENODEV;
 
-	/* Some Athlons misbehave when we frob bank 0 */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		boot_cpu_data.x86 == 6)
-		firstbank = 1;
-	else
-		firstbank = 0;
-
 	/*
 	 * Check for non-fatal errors every MCE_RATE s
 	 */
-	init_timer(&mce_timer, mce_work_fn, NULL, 0);
-	set_timer(&mce_timer, NOW() + MCE_PERIOD);
-	printk(KERN_INFO "Machine check exception polling timer started.\n");
+	switch (c->x86_vendor) {
+	case X86_VENDOR_AMD:
+		if (c->x86 == 6) { /* K7 */
+			firstbank = 1;
+			init_timer(&mce_timer, mce_work_fn, NULL, 0);
+			set_timer(&mce_timer, NOW() + MCE_PERIOD);
+			break;
+		}
+
+		/* Assume we are on K8 or newer AMD CPU here */
+		amd_nonfatal_mcheck_init(c);
+		break;
+
+	case X86_VENDOR_INTEL:
+		init_timer(&mce_timer, mce_work_fn, NULL, 0);
+		set_timer(&mce_timer, NOW() + MCE_PERIOD);
+		break;
+	}
+
+	printk(KERN_INFO "MCA: Machine check polling timer started.\n");
 	return 0;
 }
 __initcall(init_nonfatal_mce_checker);
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/x86_mca.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Fri Jul 04 14:48:37 2008 +0200
@@ -0,0 +1,72 @@
+/*
+ * MCA implementation for AMD K7/K8 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+/* The MCA/MCE MSRs should not be used anywhere else.
+ * They are cpu family/model specific and are only for use
+ * in terms of machine check handling.
+ * So we define them here rather than in <asm/msr.h>.
+ */
+
+
+/* Bitfield of the MSR_IA32_MCG_CAP register */
+#define MCG_CAP_COUNT           0x00000000000000ffULL
+#define MCG_CTL_P               0x0000000000000100ULL
+/* Bits 9-63 are reserved */
+
+/* Bitfield of the MSR_IA32_MCG_STATUS register */
+#define MCG_STATUS_RIPV         0x0000000000000001ULL
+#define MCG_STATUS_EIPV         0x0000000000000002ULL
+#define MCG_STATUS_MCIP         0x0000000000000004ULL
+/* Bits 3-63 are reserved */
+
+/* Bitfield of MSR_K8_MCi_STATUS registers */
+/* MCA error code */
+#define MCi_STATUS_MCA          0x000000000000ffffULL
+/* model-specific error code */
+#define MCi_STATUS_MSEC         0x00000000ffff0000ULL
+/* Other information */
+#define MCi_STATUS_OTHER        0x01ffffff00000000ULL
+/* processor context corrupt */
+#define MCi_STATUS_PCC          0x0200000000000000ULL
+/* MSR_K8_MCi_ADDR register valid */
+#define MCi_STATUS_ADDRV        0x0400000000000000ULL
+/* MSR_K8_MCi_MISC register valid */
+#define MCi_STATUS_MISCV        0x0800000000000000ULL
+/* error condition enabled */
+#define MCi_STATUS_EN           0x1000000000000000ULL
+/* uncorrected error */
+#define MCi_STATUS_UC           0x2000000000000000ULL
+/* status register overflow */
+#define MCi_STATUS_OVER         0x4000000000000000ULL
+/* valid */
+#define MCi_STATUS_VAL          0x8000000000000000ULL
+
+/* Bitfield of the MCi_STATUS_OTHER field */
+/* reserved bits */
+#define MCi_STATUS_OTHER_RESERVED1      0x00001fff00000000ULL
+/* uncorrectable ECC error */
+#define MCi_STATUS_OTHER_UC_ECC         0x0000200000000000ULL
+/* correctable ECC error */
+#define MCi_STATUS_OTHER_C_ECC          0x0000400000000000ULL
+/* ECC syndrome of an ECC error */
+#define MCi_STATUS_OTHER_ECC_SYNDROME   0x007f800000000000ULL
+/* reserved bits */
+#define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
+
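The MCi_STATUS masks above can be exercised outside the hypervisor. A standalone user-space sketch, not part of the patch, that decodes a status value with the same constants; the sample value is made up for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define MCi_STATUS_MCA   0x000000000000ffffULL /* MCA error code */
    #define MCi_STATUS_PCC   0x0200000000000000ULL /* context corrupt */
    #define MCi_STATUS_ADDRV 0x0400000000000000ULL /* MCi_ADDR valid */
    #define MCi_STATUS_MISCV 0x0800000000000000ULL /* MCi_MISC valid */
    #define MCi_STATUS_UC    0x2000000000000000ULL /* uncorrected */
    #define MCi_STATUS_OVER  0x4000000000000000ULL /* overflow */
    #define MCi_STATUS_VAL   0x8000000000000000ULL /* valid */

    static void decode_status(uint64_t status)
    {
        if (!(status & MCi_STATUS_VAL)) {
            printf("no valid error logged\n");
            return;
        }
        printf("MCA error code %#06llx%s%s%s%s%s\n",
               (unsigned long long)(status & MCi_STATUS_MCA),
               (status & MCi_STATUS_UC)    ? " uncorrected" : " corrected",
               (status & MCi_STATUS_OVER)  ? " overflow" : "",
               (status & MCi_STATUS_PCC)   ? " context-corrupt" : "",
               (status & MCi_STATUS_ADDRV) ? " addr-valid" : "",
               (status & MCi_STATUS_MISCV) ? " misc-valid" : "");
    }

    int main(void)
    {
        decode_status(0xb400000000000151ULL); /* VAL|UC|EN|ADDRV, code 0x151 */
        return 0;
    }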
"masked " : ""); else printk("dom0 vcpu0: NMI neither pending nor masked\n"); } diff -r 959db3c01837 xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/traps.c Fri Jul 04 14:48:37 2008 +0200 @@ -487,6 +487,20 @@ static unsigned int check_guest_io_break } /* + * Called from asm to set up the MCE trapbounce info. + * Returns 0 if no callback is set up, else 1. + */ +asmlinkage int set_guest_machinecheck_trapbounce(void) +{ + struct vcpu *v = current; + struct trap_bounce *tb = &v->arch.trap_bounce; + + do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0); + tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */ + return !null_trap_bounce(v, tb); +} + +/* * Called from asm to set up the NMI trapbounce info. * Returns 0 if no callback is set up, else 1. */ @@ -905,8 +919,6 @@ asmlinkage void do_int3(struct cpu_user_ asmlinkage void do_machine_check(struct cpu_user_regs *regs) { - extern fastcall void (*machine_check_vector)( - struct cpu_user_regs *, long error_code); machine_check_vector(regs, regs->error_code); } @@ -3037,6 +3049,24 @@ long unregister_guest_nmi_callback(void) return 0; } +int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr) +{ + struct vcpu *v; + struct trap_info *t; + + BUG_ON(d == NULL); + BUG_ON(vcpuid >= MAX_VIRT_CPUS); + + /* Sanity check - XXX should be more fine grained. */ + BUG_ON(trap_nr > TRAP_syscall); + + v = d->vcpu[vcpuid]; + t = &v->arch.guest_context.trap_ctxt[trap_nr]; + + return (t->address != 0); +} + + int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr) { struct vcpu *v; @@ -3057,6 +3087,23 @@ int send_guest_trap(struct domain *d, ui /* not safe to wake up a vcpu here */ raise_softirq(NMI_MCE_SOFTIRQ); return 0; + } + break; + + case TRAP_machine_check: + + /* We are called by the machine check (exception or polling) handlers + * on the physical CPU that reported a machine check error. 
*/ + + if ( !test_and_set_bool(v->mce_pending) ) { + st = &per_cpu(softirq_trap, smp_processor_id()); + st->domain = d; + st->vcpu = v; + st->processor = v->processor; + + /* not safe to wake up a vcpu here */ + raise_softirq(NMI_MCE_SOFTIRQ); + return 0; } break; } diff -r 959db3c01837 xen/arch/x86/x86_32/asm-offsets.c --- a/xen/arch/x86/x86_32/asm-offsets.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_32/asm-offsets.c Fri Jul 04 14:48:37 2008 +0200 @@ -67,7 +67,11 @@ void __dummy__(void) arch.guest_context.kernel_sp); OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags); OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending); - OFFSET(VCPU_nmi_masked, struct vcpu, nmi_masked); + OFFSET(VCPU_mce_pending, struct vcpu, mce_pending); + OFFSET(VCPU_old_trap_priority, struct vcpu, old_trap_priority); + OFFSET(VCPU_trap_priority, struct vcpu, trap_priority); + DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI); + DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE); DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events); BLANK(); diff -r 959db3c01837 xen/arch/x86/x86_32/entry.S --- a/xen/arch/x86/x86_32/entry.S Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_32/entry.S Fri Jul 04 14:48:37 2008 +0200 @@ -229,6 +229,8 @@ test_all_events: shl $IRQSTAT_shift,%eax test %ecx,irq_stat(%eax,1) jnz process_softirqs + testb $1,VCPU_mce_pending(%ebx) + jnz process_mce testb $1,VCPU_nmi_pending(%ebx) jnz process_nmi test_guest_events: @@ -255,15 +257,35 @@ process_softirqs: jmp test_all_events ALIGN +/* %ebx: struct vcpu */ +process_mce: + cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%ebx) + jae test_guest_events + sti + movb $0,VCPU_mce_pending(%ebx) + call set_guest_machinecheck_trapbounce + test %eax,%eax + jz test_all_events + movw VCPU_trap_priority(%ebx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%ebx) # iret hypercall + movw $VCPU_TRAP_MCE,VCPU_trap_priority(%ebx) + jmp process_trap + + ALIGN +/* %ebx: struct vcpu */ process_nmi: - testb $1,VCPU_nmi_masked(%ebx) - jnz test_guest_events + cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%ebx) + jae test_guest_events sti movb $0,VCPU_nmi_pending(%ebx) call set_guest_nmi_trapbounce test %eax,%eax jz test_all_events - movb $1,VCPU_nmi_masked(%ebx) + movw VCPU_trap_priority(%ebx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%ebx) # iret hypercall + movw $VCPU_TRAP_NMI,VCPU_trap_priority(%ebx) + /* FALLTHROUGH */ +process_trap: leal VCPU_trap_bounce(%ebx),%edx call create_bounce_frame jmp test_all_events @@ -681,6 +703,10 @@ ENTRY(hypercall_table) .long do_sysctl /* 35 */ .long do_domctl .long do_kexec_op + .rept __HYPERVISOR_arch_0-((.-hypercall_table)/4) + .long do_ni_hypercall + .endr + .long do_mca /* 48 */ .rept NR_hypercalls-((.-hypercall_table)/4) .long do_ni_hypercall .endr @@ -724,6 +750,10 @@ ENTRY(hypercall_args_table) .byte 1 /* do_sysctl */ /* 35 */ .byte 1 /* do_domctl */ .byte 2 /* do_kexec_op */ + .rept __HYPERVISOR_arch_0-(.-hypercall_args_table) + .byte 0 /* do_ni_hypercall */ + .endr + .byte 1 /* do_mca */ /* 48 */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr diff -r 959db3c01837 xen/arch/x86/x86_32/traps.c --- a/xen/arch/x86/x86_32/traps.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_32/traps.c Fri Jul 04 14:48:37 2008 +0200 @@ -256,11 +256,12 @@ unsigned long do_iret(void) } /* Restore affinity. 
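The assembly above implements a small priority scheme: a pending trap is delivered only if it outranks the trap the vcpu is currently handling, and the old priority is saved so the iret hypercall can restore it. A standalone C model of that rule, not part of the patch; it assumes the VCPU_TRAP_* ordering NONE < NMI < MCE implied by the cmpw checks:

    #include <stdio.h>

    #define VCPU_TRAP_NONE 0
    #define VCPU_TRAP_NMI  1
    #define VCPU_TRAP_MCE  2 /* MCE outranks NMI */

    struct vcpu_model {
        int trap_priority;     /* priority of the trap handled now */
        int old_trap_priority; /* restored by the iret hypercall */
    };

    /* Try to deliver a trap; mirrors process_mce/process_nmi. */
    static int deliver(struct vcpu_model *v, int prio, const char *name)
    {
        if (v->trap_priority >= prio) {
            printf("%s deferred (current priority %d)\n", name, v->trap_priority);
            return 0;
        }
        v->old_trap_priority = v->trap_priority; /* save for iret */
        v->trap_priority = prio;
        printf("%s delivered\n", name);
        return 1;
    }

    int main(void)
    {
        struct vcpu_model v = { VCPU_TRAP_NONE, VCPU_TRAP_NONE };

        deliver(&v, VCPU_TRAP_NMI, "NMI"); /* delivered */
        deliver(&v, VCPU_TRAP_NMI, "NMI"); /* deferred: already at NMI */
        deliver(&v, VCPU_TRAP_MCE, "MCE"); /* delivered: MCE outranks NMI */
        return 0;
    }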
*/ - if (v->nmi_masked && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) + if ((v->trap_priority >= VCPU_TRAP_NMI) + && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) vcpu_set_affinity(v, &v->cpu_affinity_tmp); - /* No longer in NMI context. */ - v->nmi_masked = 0; + /* Restore previous trap priority */ + v->trap_priority = v->old_trap_priority; /* Restore upcall mask from supplied EFLAGS.IF. */ vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF); diff -r 959db3c01837 xen/arch/x86/x86_64/asm-offsets.c --- a/xen/arch/x86/x86_64/asm-offsets.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_64/asm-offsets.c Fri Jul 04 14:48:37 2008 +0200 @@ -92,7 +92,11 @@ void __dummy__(void) OFFSET(VCPU_kernel_ss, struct vcpu, arch.guest_context.kernel_ss); OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags); OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending); - OFFSET(VCPU_nmi_masked, struct vcpu, nmi_masked); + OFFSET(VCPU_mce_pending, struct vcpu, mce_pending); + OFFSET(VCPU_old_trap_priority, struct vcpu, old_trap_priority); + OFFSET(VCPU_trap_priority, struct vcpu, trap_priority); + DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI); + DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE); DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events); DEFINE(_VGCF_syscall_disables_events, _VGCF_syscall_disables_events); BLANK(); diff -r 959db3c01837 xen/arch/x86/x86_64/compat/entry.S --- a/xen/arch/x86/x86_64/compat/entry.S Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_64/compat/entry.S Fri Jul 04 14:48:37 2008 +0200 @@ -101,6 +101,8 @@ ENTRY(compat_test_all_events) leaq irq_stat(%rip),%rcx testl $~0,(%rcx,%rax,1) jnz compat_process_softirqs + testb $1,VCPU_mce_pending(%rbx) + jnz compat_process_mce testb $1,VCPU_nmi_pending(%rbx) jnz compat_process_nmi compat_test_guest_events: @@ -129,15 +131,34 @@ compat_process_softirqs: ALIGN /* %rbx: struct vcpu */ +compat_process_mce: + cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx) + jae compat_test_guest_events + sti + movb $0,VCPU_mce_pending(%rbx) + call set_guest_machinecheck_trapbounce + testl %eax,%eax + jz compat_test_all_events + movw VCPU_trap_priority(%rbx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%rbx) # iret hypercall + movw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx) + jmp compat_process_trap + + ALIGN +/* %rbx: struct vcpu */ compat_process_nmi: - testb $1,VCPU_nmi_masked(%rbx) - jnz compat_test_guest_events + cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx) + jae compat_test_guest_events sti movb $0,VCPU_nmi_pending(%rbx) call set_guest_nmi_trapbounce testl %eax,%eax jz compat_test_all_events - movb $1,VCPU_nmi_masked(%rbx) + movw VCPU_trap_priority(%rbx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%rbx) # iret hypercall + movw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx) + /* FALLTHROUGH */ +compat_process_trap: leaq VCPU_trap_bounce(%rbx),%rdx call compat_create_bounce_frame jmp compat_test_all_events @@ -386,6 +407,10 @@ ENTRY(compat_hypercall_table) .quad do_sysctl /* 35 */ .quad do_domctl .quad compat_kexec_op + .rept __HYPERVISOR_arch_0-((.-compat_hypercall_table)/8) + .quad compat_ni_hypercall + .endr + .quad do_mca /* 48 */ .rept NR_hypercalls-((.-compat_hypercall_table)/8) .quad compat_ni_hypercall .endr @@ -429,6 +454,10 @@ ENTRY(compat_hypercall_args_table) .byte 1 /* do_sysctl */ /* 35 */ .byte 1 /* do_domctl */ .byte 2 /* compat_kexec_op */ + .rept __HYPERVISOR_arch_0-(.-compat_hypercall_args_table) + .byte 0 /* compat_ni_hypercall */ + .endr + .byte 1 /* do_mca */ .rept 
NR_hypercalls-(.-compat_hypercall_args_table) .byte 0 /* compat_ni_hypercall */ .endr diff -r 959db3c01837 xen/arch/x86/x86_64/compat/traps.c --- a/xen/arch/x86/x86_64/compat/traps.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_64/compat/traps.c Fri Jul 04 14:48:37 2008 +0200 @@ -122,11 +122,12 @@ unsigned int compat_iret(void) regs->_esp += 16; /* Restore affinity. */ - if (v->nmi_masked && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) + if ((v->trap_priority >= VCPU_TRAP_NMI) + && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) vcpu_set_affinity(v, &v->cpu_affinity_tmp); - /* No longer in NMI context. */ - v->nmi_masked = 0; + /* Restore previous trap priority */ + v->trap_priority = v->old_trap_priority; /* Restore upcall mask from supplied EFLAGS.IF. */ vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF); diff -r 959db3c01837 xen/arch/x86/x86_64/entry.S --- a/xen/arch/x86/x86_64/entry.S Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_64/entry.S Fri Jul 04 14:48:37 2008 +0200 @@ -205,6 +205,8 @@ test_all_events: leaq irq_stat(%rip),%rcx testl $~0,(%rcx,%rax,1) jnz process_softirqs + testb $1,VCPU_mce_pending(%rbx) + jnz process_mce testb $1,VCPU_nmi_pending(%rbx) jnz process_nmi test_guest_events: @@ -231,15 +233,34 @@ process_softirqs: ALIGN /* %rbx: struct vcpu */ +process_mce: + cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx) + jae test_guest_events + sti + movb $0,VCPU_mce_pending(%rbx) + call set_guest_machinecheck_trapbounce + test %eax,%eax + jz test_all_events + movw VCPU_trap_priority(%rbx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%rbx) # iret hypercall + movw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx) + jmp process_trap + + ALIGN +/* %rbx: struct vcpu */ process_nmi: - testb $1,VCPU_nmi_masked(%rbx) - jnz test_guest_events + cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx) + jae test_guest_events sti movb $0,VCPU_nmi_pending(%rbx) call set_guest_nmi_trapbounce test %eax,%eax jz test_all_events - movb $1,VCPU_nmi_masked(%rbx) + movw VCPU_trap_priority(%rbx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%rbx) # iret hypercall + movw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx) + /* FALLTHROUGH */ +process_trap: leaq VCPU_trap_bounce(%rbx),%rdx call create_bounce_frame jmp test_all_events @@ -671,6 +692,10 @@ ENTRY(hypercall_table) .quad do_sysctl /* 35 */ .quad do_domctl .quad do_kexec_op + .rept __HYPERVISOR_arch_0-((.-hypercall_table)/8) + .quad do_ni_hypercall + .endr + .quad do_mca /* 48 */ .rept NR_hypercalls-((.-hypercall_table)/8) .quad do_ni_hypercall .endr @@ -715,6 +740,10 @@ ENTRY(hypercall_args_table) .byte 1 /* do_domctl */ .byte 2 /* do_kexec */ .byte 1 /* do_xsm_op */ + .rept __HYPERVISOR_arch_0-(.-hypercall_args_table) + .byte 0 /* do_ni_hypercall */ + .endr + .byte 1 /* do_mca */ /* 48 */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr diff -r 959db3c01837 xen/arch/x86/x86_64/traps.c --- a/xen/arch/x86/x86_64/traps.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_64/traps.c Fri Jul 04 14:48:37 2008 +0200 @@ -289,11 +289,12 @@ unsigned long do_iret(void) } /* Restore affinity. */ - if (v->nmi_masked && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) + if ((v->trap_priority >= VCPU_TRAP_NMI) + && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) vcpu_set_affinity(v, &v->cpu_affinity_tmp); - /* No longer in NMI context. 
diff -r 959db3c01837 xen/common/domain.c
--- a/xen/common/domain.c Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/common/domain.c Fri Jul 04 14:48:37 2008 +0200
@@ -654,7 +654,9 @@ void vcpu_reset(struct vcpu *v)
     v->is_polling      = 0;
     v->is_initialised  = 0;
     v->nmi_pending     = 0;
-    v->nmi_masked      = 0;
+    v->mce_pending     = 0;
+    v->old_trap_priority = VCPU_TRAP_NONE;
+    v->trap_priority   = VCPU_TRAP_NONE;
     clear_bit(_VPF_blocked, &v->pause_flags);

     domain_unlock(v->domain);
diff -r 959db3c01837 xen/common/event_channel.c
--- a/xen/common/event_channel.c Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/common/event_channel.c Fri Jul 04 14:48:37 2008 +0200
@@ -587,6 +587,21 @@ void send_guest_vcpu_virq(struct vcpu *v
     evtchn_set_pending(v, port);
 }

+int guest_enabled_event(struct vcpu *v, int virq)
+{
+    int port;
+
+    if ( unlikely(v == NULL) )
+        return 0;
+
+    port = v->virq_to_evtchn[virq];
+    if ( port == 0 )
+        return 0;
+
+    /* virq is in use */
+    return 1;
+}
+
 void send_guest_global_virq(struct domain *d, int virq)
 {
     int port;
diff -r 959db3c01837 xen/include/Makefile
--- a/xen/include/Makefile Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/Makefile Fri Jul 04 14:48:37 2008 +0200
@@ -20,6 +20,7 @@ headers-y := \
     compat/xen.h \
     compat/xencomm.h \
     compat/xenoprof.h
+headers-$(CONFIG_X86) += compat/arch-x86/xen-mca.h
 headers-$(CONFIG_X86) += compat/arch-x86/xen.h
 headers-$(CONFIG_X86) += compat/arch-x86/xen-$(compat-arch-y).h
 headers-y += compat/arch-$(compat-arch-y).h compat/xlat.h
diff -r 959db3c01837 xen/include/asm-x86/event.h
--- a/xen/include/asm-x86/event.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/asm-x86/event.h Fri Jul 04 14:48:37 2008 +0200
@@ -69,7 +69,12 @@ static inline void local_event_delivery_
 /* No arch specific virq definition now. Default to global. */
 static inline int arch_virq_is_global(int virq)
 {
-    return 1;
+    switch (virq) {
+    case VIRQ_MCA:
+        return 1;
+    default:
+        return 1;
+    }
 }

 #endif
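guest_enabled_event() gives the machine check code a cheap way to ask whether
Dom0 ever bound VIRQ_MCA before raising it. A hypothetical caller (the actual
notification site lives in the mcheck code, not in this hunk) could look like:

    /* Illustrative caller: raise VIRQ_MCA only if Dom0 bound it. */
    static void notify_dom0_mca_sketch(struct domain *dom0)
    {
        if ( guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) )
            send_guest_global_virq(dom0, VIRQ_MCA);
    }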
diff -r 959db3c01837 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/asm-x86/mm.h Fri Jul 04 14:48:37 2008 +0200
@@ -141,6 +141,9 @@ static inline u32 pickle_domptr(struct d
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))

+#define maddr_get_owner(ma)   (page_get_owner(maddr_to_page((ma))))
+#define vaddr_get_owner(va)   (page_get_owner(virt_to_page((va))))
+
 #define XENSHARE_writable 0
 #define XENSHARE_readonly 1
 extern void share_xen_page_with_guest(
diff -r 959db3c01837 xen/include/asm-x86/traps.h
--- a/xen/include/asm-x86/traps.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/asm-x86/traps.h Fri Jul 04 14:48:37 2008 +0200
@@ -26,6 +26,18 @@ struct softirq_trap {
     int processor;     /* physical cpu to inject trap */
 };

+struct cpu_user_regs;
+
+extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
+
+/**
+ * guest_has_trap_callback
+ *
+ * returns true (non-zero) if guest registered a trap handler
+ */
+extern int guest_has_trap_callback(struct domain *d, uint16_t vcpuid,
+                                   unsigned int trap_nr);
+
 /**
  * send_guest_trap
  *
@@ -35,5 +47,4 @@ extern int send_guest_trap(struct domain
 extern int send_guest_trap(struct domain *d, uint16_t vcpuid,
                            unsigned int trap_nr);
-
 #endif /* ASM_TRAP_H */
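The two declarations above split the uncorrectable-error path into a query and
an action: check that the guest installed a machine check handler, then bounce
the trap into it. A hedged sketch of how a caller might combine them (the
function below is illustrative; TRAP_machine_check is Xen's existing vector 18
definition):

    /* Illustrative caller, not part of this patch: only bounce a machine
     * check into the guest if it actually installed a handler; otherwise
     * the caller must fall back to another recovery strategy. */
    static int deliver_mce_sketch(struct domain *d, uint16_t vcpuid)
    {
        if ( !guest_has_trap_callback(d, vcpuid, TRAP_machine_check) )
            return -1;

        return send_guest_trap(d, vcpuid, TRAP_machine_check);
    }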
diff -r 959db3c01837 xen/include/public/arch-x86/xen-mca.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/arch-x86/xen-mca.h Fri Jul 04 14:48:37 2008 +0200
@@ -0,0 +1,279 @@
+/******************************************************************************
+ * arch-x86/mca.h
+ *
+ * Contributed by Advanced Micro Devices, Inc.
+ * Author: Christoph Egger
+ *
+ * Guest OS machine check interface to x86 Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Full MCA functionality has the following usecases from the guest side:
+ *
+ * Must-haves:
+ * 1. Dom0 and DomU register machine check trap callback handlers
+ *    (already done via "set_trap_table" hypercall)
+ * 2. Dom0 registers machine check event callback handler
+ *    (doable via EVTCHNOP_bind_virq)
+ * 3. Dom0 and DomU fetch machine check data
+ * 4. Dom0 wants Xen to notify a DomU
+ * 5. Dom0 gets DomU ID from physical address
+ * 6. Dom0 wants Xen to kill DomU (already done for "xm destroy")
+ *
+ * Nice-to-haves:
+ * 7. Dom0 wants Xen to deactivate a physical CPU
+ *    This is better done as a separate task, physical CPU hotplugging,
+ *    and hypercall(s) should be sysctls
+ * 8. Page migration proposed from Xen NUMA work, where Dom0 can tell Xen to
+ *    move a DomU (or Dom0 itself) away from a malicious page
+ *    producing correctable errors.
+ * 9. Offlining physical page:
+ *    Xen frees and never re-uses a certain physical page.
+ * 10. Test facility: Allow Dom0 to write values into machine check MSRs
+ *     and tell Xen to trigger a machine check
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
+#define __XEN_PUBLIC_ARCH_X86_MCA_H__
+
+/* Hypercall */
+#define __HYPERVISOR_mca __HYPERVISOR_arch_0
+
+#define XEN_MCA_INTERFACE_VERSION 0x03000001
+
+/* IN: Dom0 calls hypercall from MC event handler. */
+#define XEN_MC_CORRECTABLE  0x0
+/* IN: Dom0/DomU calls hypercall from MC trap handler. */
+#define XEN_MC_TRAP         0x1
+/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
+
+/* OUT: All is ok */
+#define XEN_MC_OK           0x0
+/* OUT: Domain could not fetch data. */
+#define XEN_MC_FETCHFAILED  0x1
+/* OUT: There was no machine check data to fetch. */
+#define XEN_MC_NODATA       0x2
+/* OUT: Between notification time and this hypercall another
+ * (most likely correctable) error happened. The fetched data
+ * does not match the original machine check data. */
+#define XEN_MC_NOMATCH      0x4
+
+/* OUT: DomU did not register MC NMI handler. Try something else. */
+#define XEN_MC_CANNOTHANDLE 0x8
+/* OUT: Notifying DomU failed. Retry later or try something else. */
+#define XEN_MC_NOTDELIVERED 0x10
+/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */
+
+
+#ifndef __ASSEMBLY__
+
+#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
+
+/*
+ * Machine Check Architecture:
+ * structs are read-only and used to report all kinds of
+ * correctable and uncorrectable errors detected by the HW.
+ * Dom0 and DomU: register a handler to get notified.
+ * Dom0 only: Correctable errors are reported via VIRQ_MCA
+ * Dom0 and DomU: Uncorrectable errors are reported via nmi handlers
+ */
+#define MC_TYPE_GLOBAL   0
+#define MC_TYPE_BANK     1
+#define MC_TYPE_EXTENDED 2
+
+struct mcinfo_common {
+    uint16_t type; /* structure type */
+    uint16_t size; /* size of this struct in bytes */
+};
+
+
+#define MC_FLAG_CORRECTABLE   (1 << 0)
+#define MC_FLAG_UNCORRECTABLE (1 << 1)
+
+/* contains global x86 mc information */
+struct mcinfo_global {
+    struct mcinfo_common common;
+
+    /* running domain at the time in error (most likely the impacted one) */
+    uint16_t mc_domid;
+    uint32_t mc_socketid;      /* physical socket of the physical core */
+    uint16_t mc_coreid;        /* physical impacted core */
+    uint16_t mc_core_threadid; /* core thread of physical core */
+    uint16_t mc_vcpuid;        /* virtual cpu scheduled for mc_domid */
+    uint64_t mc_gstatus;       /* global status */
+    uint32_t mc_flags;
+};
+
+/* contains bank local x86 mc information */
+struct mcinfo_bank {
+    struct mcinfo_common common;
+
+    uint16_t mc_bank;   /* bank nr */
+    uint16_t mc_domid;  /* Usecase 5: domain referenced by mc_addr on dom0
+                         * and if mc_addr is valid. Never valid on DomU. */
+    uint64_t mc_status; /* bank status */
+    uint64_t mc_addr;   /* bank address, only valid
+                         * if addr bit is set in mc_status */
+    uint64_t mc_misc;
+};
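The mc_addr comment above follows the x86 MCA convention that the address
register is only architecturally valid when the ADDRV bit of the bank status
is set. A small illustrative decoder (the bit positions are the standard
MCi_STATUS layout, not definitions from this header):

    #include <stdint.h>

    #define MCi_STATUS_VAL   (1ULL << 63)  /* bank status valid */
    #define MCi_STATUS_ADDRV (1ULL << 58)  /* mc_addr valid */

    /* Returns 1 and stores the error address if mc_addr is valid. */
    static int bank_address(uint64_t mc_status, uint64_t mc_addr,
                            uint64_t *addr_out)
    {
        if (!(mc_status & MCi_STATUS_VAL) || !(mc_status & MCi_STATUS_ADDRV))
            return 0;
        *addr_out = mc_addr;
        return 1;
    }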
+
+struct mcinfo_msr {
+    uint64_t reg;   /* MSR */
+    uint64_t value; /* MSR value */
+};
+
+/* contains mc information from other
+ * or additional mc MSRs */
+struct mcinfo_extended {
+    struct mcinfo_common common;
+
+    /* You can fill up to five registers.
+     * If you need more, then use this structure
+     * multiple times. */
+
+    uint32_t mc_msrs; /* Number of MSRs with valid values. */
+    struct mcinfo_msr mc_msr[5];
+};
+
+#define MCINFO_HYPERCALLSIZE 1024
+#define MCINFO_MAXSIZE        768
+
+struct mc_info {
+    /* Number of mcinfo_* entries in mi_data */
+    uint32_t mi_nentries;
+
+    uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
+};
+typedef struct mc_info mc_info_t;
+
+
+
+/*
+ * OSes should use these instead of writing their own lookup function,
+ * each with its own bugs and drawbacks.
+ * We use macros instead of static inline functions to allow guests
+ * to include this header in assembly files (*.S).
+ */
+/* Prototype:
+ * uint32_t x86_mcinfo_nentries(struct mc_info *mi);
+ */
+#define x86_mcinfo_nentries(_mi)    \
+    (_mi)->mi_nentries
+/* Prototype:
+ * struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
+ */
+#define x86_mcinfo_first(_mi)       \
+    (struct mcinfo_common *)((_mi)->mi_data)
+/* Prototype:
+ * struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
+ */
+#define x86_mcinfo_next(_mic)       \
+    (struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)
+
+/* Prototype:
+ * void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
+ */
+#define x86_mcinfo_lookup(_ret, _mi, _type)                 \
+    do {                                                    \
+        uint32_t found, i;                                  \
+        struct mcinfo_common *_mic;                         \
+                                                            \
+        found = 0;                                          \
+        (_ret) = NULL;                                      \
+        if (_mi == NULL) break;                             \
+        _mic = x86_mcinfo_first(_mi);                       \
+        for (i = 0; i < x86_mcinfo_nentries(_mi); i++) {    \
+            if (_mic->type == (_type)) {                    \
+                found = 1;                                  \
+                break;                                      \
+            }                                               \
+            _mic = x86_mcinfo_next(_mic);                   \
+        }                                                   \
+        (_ret) = found ? _mic : NULL;                       \
+    } while (0)
+
+
+/* Usecase 1
+ * Register machine check trap callback handler
+ * (already done via "set_trap_table" hypercall)
+ */
+
+/* Usecase 2
+ * Dom0 registers machine check event callback handler
+ * done by EVTCHNOP_bind_virq
+ */
+
+/* Usecase 3
+ * Fetch machine check data from hypervisor.
+ * Note: this hypercall is special, because both Dom0 and DomU must use it.
+ */
+#define XEN_MC_fetch 1
+struct xen_mc_fetch {
+    /* IN/OUT variables. */
+    uint32_t flags;
+
+/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
+/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
+
+    /* OUT variables. */
+    uint32_t fetch_idx; /* only useful for Dom0 for the notify hypercall */
+    struct mc_info mc_info;
+};
+typedef struct xen_mc_fetch xen_mc_fetch_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
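As the comment says, guests should use the lookup macros rather than
open-coding the walk over mi_data. A hypothetical Dom0 fetch path, sketched
under the assumption of a HYPERVISOR_mca() hypercall wrapper in the guest
kernel (the wrapper and memset source are not defined by this header):

    /* Illustrative guest-side fetch, not part of the patch. */
    static void fetch_mc_sketch(void)
    {
        struct xen_mc mc;
        struct mcinfo_common *mic;
        struct mcinfo_global *mig;

        memset(&mc, 0, sizeof(mc));   /* guest's own string routines */
        mc.cmd = XEN_MC_fetch;
        mc.interface_version = XEN_MCA_INTERFACE_VERSION;
        mc.u.mc_fetch.flags = XEN_MC_CORRECTABLE;

        if (HYPERVISOR_mca(&mc))      /* assumed hypercall wrapper */
            return;
        if (mc.u.mc_fetch.flags & (XEN_MC_NODATA | XEN_MC_FETCHFAILED))
            return;

        x86_mcinfo_lookup(mic, &mc.u.mc_fetch.mc_info, MC_TYPE_GLOBAL);
        if (mic == NULL)
            return;
        mig = (struct mcinfo_global *)mic;
        /* mig->mc_domid, mig->mc_gstatus etc. now describe the event. */
        (void)mig;
    }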
+
+/* Usecase 4
+ * This tells the hypervisor to notify a DomU about the machine check error
+ */
+#define XEN_MC_notifydomain 2
+struct xen_mc_notifydomain {
+    /* IN variables. */
+    uint16_t mc_domid;  /* The unprivileged domain to notify. */
+    uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify.
+                         * Usually echoed value from the fetch hypercall. */
+    uint32_t fetch_idx; /* echoed value from the fetch hypercall. */
+
+    /* IN/OUT variables. */
+    uint32_t flags;
+
+/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
+/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */
+};
+typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_notifydomain_t);
+
+
+struct xen_mc {
+    uint32_t cmd;
+    uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
+    union {
+        struct xen_mc_fetch        mc_fetch;
+        struct xen_mc_notifydomain mc_notifydomain;
+        uint8_t pad[MCINFO_HYPERCALLSIZE];
+    } u;
+};
+typedef struct xen_mc xen_mc_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
diff -r 959db3c01837 xen/include/public/arch-x86/xen.h
--- a/xen/include/public/arch-x86/xen.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/public/arch-x86/xen.h Fri Jul 04 14:48:37 2008 +0200
@@ -76,6 +76,10 @@ typedef unsigned long xen_pfn_t;

 /* Maximum number of virtual CPUs in multi-processor guests. */
 #define MAX_VIRT_CPUS 32
+
+/* Machine check support */
+#include "xen-mca.h"
+
 #ifndef __ASSEMBLY__

 typedef unsigned long xen_ulong_t;
diff -r 959db3c01837 xen/include/xen/event.h
--- a/xen/include/xen/event.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/xen/event.h Fri Jul 04 14:48:37 2008 +0200
@@ -50,6 +50,9 @@ void free_xen_event_channel(
 void free_xen_event_channel(
     struct vcpu *local_vcpu, int port);

+/* Query if event channel is in use by the guest */
+int guest_enabled_event(struct vcpu *v, int virq);
+
 /* Notify remote end of a Xen-attached event channel.*/
 void notify_via_xen_event_channel(int lport);
diff -r 959db3c01837 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/xen/sched.h Fri Jul 04 14:48:37 2008 +0200
@@ -112,10 +112,21 @@ struct vcpu
     bool_t           is_initialised;
     /* Currently running on a CPU? */
     bool_t           is_running;
+    /* MCE callback pending for this VCPU? */
+    bool_t           mce_pending;
     /* NMI callback pending for this VCPU? */
     bool_t           nmi_pending;
-    /* Avoid NMI reentry by allowing NMIs to be masked for short periods. */
-    bool_t           nmi_masked;
+
+    /* Higher-priority traps may interrupt lower-priority traps;
+     * lower-priority traps wait until higher-priority traps have finished.
+     * Note: This concept is known as "system priority level" (spl)
+     * in the UNIX world. */
+    uint16_t         old_trap_priority;
+    uint16_t         trap_priority;
+#define VCPU_TRAP_NONE 0
+#define VCPU_TRAP_NMI  1
+#define VCPU_TRAP_MCE  2
+
     /* Require shutdown to be deferred for some asynchronous operation? */
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
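The fetch/notify pair is meant to be used back to back: Dom0 fetches, decides
which DomU is impacted, then asks Xen to notify it (usecase 4). A hypothetical
continuation of the fetch sketch above, again assuming a HYPERVISOR_mca()
wrapper:

    /* Illustrative Dom0 notify call, not part of the patch. */
    static int notify_domu_sketch(uint16_t domid, uint16_t vcpuid,
                                  uint32_t fetch_idx)
    {
        struct xen_mc mc;

        memset(&mc, 0, sizeof(mc));
        mc.cmd = XEN_MC_notifydomain;
        mc.interface_version = XEN_MCA_INTERFACE_VERSION;
        mc.u.mc_notifydomain.mc_domid = domid;
        mc.u.mc_notifydomain.mc_vcpuid = vcpuid;
        mc.u.mc_notifydomain.fetch_idx = fetch_idx;   /* echoed from fetch */
        mc.u.mc_notifydomain.flags = XEN_MC_TRAP;

        if (HYPERVISOR_mca(&mc))                      /* assumed wrapper */
            return -1;
        return (mc.u.mc_notifydomain.flags & XEN_MC_NOTDELIVERED) ? -1 : 0;
    }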