diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/Makefile
--- a/xen/arch/x86/cpu/mcheck/Makefile	Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/arch/x86/cpu/mcheck/Makefile	Fri Jul 04 14:48:37 2008 +0200
@@ -1,4 +1,7 @@
+obj-y += amd_nonfatal.o
 obj-y += k7.o
+obj-y += amd_k8.o
+obj-y += amd_f10.o
 obj-y += mce.o
 obj-y += non-fatal.o
 obj-y += p4.o
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/amd_f10.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_f10.c	Fri Jul 04 14:48:37 2008 +0200
@@ -0,0 +1,131 @@
+/*
+ * MCA implementation for AMD Family10 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ */
+
+/* Family10 MCA documentation published at
+ *
+ * BIOS and Kernel Developer's Guide
+ * For AMD Family 10h Processors
+ * Publication # 31116 Revision: 1.08
+ * Issue Date: June 10, 2007
+ */
+
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/config.h>
+#include <xen/smp.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+
+static int amd_f10_handler(struct mc_info *mi, uint16_t bank, uint64_t status)
+{
+    struct mcinfo_extended mc_ext;
+
+    /* Family 0x10 introduced additional MSRs that belong to the
+     * northbridge bank (4). */
+    if (bank != 4)
+        return 0;
+
+    if (!(status & MCi_STATUS_VAL))
+        return 0;
+
+    if (!(status & MCi_STATUS_MISCV))
+        return 0;
+
+    memset(&mc_ext, 0, sizeof(mc_ext));
+    mc_ext.common.type = MC_TYPE_EXTENDED;
+    mc_ext.common.size = sizeof(mc_ext);
+    mc_ext.mc_msrs = 3;
+
+    mc_ext.mc_msr[0].reg = MSR_F10_MC4_MISC1;
+    mc_ext.mc_msr[1].reg = MSR_F10_MC4_MISC2;
+    mc_ext.mc_msr[2].reg = MSR_F10_MC4_MISC3;
+
+    rdmsrl(MSR_F10_MC4_MISC1, mc_ext.mc_msr[0].value);
+    rdmsrl(MSR_F10_MC4_MISC2, mc_ext.mc_msr[1].value);
+    rdmsrl(MSR_F10_MC4_MISC3, mc_ext.mc_msr[2].value);
+
+    x86_mcinfo_add(mi, &mc_ext);
+    return 1;
+}
+
+
+extern void k8_machine_check(struct cpu_user_regs *regs, long error_code);
+
+/* AMD Family10 machine check */
+void amd_f10_mcheck_init(struct cpuinfo_x86 *c)
+{
+    uint64_t value;
+    uint32_t i;
+    int cpu_nr;
+
+    machine_check_vector = k8_machine_check;
+    mc_callback_bank_extended = amd_f10_handler;
+    cpu_nr = smp_processor_id();
+    wmb();
+
+    rdmsrl(MSR_IA32_MCG_CAP, value);
+    if (value & MCG_CTL_P)	/* Control register present ? */
+        wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
+    nr_mce_banks = value & MCG_CAP_COUNT;
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        switch (i) {
+        case 4: /* Northbridge */
+            /* Enable error reporting of all errors,
+             * enable error checking and
+             * disable sync flooding */
+            wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+            wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
+
+            /* XXX: We should write the value 0x1087821UL into
+             * register F3x180 here, which sits in
+             * the PCI extended configuration space.
+             * Since this is not possible here, we can only hope
+             * that Dom0 is doing that.
+             */
+            break;
+
+        default:
+            /* Enable error reporting of all errors */
+            wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
+            wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+            break;
+        }
+    }
+
+    set_in_cr4(X86_CR4_MCE);
+    printk("CPU%i: AMD Family10h machine check reporting enabled.\n", cpu_nr);
+}
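The MSR_IA32_MC0_xxx + 4 * i indexing used above relies on the architectural MCA register layout: every bank owns four consecutive MSRs starting at 0x400 (CTL, STATUS, ADDR, MISC). The following standalone user-space sketch, not part of the patch, simply prints that layout; the MSR numbers are the architectural values:

    #include <stdio.h>

    #define MSR_IA32_MC0_CTL    0x400 /* architectural base: bank 0 control */
    #define MSR_IA32_MC0_STATUS 0x401
    #define MSR_IA32_MC0_ADDR   0x402
    #define MSR_IA32_MC0_MISC   0x403

    int main(void)
    {
        unsigned int bank;

        /* Print the MSR numbers the handlers access for each bank. */
        for (bank = 0; bank < 5; bank++)
            printf("bank %u: CTL=%#x STATUS=%#x ADDR=%#x MISC=%#x\n",
                   bank,
                   MSR_IA32_MC0_CTL + 4 * bank,
                   MSR_IA32_MC0_STATUS + 4 * bank,
                   MSR_IA32_MC0_ADDR + 4 * bank,
                   MSR_IA32_MC0_MISC + 4 * bank);
        return 0;
    }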
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/amd_k8.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_k8.c	Fri Jul 04 14:48:37 2008 +0200
@@ -0,0 +1,324 @@
+/*
+ * MCA implementation for AMD K8 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
+ */
+
+/* The related documentation for K8 Revisions A - E is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD Athlon 64 and AMD Opteron Processors
+ * Publication # 26094 Revision: 3.30
+ * Issue Date: February 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
+ */
+
+/* The related documentation for K8 Revisions F - G is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD NPT Family 0Fh Processors
+ * Publication # 32559 Revision: 3.04
+ * Issue Date: December 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
+ */
+
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/smp.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+#include <asm/mm.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+
+/* Machine Check Handler for AMD K8 family series */
+void k8_machine_check(struct cpu_user_regs *regs, long error_code)
+{
+    struct vcpu *vcpu = current;
+    struct domain *curdom;
+    struct mc_info *mc_data;
+    struct mcinfo_global mc_global;
+    struct mcinfo_bank mc_info;
+    uint64_t status, addrv, miscv, uc;
+    uint32_t i;
+    unsigned int cpu_nr;
+    uint32_t xen_impacted = 0;
+#define DOM_NORMAL	0
+#define DOM0_TRAP	1
+#define DOMU_TRAP	2
+#define DOMU_KILLED	4
+    uint32_t dom_state = DOM_NORMAL;
+
+    /* This handler runs as an interrupt gate. So IPIs from the
+     * polling service routine are deferred until we have finished.
+     */
+
+    /* Disable interrupts for the _vcpu_. Otherwise it may be
+     * re-scheduled to another physical CPU, or the impacted process
+     * in the guest continues running with corrupted data. */
+    vcpu_schedule_lock_irq(vcpu);
+
+    mc_data = x86_mcinfo_getptr();
+    cpu_nr = smp_processor_id();
+    curdom = vcpu->domain;
+
+    memset(&mc_global, 0, sizeof(mc_global));
+    mc_global.common.type = MC_TYPE_GLOBAL;
+    mc_global.common.size = sizeof(mc_global);
+
+    mc_global.mc_domid = curdom->domain_id; /* impacted domain */
+    mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+    BUG_ON(cpu_nr != vcpu->processor);
+    mc_global.mc_core_threadid = 0;
+    mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+    mc_global.mc_socketid = ???;
+#endif
+    mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
+    rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+    /* Quick check, who is impacted */
+    xen_impacted = is_idle_domain(curdom);
+
+    /* Dom0 */
+    x86_mcinfo_clear(mc_data);
+    x86_mcinfo_add(mc_data, &mc_global);
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        struct domain *d;
+
+        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);
+
+        if (!(status & MCi_STATUS_VAL))
+            continue;
+
+        /* An error happened in this bank.
+         * This is expected to be an uncorrectable error,
+         * since correctable errors get polled.
+         */
+        uc = status & MCi_STATUS_UC;
+
+        memset(&mc_info, 0, sizeof(mc_info));
+        mc_info.common.type = MC_TYPE_BANK;
+        mc_info.common.size = sizeof(mc_info);
+        mc_info.mc_bank = i;
+        mc_info.mc_status = status;
+
+        addrv = 0;
+        if (status & MCi_STATUS_ADDRV) {
+            rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);
+
+            d = maddr_get_owner(addrv);
+            if (d != NULL)
+                mc_info.mc_domid = d->domain_id;
+        }
+
+        miscv = 0;
+        if (status & MCi_STATUS_MISCV)
+            rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);
+
+        mc_info.mc_addr = addrv;
+        mc_info.mc_misc = miscv;
+
+        x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */
+
+        if (mc_callback_bank_extended)
+            mc_callback_bank_extended(mc_data, i, status);
+
+        /* clear status */
+        wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+        wmb();
+        add_taint(TAINT_MACHINE_CHECK);
+    }
+
+    status = mc_global.mc_gstatus;
+
+    /* Clear MCIP or the cpu enters shutdown state
+     * in case another MCE occurs. */
+    status &= ~MCG_STATUS_MCIP;
+    wrmsrl(MSR_IA32_MCG_STATUS, status);
+    wmb();
+
+    /* For the details see the discussion "MCE/MCA concept" on xen-devel.
+     * The thread started here:
+     * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
+     */
+
+    /* MCG_STATUS_RIPV:
+     * When this bit is not set, then the instruction pointer pushed onto
+     * the stack to resume at is not valid. If Xen is interrupted, then we
+     * panic anyway right below. Otherwise it is up to the guest to figure
+     * out whether guest kernel or guest userland is affected and should
+     * kill either itself or the affected process.
+     */
+
+    /* MCG_STATUS_EIPV:
+     * Evaluation of EIPV is the job of the guest.
+     */
+
+    if (xen_impacted) {
+        /* Now we are going to panic anyway. Allow interrupts, so that
+         * printk on serial console can work. */
+        vcpu_schedule_unlock_irq(vcpu);
+
+        /* That means a machine check exception
+         * occurred inside Xen. */
+        printk("Machine check exception occurred in Xen.\n");
+
+        /* If MCG_STATUS_EIPV indicates that the IP on the stack is related
+         * to the error, then it makes sense to print a stack trace.
+         * That can be useful for more detailed error analysis and/or
+         * error case studies to figure out whether we can clear
+         * xen_impacted and kill a DomU instead
+         * (i.e. if a guest-only control structure is affected, but then
+         * we must ensure the bad pages are not re-used again).
+         */
+        if (status & MCG_STATUS_EIPV) {
+            printk("MCE: Instruction Pointer is related to the error. "
+                   "Therefore, print the execution state.\n");
+            show_execution_state(regs);
+        }
+        x86_mcinfo_dump(mc_data);
+        panic("End of MCE. Use mcelog to decode above error codes.\n");
+    }
+
+    /* If Dom0 registered a machine check handler, which is only possible
+     * with a PV MCA driver, then ... */
+    if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
+        dom_state = DOM0_TRAP;
+
+        /* ... deliver machine check trap to Dom0. */
+        send_guest_trap(dom0, 0, TRAP_machine_check);
+
+        /* Xen may tell Dom0 now to notify the DomU.
+         * But this will happen through a hypercall. */
+    } else
+    /* Dom0 did not register a machine check handler, but if DomU
+     * did so, then... */
+    if ( guest_has_trap_callback(curdom, vcpu->vcpu_id, TRAP_machine_check) ) {
+        dom_state = DOMU_TRAP;
+
+        /* ... deliver machine check trap to DomU */
+        send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
+    } else {
+        /* Hmm... no one feels responsible for handling the error.
+         * So do a quick check whether a DomU is impacted or not.
+         */
+        if (curdom == dom0) {
+            /* Dom0 is impacted. Since no one can handle
+             * this error, panic! */
+            x86_mcinfo_dump(mc_data);
+            panic("MCE occurred in Dom0, which cannot handle it\n");
+
+            /* UNREACHED */
+        } else {
+            dom_state = DOMU_KILLED;
+
+            /* Enable interrupts. This basically results in
+             * calling sti on the *physical* cpu. But after
+             * domain_crash() the vcpu pointer is invalid.
+             * Therefore, we must unlock the irqs before killing
+             * it. */
+            vcpu_schedule_unlock_irq(vcpu);
+
+            /* DomU is impacted. Kill it and continue. */
+            domain_crash(curdom);
+        }
+    }
+
+
+    switch (dom_state) {
+    case DOM0_TRAP:
+    case DOMU_TRAP:
+        /* Enable interrupts. */
+        vcpu_schedule_unlock_irq(vcpu);
+
+        /* guest softirqs and event callbacks are scheduled
+         * immediately after this handler exits. */
+        break;
+    case DOMU_KILLED:
+        /* Nothing to do here. */
+        break;
+    default:
+        BUG();
+    }
+}
+
+
+/* AMD K8 machine check */
+void amd_k8_mcheck_init(struct cpuinfo_x86 *c)
+{
+    uint64_t value;
+    uint32_t i;
+    int cpu_nr;
+
+    machine_check_vector = k8_machine_check;
+    cpu_nr = smp_processor_id();
+    wmb();
+
+    rdmsrl(MSR_IA32_MCG_CAP, value);
+    if (value & MCG_CTL_P)	/* Control register present ? */
+        wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
+    nr_mce_banks = value & MCG_CAP_COUNT;
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        switch (i) {
+        case 4: /* Northbridge */
+            /* Enable error reporting of all errors,
+             * enable error checking and
+             * disable sync flooding */
+            wrmsrl(MSR_IA32_MC4_CTL, 0x02c3c008ffffffffULL);
+            wrmsrl(MSR_IA32_MC4_STATUS, 0x0ULL);
+            break;
+
+        default:
+            /* Enable error reporting of all errors */
+            wrmsrl(MSR_IA32_MC0_CTL + 4 * i, 0xffffffffffffffffULL);
+            wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
+            break;
+        }
+    }
+
+    set_in_cr4(X86_CR4_MCE);
+    printk("CPU%i: AMD K8 machine check reporting enabled.\n", cpu_nr);
+}
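k8_machine_check() bases its panic/trap/kill decision partly on MSR_IA32_MCG_STATUS. A standalone sketch, not part of the patch, that decodes the three architectural flag bits with the same mask values x86_mca.h defines further below; the sample value is made up for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define MCG_STATUS_RIPV 0x0000000000000001ULL /* restart IP valid */
    #define MCG_STATUS_EIPV 0x0000000000000002ULL /* error IP valid */
    #define MCG_STATUS_MCIP 0x0000000000000004ULL /* MCE in progress */

    static void decode_gstatus(uint64_t gstatus)
    {
        printf("restart IP valid: %s\n", (gstatus & MCG_STATUS_RIPV) ? "yes" : "no");
        printf("error IP valid:   %s\n", (gstatus & MCG_STATUS_EIPV) ? "yes" : "no");
        printf("MCE in progress:  %s\n", (gstatus & MCG_STATUS_MCIP) ? "yes" : "no");
    }

    int main(void)
    {
        /* RIPV | MCIP: execution can resume, exception in progress */
        decode_gstatus(0x5);
        return 0;
    }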
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/amd_nonfatal.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/amd_nonfatal.c	Fri Jul 04 14:48:37 2008 +0200
@@ -0,0 +1,303 @@
+/*
+ * MCA implementation for AMD CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+/* K8 common MCA documentation published at
+ *
+ * AMD64 Architecture Programmer's Manual Volume 2:
+ * System Programming
+ * Publication # 24593 Revision: 3.12
+ * Issue Date: September 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/24593.pdf
+ */
+
+/* The related documentation for K8 Revisions A - E is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD Athlon 64 and AMD Opteron Processors
+ * Publication # 26094 Revision: 3.30
+ * Issue Date: February 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/26094.PDF
+ */
+
+/* The related documentation for K8 Revisions F - G is:
+ *
+ * BIOS and Kernel Developer's Guide for
+ * AMD NPT Family 0Fh Processors
+ * Publication # 32559 Revision: 3.04
+ * Issue Date: December 2006
+ *
+ * URL:
+ * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/32559.pdf
+ */
+
+#include <xen/init.h>
+#include <xen/types.h>
+#include <xen/kernel.h>
+#include <xen/smp.h>
+#include <xen/timer.h>
+#include <xen/sched.h>
+#include <xen/event.h>
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/msr.h>
+
+#include "mce.h"
+#include "x86_mca.h"
+
+static struct timer mce_timer;
+
+#define MCE_PERIOD MILLISECS(15000)
+#define MCE_MIN    MILLISECS(2000)
+#define MCE_MAX    MILLISECS(30000)
+
+static s_time_t period = MCE_PERIOD;
+static int hw_threshold = 0;
+static int adjust = 0;
+
+/* The polling service routine:
+ * Collects information about correctable errors and notifies
+ * Dom0 via an event.
+ */
+void mce_amd_checkregs(void *info)
+{
+    struct vcpu *vcpu = current;
+    struct mc_info *mc_data;
+    struct mcinfo_global mc_global;
+    struct mcinfo_bank mc_info;
+    uint64_t status, addrv, miscv;
+    unsigned int i;
+    unsigned int event_enabled;
+    unsigned int cpu_nr;
+    int error_found;
+
+    /* We don't need a slot yet. Only allocate one on error. */
+    mc_data = NULL;
+
+    cpu_nr = smp_processor_id();
+    event_enabled = guest_enabled_event(dom0->vcpu[0], VIRQ_MCA);
+    error_found = 0;
+
+    memset(&mc_global, 0, sizeof(mc_global));
+    mc_global.common.type = MC_TYPE_GLOBAL;
+    mc_global.common.size = sizeof(mc_global);
+
+    mc_global.mc_domid = vcpu->domain->domain_id; /* impacted domain */
+    mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
+    BUG_ON(cpu_nr != vcpu->processor);
+    mc_global.mc_core_threadid = 0;
+    mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
+#if 0 /* TODO: on which socket is this physical core?
+         It's not clear to me how to figure this out. */
+    mc_global.mc_socketid = ???;
+#endif
+    mc_global.mc_flags |= MC_FLAG_CORRECTABLE;
+    rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);
+
+    for (i = 0; i < nr_mce_banks; i++) {
+        struct domain *d;
+
+        rdmsrl(MSR_IA32_MC0_STATUS + i * 4, status);
+
+        if (!(status & MCi_STATUS_VAL))
+            continue;
+
+        if (mc_data == NULL) {
+            /* Now we need a slot to fill in error telemetry. */
+            mc_data = x86_mcinfo_getptr();
+            BUG_ON(mc_data == NULL);
+            x86_mcinfo_clear(mc_data);
+            x86_mcinfo_add(mc_data, &mc_global);
+        }
+
+        memset(&mc_info, 0, sizeof(mc_info));
+        mc_info.common.type = MC_TYPE_BANK;
+        mc_info.common.size = sizeof(mc_info);
+        mc_info.mc_bank = i;
+        mc_info.mc_status = status;
+
+        /* Increase polling frequency */
+        error_found = 1;
+
+        addrv = 0;
+        if (status & MCi_STATUS_ADDRV) {
+            rdmsrl(MSR_IA32_MC0_ADDR + i * 4, addrv);
+
+            d = maddr_get_owner(addrv);
+            if (d != NULL)
+                mc_info.mc_domid = d->domain_id;
+        }
+
+        miscv = 0;
+        if (status & MCi_STATUS_MISCV)
+            rdmsrl(MSR_IA32_MC0_MISC + i * 4, miscv);
+
+        mc_info.mc_addr = addrv;
+        mc_info.mc_misc = miscv;
+        x86_mcinfo_add(mc_data, &mc_info);
+
+        if (mc_callback_bank_extended)
+            mc_callback_bank_extended(mc_data, i, status);
+
+        /* clear status */
+        wrmsrl(MSR_IA32_MC0_STATUS + i * 4, 0x0ULL);
+        wmb();
+    }
+
+    if (error_found > 0) {
+        /* If Dom0 enabled the VIRQ_MCA event, then ... */
+        if (event_enabled)
+            /* ... notify it. */
+            send_guest_global_virq(dom0, VIRQ_MCA);
+        else
+            /* ... or dump it */
+            x86_mcinfo_dump(mc_data);
+    }
+
+    adjust += error_found;
+}
+
+/* Polling service routine invoker:
+ * Adjusts the poll frequency at runtime. No errors mean a slower polling
+ * frequency, an error means a higher polling frequency.
+ * It uses the hw threshold register introduced in AMD K8 RevF to detect
+ * multiple correctable errors between two polls. In that case,
+ * the polling frequency is increased beyond normal.
+ */
+static void mce_amd_work_fn(void *data)
+{
+    on_each_cpu(mce_amd_checkregs, data, 1, 1);
+
+    if (adjust > 0) {
+        if ( !guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) ) {
+            /* Dom0 did not enable VIRQ_MCA, so Xen is reporting. */
+            printk("MCE: polling routine found correctable error. "
+                   "Use mcelog to parse above error output.\n");
+        }
+    }
+
+    if (hw_threshold) {
+        uint64_t value;
+        uint32_t counter;
+
+        rdmsrl(MSR_IA32_MC4_MISC, value);
+        /* Only the error counter field is of interest.
+         * The bit field is described in the AMD K8 BKDG chapter 6.4.5.5.
+         */
+        counter = (value & 0xFFF00000000ULL) >> 32U;
+
+        /* HW does not count *all* kinds of correctable errors.
+         * Thus it is possible that the polling routine finds a
+         * correctable error even if the HW reports nothing.
+         * However, the other way around is not possible (= BUG).
+         */
+        if (counter > 0) {
+            /* HW reported correctable errors,
+             * so the polling routine must have found them, too.
+             */
+            BUG_ON(adjust == 0);
+            /* Subtract 1 to avoid double-counting the error
+             * found by the polling service routine */
+            adjust += (counter - 1);
+
+            /* Restart counter */
+            /* No interrupt, reset counter value */
+            value &= ~(0x60FFF00000000ULL);
+            /* Counter enable */
+            value |= (1ULL << 51);
+            wrmsrl(MSR_IA32_MC4_MISC, value);
+            wmb();
+        }
+    }
+
+    if (adjust > 0) {
+        /* Increase polling frequency */
+        adjust++; /* adjust == 1 must have an effect */
+        period /= adjust;
+    } else {
+        /* Decrease polling frequency */
+        period *= 2;
+    }
+    if (period > MCE_MAX) {
+        /* limit: Poll at least every 30s */
+        period = MCE_MAX;
+    }
+    if (period < MCE_MIN) {
+        /* limit: Poll every 2s.
+         * When this limit is reached, an uncorrectable error
+         * is expected to happen if Dom0 does nothing.
+         */
+        period = MCE_MIN;
+    }
+
+    set_timer(&mce_timer, NOW() + period);
+    adjust = 0;
+}
+
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c)
+{
+    if (c->x86_vendor != X86_VENDOR_AMD)
+        return;
+
+    /* Assume we are on K8 or newer AMD CPU here */
+
+    /* The threshold bitfields in MSR_IA32_MC4_MISC have
+     * been introduced along with the SVME feature bit. */
+    if (cpu_has(c, X86_FEATURE_SVME)) {
+        uint64_t value;
+
+        /* hw threshold registers present */
+        hw_threshold = 1;
+        rdmsrl(MSR_IA32_MC4_MISC, value);
+
+        if (value & (1ULL << 61)) { /* Locked bit */
+            /* Locked by BIOS. Not available for use */
+            hw_threshold = 0;
+        }
+        if (!(value & (1ULL << 63))) { /* Valid bit */
+            /* No CtrP present */
+            hw_threshold = 0;
+        } else {
+            if (!(value & (1ULL << 62))) { /* Counter Bit */
+                /* No counter field present */
+                hw_threshold = 0;
+            }
+        }
+
+        if (hw_threshold) {
+            /* No interrupt, reset counter value */
+            value &= ~(0x60FFF00000000ULL);
+            /* Counter enable */
+            value |= (1ULL << 51);
+            wrmsrl(MSR_IA32_MC4_MISC, value);
+            /* serialize */
+            wmb();
+            printk(XENLOG_INFO "MCA: Using hw thresholding to adjust polling frequency\n");
+        }
+    }
+
+    init_timer(&mce_timer, mce_amd_work_fn, NULL, 0);
+    set_timer(&mce_timer, NOW() + period);
+
+    return;
+}
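The period adaptation in mce_amd_work_fn() can be hard to follow from the timer code alone. A standalone user-space sketch, not part of the patch, that replays the same arithmetic (divide by adjust+1 when errors were seen, double when idle, clamp to [MCE_MIN, MCE_MAX]); the MILLISECS() macro here mirrors Xen's nanosecond-based definition:

    #include <stdio.h>
    #include <stdint.h>

    #define MILLISECS(ms) ((int64_t)(ms) * 1000000) /* ns, as in Xen */
    #define MCE_PERIOD MILLISECS(15000)
    #define MCE_MIN    MILLISECS(2000)
    #define MCE_MAX    MILLISECS(30000)

    static int64_t next_period(int64_t period, int adjust)
    {
        if (adjust > 0)
            period /= (adjust + 1); /* errors seen: poll faster */
        else
            period *= 2;            /* quiet: poll slower */
        if (period > MCE_MAX) period = MCE_MAX; /* at least every 30s */
        if (period < MCE_MIN) period = MCE_MIN; /* at most every 2s */
        return period;
    }

    int main(void)
    {
        int64_t p = MCE_PERIOD;
        int i, errors[5] = { 0, 0, 3, 1, 0 };

        for (i = 0; i < 5; i++) {
            p = next_period(p, errors[i]);
            printf("poll %d: %d error(s) -> next poll in %lldms\n",
                   i, errors[i], (long long)(p / 1000000));
        }
        return 0;
    }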
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/k7.c
--- a/xen/arch/x86/cpu/mcheck/k7.c	Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/arch/x86/cpu/mcheck/k7.c	Fri Jul 04 14:48:37 2008 +0200
@@ -66,8 +66,8 @@ static fastcall void k7_machine_check(st
 }
 
-/* AMD K7 machine check is Intel like */
-void amd_mcheck_init(struct cpuinfo_x86 *c)
+/* AMD K7 machine check */
+void amd_k7_mcheck_init(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int i;
@@ -75,7 +75,6 @@ void amd_mcheck_init(struct cpuinfo_x86 
 	machine_check_vector = k7_machine_check;
 	wmb();
 
-	printk (KERN_INFO "Intel machine check architecture supported.\n");
 	rdmsr (MSR_IA32_MCG_CAP, l, h);
 	if (l & (1<<8))	/* Control register present ? */
 		wrmsr (MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
@@ -90,6 +89,6 @@ void amd_mcheck_init(struct cpuinfo_x86 
 	}
 
 	set_in_cr4 (X86_CR4_MCE);
-	printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+	printk (KERN_INFO "CPU%d: AMD K7 machine check reporting enabled.\n",
 		smp_processor_id());
 }
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/mce.c
--- a/xen/arch/x86/cpu/mcheck/mce.c	Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/arch/x86/cpu/mcheck/mce.c	Fri Jul 04 14:48:37 2008 +0200
@@ -8,73 +8,151 @@
 #include <xen/init.h>
 #include <xen/types.h>
 #include <xen/kernel.h>
+#include <xen/sched.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 
 #include "mce.h"
+#include "x86_mca.h"
 
 int mce_disabled = 0;
-int nr_mce_banks;
+unsigned int nr_mce_banks;
 
 EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
 
+/* XXX For now a fixed array is used. Later this should be changed
+ * to a dynamically allocated array with the size calculated in relation
+ * to the physical cpus present in the machine.
+ * The more physical cpus are available, the more entries you need.
+ */
+#define MAX_MCINFO	10
+
+struct mc_machine_notify {
+	struct mc_info mc;
+	uint32_t fetch_idx;
+	uint32_t valid;
+};
+
+struct mc_machine {
+
+	/* Array structure used for collecting machine check error telemetry. */
+	struct mc_info mc[MAX_MCINFO];
+
+	/* We handle multiple machine check reports lockless by
+	 * iterating through the array using the producer/consumer concept.
+	 */
+	/* Producer array index to fill with machine check error data.
+	 * Index must be increased atomically. */
+	uint32_t error_idx;
+
+	/* Consumer array index to fetch machine check error data from.
+	 * Index must be increased atomically. */
+	uint32_t fetch_idx;
+
+	/* Integer array holding the indices of the mc array that allows
+	 * a Dom0 to notify a DomU to re-fetch the same machine check error
+	 * data. The notification and refetch also uses its own
+	 * producer/consumer mechanism, because Dom0 may decide to not report
+	 * every error to the impacted DomU.
+	 */
+	struct mc_machine_notify notify[MAX_MCINFO];
+
+	/* Array index to get fetch_idx from.
+	 * Index must be increased atomically. */
+	uint32_t notifyproducer_idx;
+	uint32_t notifyconsumer_idx;
+};
+
+/* Global variable with machine check information. */
+struct mc_machine mc_data;
+
 /* Handle unconfigured int18 (should never happen) */
-static fastcall void unexpected_machine_check(struct cpu_user_regs * regs, long error_code)
+static void unexpected_machine_check(struct cpu_user_regs *regs, long error_code)
 {
-	printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
+	printk(XENLOG_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+		smp_processor_id());
 }
 
+
 /* Call the installed machine check handler for this CPU setup. */
-void fastcall (*machine_check_vector)(struct cpu_user_regs *, long error_code) = unexpected_machine_check;
+void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code) = unexpected_machine_check;
+
+/* Init machine check callback handler
+ * It is used to collect additional information provided by newer
+ * CPU families/models without the need to duplicate the whole handler.
+ * This avoids having many handlers doing nearly the same thing, each
+ * with its own tweaks and bugs. */
+int (*mc_callback_bank_extended)(struct mc_info *, uint16_t, uint64_t) = NULL;
+
+
+static void amd_mcheck_init(struct cpuinfo_x86 *ci)
+{
+
+	switch (ci->x86) {
+	case 6:
+		amd_k7_mcheck_init(ci);
+		break;
+
+	case 0xf:
+		amd_k8_mcheck_init(ci);
+		break;
+
+	case 0x10:
+		amd_f10_mcheck_init(ci);
+		break;
+
+	default:
+		/* Assume that machine check support is available.
+		 * The minimum provided support is at least the K8. */
+		amd_k8_mcheck_init(ci);
+	}
+}
 
 /* This has to be run for each processor */
 void mcheck_init(struct cpuinfo_x86 *c)
 {
-	if (mce_disabled==1)
+	if (mce_disabled == 1) {
+		printk(XENLOG_INFO "MCE support disabled by bootparam\n");
 		return;
+	}
+
+	if (!cpu_has(c, X86_FEATURE_MCE)) {
+		printk(XENLOG_INFO "CPU%i: No machine check support available\n",
+			smp_processor_id());
+		return;
+	}
+
+	memset(&mc_data, 0, sizeof(struct mc_machine));
 
 	switch (c->x86_vendor) {
-		case X86_VENDOR_AMD:
-			amd_mcheck_init(c);
-			break;
+	case X86_VENDOR_AMD:
+		amd_mcheck_init(c);
+		break;
 
-		case X86_VENDOR_INTEL:
+	case X86_VENDOR_INTEL:
 #ifndef CONFIG_X86_64
-			if (c->x86==5)
-				intel_p5_mcheck_init(c);
-			if (c->x86==6)
-				intel_p6_mcheck_init(c);
+		if (c->x86==5)
+			intel_p5_mcheck_init(c);
+		if (c->x86==6)
+			intel_p6_mcheck_init(c);
 #endif
-			if (c->x86==15)
-				intel_p4_mcheck_init(c);
-			break;
+		if (c->x86==15)
+			intel_p4_mcheck_init(c);
+		break;
 
 #ifndef CONFIG_X86_64
-		case X86_VENDOR_CENTAUR:
-			if (c->x86==5)
-				winchip_mcheck_init(c);
-			break;
+	case X86_VENDOR_CENTAUR:
+		if (c->x86==5)
+			winchip_mcheck_init(c);
+		break;
 #endif
 
-		default:
-			break;
+	default:
+		break;
 	}
 }
 
-static unsigned long old_cr4 __initdata;
-
-void __init stop_mce(void)
-{
-	old_cr4 = read_cr4();
-	clear_in_cr4(X86_CR4_MCE);
-}
-
-void __init restart_mce(void)
-{
-	if (old_cr4 & X86_CR4_MCE)
-		set_in_cr4(X86_CR4_MCE);
-}
 
 static void __init mcheck_disable(char *str)
 {
@@ -88,3 +166,411 @@ static void __init mcheck_enable(char *s
 
 custom_param("nomce", mcheck_disable);
 custom_param("mce", mcheck_enable);
+
+
+#include <xen/guest_access.h>
+#include <xen/spinlock.h>
+
+struct mc_info *x86_mcinfo_getptr(void)
+{
+	struct mc_info *mi;
+	uint32_t entry, next;
+
+	for (;;) {
+		entry = mc_data.error_idx;
+		smp_rmb();
+		next = entry + 1;
+		if (cmpxchg(&mc_data.error_idx, entry, next) == entry)
+			break;
+	}
+
+	mi = &(mc_data.mc[(entry % MAX_MCINFO)]);
+	BUG_ON(mc_data.error_idx < mc_data.fetch_idx);
+
+	return mi;
+}
+
+static int x86_mcinfo_matches_guest(const struct mc_info *mi,
+			const struct domain *d, const struct vcpu *v)
+{
+	struct mcinfo_common *mic;
+	struct mcinfo_global *mig;
+
+	x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
+	mig = (struct mcinfo_global *)mic;
+	if (mig == NULL)
+		return 0;
+
+	if (d->domain_id != mig->mc_domid)
+		return 0;
+
+	if (v->vcpu_id != mig->mc_vcpuid)
+		return 0;
+
+	return 1;
+}
+
+
+#define x86_mcinfo_mcdata(idx) (mc_data.mc[(idx % MAX_MCINFO)])
+
+static struct mc_info *x86_mcinfo_getfetchptr(uint32_t *fetch_idx,
+			const struct domain *d, const struct vcpu *v)
+{
+	struct mc_info *mi;
+
+	/* This function is called from the fetch hypercall with
+	 * the mc_lock spinlock held. Thus, no need for locking here.
+	 */
+	mi = &(x86_mcinfo_mcdata(mc_data.fetch_idx));
+	if ((d != dom0) && !x86_mcinfo_matches_guest(mi, d, v)) {
+		/* Bogus domU command detected. */
+		*fetch_idx = 0;
+		return NULL;
+	}
+
+	*fetch_idx = mc_data.fetch_idx;
+	mc_data.fetch_idx++;
+	BUG_ON(mc_data.fetch_idx > mc_data.error_idx);
+
+	return mi;
+}
+
+
+static void x86_mcinfo_marknotified(struct xen_mc_notifydomain *mc_notifydomain)
+{
+	struct mc_machine_notify *mn;
+	struct mcinfo_common *mic = NULL;
+	struct mcinfo_global *mig;
+	struct domain *d;
+	int i;
+
+	/* This function is called from the notifier hypercall with
+	 * the mc_notify_lock spinlock held. Thus, no need for locking here.
+	 */
+
+	/* First invalidate entries for guests that disappeared after
+	 * notification (e.g. shutdown/crash). This step prevents the
+	 * notification array from filling up with stale/leaked entries.
+	 */
+	for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
+		mn = &(mc_data.notify[(i % MAX_MCINFO)]);
+		x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
+		BUG_ON(mic == NULL);
+		mig = (struct mcinfo_global *)mic;
+		d = get_domain_by_id(mig->mc_domid);
+		if (d == NULL) {
+			/* Domain does not exist. */
+			mn->valid = 0;
+		}
+		if ((!mn->valid) && (i == mc_data.notifyconsumer_idx))
+			mc_data.notifyconsumer_idx++;
+	}
+
+	/* Now put in the error telemetry. Since all error data fetchable
+	 * by domUs are uncorrectable errors, they are very important.
+	 * So we dump them before overwriting them. When a guest takes that
+	 * long, we can assume something bad already happened (crash, hang,
+	 * etc.)
+	 */
+	mn = &(mc_data.notify[(mc_data.notifyproducer_idx % MAX_MCINFO)]);
+
+	if (mn->valid) {
+		struct mcinfo_common *mic = NULL;
+		struct mcinfo_global *mig;
+
+		/* In order not to lose the information, we dump it. */
+		x86_mcinfo_lookup(mic, &mn->mc, MC_TYPE_GLOBAL);
+		BUG_ON(mic == NULL);
+		mig = (struct mcinfo_global *)mic;
+		printk(XENLOG_WARNING "Domain ID %u was notified by Dom0 to "
+			"fetch machine check error telemetry. But it "
+			"did not do that in time.\n",
+			mig->mc_domid);
+		x86_mcinfo_dump(&mn->mc);
+	}
+
+	memcpy(&mn->mc, &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx)),
+		sizeof(struct mc_info));
+	mn->fetch_idx = mc_notifydomain->fetch_idx;
+	mn->valid = 1;
+
+	mc_data.notifyproducer_idx++;
+
+	/* By design there can never be more notifies than machine check
+	 * errors. If that ever happens, then we hit a bug. */
+	BUG_ON(mc_data.notifyproducer_idx > mc_data.fetch_idx);
+	BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
+}
+
+static struct mc_info *x86_mcinfo_getnotifiedptr(uint32_t *fetch_idx,
+			const struct domain *d, const struct vcpu *v)
+{
+	struct mc_machine_notify *mn = NULL;
+	uint32_t i;
+	int found;
+
+	/* This function is called from the fetch hypercall with
+	 * the mc_notify_lock spinlock held. Thus, no need for locking here.
+	 */
+
+	/* The notifier data is filled in the order guests get notified, but
+	 * guests may fetch them in a different order. That's why we need
+	 * the game with valid/invalid entries. */
+	found = 0;
+	for (i = mc_data.notifyconsumer_idx; i < mc_data.notifyproducer_idx; i++) {
+		mn = &(mc_data.notify[(i % MAX_MCINFO)]);
+		if (!mn->valid) {
+			if (i == mc_data.notifyconsumer_idx)
+				mc_data.notifyconsumer_idx++;
+			continue;
+		}
+		if (x86_mcinfo_matches_guest(&mn->mc, d, v)) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		/* This domain has never been notified. This must be
+		 * a bogus domU command. */
+		*fetch_idx = 0;
+		return NULL;
+	}
+
+	BUG_ON(mn == NULL);
+	*fetch_idx = mn->fetch_idx;
+	mn->valid = 0;
+
+	BUG_ON(mc_data.notifyconsumer_idx > mc_data.notifyproducer_idx);
+	return &mn->mc;
+}
+
+
+void x86_mcinfo_clear(struct mc_info *mi)
+{
+	memset(mi, 0, sizeof(struct mc_info));
+	x86_mcinfo_nentries(mi) = 0;
+}
+
+
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo)
+{
+	int i;
+	unsigned long end1, end2;
+	struct mcinfo_common *mic, *mic_base, *mic_index;
+
+	mic = (struct mcinfo_common *)mcinfo;
+	mic_index = mic_base = x86_mcinfo_first(mi);
+
+	/* go to first free entry */
+	for (i = 0; i < x86_mcinfo_nentries(mi); i++) {
+		mic_index = x86_mcinfo_next(mic_index);
+	}
+
+	/* check if there is enough space */
+	end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
+	end2 = (unsigned long)((uint8_t *)mic_index + mic->size);
+
+	if (end1 < end2)
+		return -ENOSPC; /* No space. Can't add entry. */
+
+	/* there's enough space. add entry. */
+	memcpy(mic_index, mic, mic->size);
+	x86_mcinfo_nentries(mi)++;
+
+	return 0;
+}
+
+
+/* Dump machine check information in a format
+ * mcelog can parse. This is used only when
+ * Dom0 does not take the notification. */
+void x86_mcinfo_dump(struct mc_info *mi)
+{
+	struct mcinfo_common *mic = NULL;
+	struct mcinfo_global *mc_global;
+	struct mcinfo_bank *mc_bank;
+
+	/* first print the global info */
+	x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
+	if (mic == NULL)
+		return;
+	mc_global = (struct mcinfo_global *)mic;
+	if (mc_global->mc_flags & MC_FLAG_UNCORRECTABLE) {
+		printk(XENLOG_WARNING
+			"CPU%d: Machine Check Exception: %16"PRIx64"\n",
+			mc_global->mc_coreid, mc_global->mc_gstatus);
+	} else {
+		printk(XENLOG_WARNING "MCE: The hardware reports a non-"
+			"fatal, correctable incident occurred on "
+			"CPU %d.\n",
+			mc_global->mc_coreid);
+	}
+
+	/* then the bank information */
+	x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
+	do {
+		if (mic == NULL)
+			return;
+		if (mic->type != MC_TYPE_BANK)
+			continue;
+
+		mc_bank = (struct mcinfo_bank *)mic;
+
+		printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
+			mc_bank->mc_bank,
+			mc_bank->mc_status);
+		if (mc_bank->mc_status & MCi_STATUS_MISCV)
+			printk("[%16"PRIx64"]", mc_bank->mc_misc);
+		if (mc_bank->mc_status & MCi_STATUS_ADDRV)
+			printk(" at %16"PRIx64, mc_bank->mc_addr);
+
+		printk("\n");
+		mic = x86_mcinfo_next(mic); /* next entry */
+		if ((mic == NULL) || (mic->size == 0))
+			break;
+	} while (1);
+}
+
+
+
+/* Machine Check Architecture Hypercall */
+long do_mca(XEN_GUEST_HANDLE(xen_mc_t) u_xen_mc)
+{
+	long ret = 0;
+	struct xen_mc curop, *op = &curop;
+	struct vcpu *v = current;
+	struct domain *domU;
+	struct xen_mc_fetch *mc_fetch;
+	struct xen_mc_notifydomain *mc_notifydomain;
+	struct mc_info *mi;
+	uint32_t flags;
+	uint32_t fetch_idx;
+	uint16_t vcpuid;
+	/* Use a different lock for the notify hypercall in order to allow
+	 * a DomU to fetch mc data while Dom0 notifies another DomU. */
+	static DEFINE_SPINLOCK(mc_lock);
+	static DEFINE_SPINLOCK(mc_notify_lock);
+
+	if ( copy_from_guest(op, u_xen_mc, 1) )
+		return -EFAULT;
+
+	if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
+		return -EACCES;
+
+	switch ( op->cmd ) {
+	case XEN_MC_fetch:
+		/* This hypercall is for any domain */
+		mc_fetch = &op->u.mc_fetch;
+
+		switch (mc_fetch->flags) {
+		case XEN_MC_CORRECTABLE:
+			/* But polling mode is Dom0 only, because
+			 * correctable errors are reported to Dom0 only */
+			if ( !IS_PRIV(v->domain) )
+				return -EPERM;
+			break;
+
+		case XEN_MC_TRAP:
+			break;
+		default:
+			return -EFAULT;
+		}
+
+		flags = XEN_MC_OK;
+		spin_lock(&mc_lock);
+
+		if ( IS_PRIV(v->domain) ) {
+			/* This must be Dom0. So a notify hypercall
+			 * can't have happened before. */
+			mi = x86_mcinfo_getfetchptr(&fetch_idx, dom0, v);
+		} else {
+			/* Hypercall comes from an unprivileged domain */
+			domU = v->domain;
+			if (guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
+				/* Dom0 must have notified this DomU before
+				 * via the notify hypercall. */
+				mi = x86_mcinfo_getnotifiedptr(&fetch_idx, domU, v);
+			} else {
+				/* Xen notified the DomU. */
+				mi = x86_mcinfo_getfetchptr(&fetch_idx, domU, v);
+			}
+		}
+
+		if (mi) {
+			memcpy(&mc_fetch->mc_info, mi,
+				sizeof(struct mc_info));
+		} else {
+			/* There is no data for a bogus DomU command. */
+			flags |= XEN_MC_NODATA;
+			memset(&mc_fetch->mc_info, 0, sizeof(struct mc_info));
+		}
+
+		mc_fetch->flags = flags;
+		mc_fetch->fetch_idx = fetch_idx;
+
+		if ( copy_to_guest(u_xen_mc, op, 1) )
+			ret = -EFAULT;
+
+		spin_unlock(&mc_lock);
+		break;
+
+	case XEN_MC_notifydomain:
+		/* This hypercall is for Dom0 only */
+		if ( !IS_PRIV(v->domain) )
+			return -EPERM;
+
+		spin_lock(&mc_notify_lock);
+
+		mc_notifydomain = &op->u.mc_notifydomain;
+		domU = get_domain_by_id(mc_notifydomain->mc_domid);
+		vcpuid = mc_notifydomain->mc_vcpuid;
+
+		if ((domU == NULL) || (domU == dom0)) {
+			/* It's not possible to notify a non-existent domain
+			 * or dom0 itself. */
+			spin_unlock(&mc_notify_lock);
+			return -EACCES;
+		}
+
+		if (vcpuid >= MAX_VIRT_CPUS) {
+			/* It's not possible to notify a vcpu that Xen
+			 * can never assign to a domain. */
+			spin_unlock(&mc_notify_lock);
+			return -EACCES;
+		}
+
+		mc_notifydomain->flags = XEN_MC_OK;
+
+		mi = &(x86_mcinfo_mcdata(mc_notifydomain->fetch_idx));
+		if (!x86_mcinfo_matches_guest(mi, domU, domU->vcpu[vcpuid])) {
+			/* The error telemetry is not for the guest
+			 * that Dom0 wants to notify. */
+			mc_notifydomain->flags |= XEN_MC_NOMATCH;
+		} else if ( guest_has_trap_callback(domU, vcpuid,
+						TRAP_machine_check) )
+		{
+			/* Send notification */
+			if ( send_guest_trap(domU, vcpuid, TRAP_machine_check) )
+				mc_notifydomain->flags |= XEN_MC_NOTDELIVERED;
+		} else
+			mc_notifydomain->flags |= XEN_MC_CANNOTHANDLE;
+
+#ifdef DEBUG
+		/* sanity check - these two flags are mutually exclusive */
+		if ((mc_notifydomain->flags & XEN_MC_CANNOTHANDLE) &&
+		    (mc_notifydomain->flags & XEN_MC_NOTDELIVERED))
+			BUG();
+#endif
+
+		if ( copy_to_guest(u_xen_mc, op, 1) )
+			ret = -EFAULT;
+
+		if (ret == 0) {
+			x86_mcinfo_marknotified(mc_notifydomain);
+		}
+
+		spin_unlock(&mc_notify_lock);
+		break;
+	}
+
+	return ret;
+}
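x86_mcinfo_add() packs variable-sized records back to back into a fixed buffer and walks them via the common size field. The following standalone user-space model of that scheme is not part of the patch; the struct names are local stand-ins for the mcinfo types:

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    struct rec_common { uint16_t type; uint16_t size; };
    struct rec_a { struct rec_common common; uint64_t payload; };

    static uint8_t buf[128];          /* stand-in for struct mc_info */
    static unsigned int nentries;

    static int buf_add(const void *rec)
    {
        const struct rec_common *c = rec;
        uint8_t *p = buf;
        unsigned int i;

        /* skip over existing records to the first free slot */
        for (i = 0; i < nentries; i++)
            p += ((struct rec_common *)p)->size;
        if (p + c->size > buf + sizeof(buf))
            return -1;                /* no space, can't add entry */
        memcpy(p, rec, c->size);
        nentries++;
        return 0;
    }

    int main(void)
    {
        struct rec_a a = { { 1, sizeof(a) }, 0xdeadbeef };

        buf_add(&a);
        buf_add(&a);
        printf("%u records packed\n", nentries);
        return 0;
    }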
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/mce.h
--- a/xen/arch/x86/cpu/mcheck/mce.h	Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/arch/x86/cpu/mcheck/mce.h	Fri Jul 04 14:48:37 2008 +0200
@@ -1,14 +1,30 @@
 #include <xen/init.h>
+#include <asm/traps.h>
 
-void amd_mcheck_init(struct cpuinfo_x86 *c);
+/* Init functions */
+void amd_nonfatal_mcheck_init(struct cpuinfo_x86 *c);
+void amd_k7_mcheck_init(struct cpuinfo_x86 *c);
+void amd_k8_mcheck_init(struct cpuinfo_x86 *c);
+void amd_f10_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p4_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p5_mcheck_init(struct cpuinfo_x86 *c);
 void intel_p6_mcheck_init(struct cpuinfo_x86 *c);
 void winchip_mcheck_init(struct cpuinfo_x86 *c);
 
-/* Call the installed machine check handler for this CPU setup. */
-extern fastcall void (*machine_check_vector)(struct cpu_user_regs *, long error_code);
+/* Function pointer used in the handlers to collect additional information
+ * provided by newer CPU families/models without the need to duplicate
+ * the whole handler, resulting in various handlers each with its own
+ * tweaks and bugs */
+extern int (*mc_callback_bank_extended)(struct mc_info *mi,
+		uint16_t bank, uint64_t status);
+
+/* Helper functions used for collecting error telemetry */
+struct mc_info *x86_mcinfo_getptr(void);
+void x86_mcinfo_clear(struct mc_info *mi);
+int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
+void x86_mcinfo_dump(struct mc_info *mi);
+
+/* Global variables */
 extern int mce_disabled __initdata;
-extern int nr_mce_banks;
-
+extern unsigned int nr_mce_banks;
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/non-fatal.c
--- a/xen/arch/x86/cpu/mcheck/non-fatal.c	Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/arch/x86/cpu/mcheck/non-fatal.c	Fri Jul 04 14:48:37 2008 +0200
@@ -68,19 +68,29 @@ static int __init init_nonfatal_mce_chec
 	if (!cpu_has(c, X86_FEATURE_MCA))
 		return -ENODEV;
 
-	/* Some Athlons misbehave when we frob bank 0 */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-		boot_cpu_data.x86 == 6)
-		firstbank = 1;
-	else
-		firstbank = 0;
-
 	/*
 	 * Check for non-fatal errors every MCE_RATE s
 	 */
-	init_timer(&mce_timer, mce_work_fn, NULL, 0);
-	set_timer(&mce_timer, NOW() + MCE_PERIOD);
-	printk(KERN_INFO "Machine check exception polling timer started.\n");
+	switch (c->x86_vendor) {
+	case X86_VENDOR_AMD:
+		if (c->x86 == 6) { /* K7 */
+			firstbank = 1;
+			init_timer(&mce_timer, mce_work_fn, NULL, 0);
+			set_timer(&mce_timer, NOW() + MCE_PERIOD);
+			break;
+		}
+
+		/* Assume we are on K8 or newer AMD CPU here */
+		amd_nonfatal_mcheck_init(c);
+		break;
+
+	case X86_VENDOR_INTEL:
+		init_timer(&mce_timer, mce_work_fn, NULL, 0);
+		set_timer(&mce_timer, NOW() + MCE_PERIOD);
+		break;
+	}
+
+	printk(KERN_INFO "MCA: Machine check polling timer started.\n");
 	return 0;
 }
 __initcall(init_nonfatal_mce_checker);
diff -r 959db3c01837 xen/arch/x86/cpu/mcheck/x86_mca.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h	Fri Jul 04 14:48:37 2008 +0200
@@ -0,0 +1,72 @@
+/*
+ * MCA implementation for AMD K7/K8 CPUs
+ * Copyright (c) 2007 Advanced Micro Devices, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+
+/* The MCA/MCE MSRs should not be used anywhere else.
+ * They are cpu family/model specific and are only for use
+ * in terms of machine check handling.
+ * So we define them here rather than in <asm/msr.h>.
+ */
+
+
+/* Bitfield of the MSR_IA32_MCG_CAP register */
+#define MCG_CAP_COUNT           0x00000000000000ffULL
+#define MCG_CTL_P               0x0000000000000100ULL
+/* Bits 9-63 are reserved */
+
+/* Bitfield of the MSR_IA32_MCG_STATUS register */
+#define MCG_STATUS_RIPV         0x0000000000000001ULL
+#define MCG_STATUS_EIPV         0x0000000000000002ULL
+#define MCG_STATUS_MCIP         0x0000000000000004ULL
+/* Bits 3-63 are reserved */
+
+/* Bitfield of MSR_K8_MCi_STATUS registers */
+/* MCA error code */
+#define MCi_STATUS_MCA          0x000000000000ffffULL
+/* model-specific error code */
+#define MCi_STATUS_MSEC         0x00000000ffff0000ULL
+/* Other information */
+#define MCi_STATUS_OTHER        0x01ffffff00000000ULL
+/* processor context corrupt */
+#define MCi_STATUS_PCC          0x0200000000000000ULL
+/* MSR_K8_MCi_ADDR register valid */
+#define MCi_STATUS_ADDRV        0x0400000000000000ULL
+/* MSR_K8_MCi_MISC register valid */
+#define MCi_STATUS_MISCV        0x0800000000000000ULL
+/* error condition enabled */
+#define MCi_STATUS_EN           0x1000000000000000ULL
+/* uncorrected error */
+#define MCi_STATUS_UC           0x2000000000000000ULL
+/* status register overflow */
+#define MCi_STATUS_OVER         0x4000000000000000ULL
+/* valid */
+#define MCi_STATUS_VAL          0x8000000000000000ULL
+
+/* Bitfield of the MCi_STATUS_OTHER field */
+/* reserved bits */
+#define MCi_STATUS_OTHER_RESERVED1      0x00001fff00000000ULL
+/* uncorrectable ECC error */
+#define MCi_STATUS_OTHER_UC_ECC         0x0000200000000000ULL
+/* correctable ECC error */
+#define MCi_STATUS_OTHER_C_ECC          0x0000400000000000ULL
+/* ECC syndrome of an ECC error */
+#define MCi_STATUS_OTHER_ECC_SYNDROME   0x007f800000000000ULL
+/* reserved bits */
+#define MCi_STATUS_OTHER_RESERVED2      0x0180000000000000ULL
+
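The MCi_STATUS masks above can be exercised outside the hypervisor. A standalone user-space sketch, not part of the patch, that decodes a status value with the same constants; the sample value is made up for illustration:

    #include <stdio.h>
    #include <stdint.h>

    #define MCi_STATUS_MCA   0x000000000000ffffULL /* MCA error code */
    #define MCi_STATUS_PCC   0x0200000000000000ULL /* context corrupt */
    #define MCi_STATUS_ADDRV 0x0400000000000000ULL /* MCi_ADDR valid */
    #define MCi_STATUS_MISCV 0x0800000000000000ULL /* MCi_MISC valid */
    #define MCi_STATUS_UC    0x2000000000000000ULL /* uncorrected */
    #define MCi_STATUS_OVER  0x4000000000000000ULL /* overflow */
    #define MCi_STATUS_VAL   0x8000000000000000ULL /* valid */

    static void decode_status(uint64_t status)
    {
        if (!(status & MCi_STATUS_VAL)) {
            printf("no valid error logged\n");
            return;
        }
        printf("MCA error code %#06llx%s%s%s%s%s\n",
               (unsigned long long)(status & MCi_STATUS_MCA),
               (status & MCi_STATUS_UC)    ? " uncorrected" : " corrected",
               (status & MCi_STATUS_OVER)  ? " overflow" : "",
               (status & MCi_STATUS_PCC)   ? " context-corrupt" : "",
               (status & MCi_STATUS_ADDRV) ? " addr-valid" : "",
               (status & MCi_STATUS_MISCV) ? " misc-valid" : "");
    }

    int main(void)
    {
        decode_status(0xb400000000000151ULL); /* VAL|UC|EN|ADDRV, code 0x151 */
        return 0;
    }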
"masked " : ""); else printk("dom0 vcpu0: NMI neither pending nor masked\n"); } diff -r 959db3c01837 xen/arch/x86/traps.c --- a/xen/arch/x86/traps.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/traps.c Fri Jul 04 14:48:37 2008 +0200 @@ -487,6 +487,20 @@ static unsigned int check_guest_io_break } /* + * Called from asm to set up the MCE trapbounce info. + * Returns 0 if no callback is set up, else 1. + */ +asmlinkage int set_guest_machinecheck_trapbounce(void) +{ + struct vcpu *v = current; + struct trap_bounce *tb = &v->arch.trap_bounce; + + do_guest_trap(TRAP_machine_check, guest_cpu_user_regs(), 0); + tb->flags &= ~TBF_EXCEPTION; /* not needed for MCE delivery path */ + return !null_trap_bounce(v, tb); +} + +/* * Called from asm to set up the NMI trapbounce info. * Returns 0 if no callback is set up, else 1. */ @@ -905,8 +919,6 @@ asmlinkage void do_int3(struct cpu_user_ asmlinkage void do_machine_check(struct cpu_user_regs *regs) { - extern fastcall void (*machine_check_vector)( - struct cpu_user_regs *, long error_code); machine_check_vector(regs, regs->error_code); } @@ -3037,6 +3049,24 @@ long unregister_guest_nmi_callback(void) return 0; } +int guest_has_trap_callback(struct domain *d, uint16_t vcpuid, unsigned int trap_nr) +{ + struct vcpu *v; + struct trap_info *t; + + BUG_ON(d == NULL); + BUG_ON(vcpuid >= MAX_VIRT_CPUS); + + /* Sanity check - XXX should be more fine grained. */ + BUG_ON(trap_nr > TRAP_syscall); + + v = d->vcpu[vcpuid]; + t = &v->arch.guest_context.trap_ctxt[trap_nr]; + + return (t->address != 0); +} + + int send_guest_trap(struct domain *d, uint16_t vcpuid, unsigned int trap_nr) { struct vcpu *v; @@ -3057,6 +3087,23 @@ int send_guest_trap(struct domain *d, ui /* not safe to wake up a vcpu here */ raise_softirq(NMI_MCE_SOFTIRQ); return 0; + } + break; + + case TRAP_machine_check: + + /* We are called by the machine check (exception or polling) handlers + * on the physical CPU that reported a machine check error. 
*/ + + if ( !test_and_set_bool(v->mce_pending) ) { + st = &per_cpu(softirq_trap, smp_processor_id()); + st->domain = d; + st->vcpu = v; + st->processor = v->processor; + + /* not safe to wake up a vcpu here */ + raise_softirq(NMI_MCE_SOFTIRQ); + return 0; } break; } diff -r 959db3c01837 xen/arch/x86/x86_32/asm-offsets.c --- a/xen/arch/x86/x86_32/asm-offsets.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_32/asm-offsets.c Fri Jul 04 14:48:37 2008 +0200 @@ -67,7 +67,11 @@ void __dummy__(void) arch.guest_context.kernel_sp); OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags); OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending); - OFFSET(VCPU_nmi_masked, struct vcpu, nmi_masked); + OFFSET(VCPU_mce_pending, struct vcpu, mce_pending); + OFFSET(VCPU_old_trap_priority, struct vcpu, old_trap_priority); + OFFSET(VCPU_trap_priority, struct vcpu, trap_priority); + DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI); + DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE); DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events); BLANK(); diff -r 959db3c01837 xen/arch/x86/x86_32/entry.S --- a/xen/arch/x86/x86_32/entry.S Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_32/entry.S Fri Jul 04 14:48:37 2008 +0200 @@ -229,6 +229,8 @@ test_all_events: shl $IRQSTAT_shift,%eax test %ecx,irq_stat(%eax,1) jnz process_softirqs + testb $1,VCPU_mce_pending(%ebx) + jnz process_mce testb $1,VCPU_nmi_pending(%ebx) jnz process_nmi test_guest_events: @@ -255,15 +257,35 @@ process_softirqs: jmp test_all_events ALIGN +/* %ebx: struct vcpu */ +process_mce: + cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%ebx) + jae test_guest_events + sti + movb $0,VCPU_mce_pending(%ebx) + call set_guest_machinecheck_trapbounce + test %eax,%eax + jz test_all_events + movw VCPU_trap_priority(%ebx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%ebx) # iret hypercall + movw $VCPU_TRAP_MCE,VCPU_trap_priority(%ebx) + jmp process_trap + + ALIGN +/* %ebx: struct vcpu */ process_nmi: - testb $1,VCPU_nmi_masked(%ebx) - jnz test_guest_events + cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%ebx) + jae test_guest_events sti movb $0,VCPU_nmi_pending(%ebx) call set_guest_nmi_trapbounce test %eax,%eax jz test_all_events - movb $1,VCPU_nmi_masked(%ebx) + movw VCPU_trap_priority(%ebx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%ebx) # iret hypercall + movw $VCPU_TRAP_NMI,VCPU_trap_priority(%ebx) + /* FALLTHROUGH */ +process_trap: leal VCPU_trap_bounce(%ebx),%edx call create_bounce_frame jmp test_all_events @@ -681,6 +703,10 @@ ENTRY(hypercall_table) .long do_sysctl /* 35 */ .long do_domctl .long do_kexec_op + .rept __HYPERVISOR_arch_0-((.-hypercall_table)/4) + .long do_ni_hypercall + .endr + .long do_mca /* 48 */ .rept NR_hypercalls-((.-hypercall_table)/4) .long do_ni_hypercall .endr @@ -724,6 +750,10 @@ ENTRY(hypercall_args_table) .byte 1 /* do_sysctl */ /* 35 */ .byte 1 /* do_domctl */ .byte 2 /* do_kexec_op */ + .rept __HYPERVISOR_arch_0-(.-hypercall_args_table) + .byte 0 /* do_ni_hypercall */ + .endr + .byte 1 /* do_mca */ /* 48 */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr diff -r 959db3c01837 xen/arch/x86/x86_32/traps.c --- a/xen/arch/x86/x86_32/traps.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_32/traps.c Fri Jul 04 14:48:37 2008 +0200 @@ -256,11 +256,12 @@ unsigned long do_iret(void) } /* Restore affinity. 
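The assembly above implements a small priority scheme: a pending trap is delivered only if it outranks the trap the vcpu is currently handling, and the old priority is saved so the iret hypercall can restore it. A standalone C model of that rule, not part of the patch; it assumes the VCPU_TRAP_* ordering NONE < NMI < MCE implied by the cmpw checks:

    #include <stdio.h>

    #define VCPU_TRAP_NONE 0
    #define VCPU_TRAP_NMI  1
    #define VCPU_TRAP_MCE  2 /* MCE outranks NMI */

    struct vcpu_model {
        int trap_priority;     /* priority of the trap handled now */
        int old_trap_priority; /* restored by the iret hypercall */
    };

    /* Try to deliver a trap; mirrors process_mce/process_nmi. */
    static int deliver(struct vcpu_model *v, int prio, const char *name)
    {
        if (v->trap_priority >= prio) {
            printf("%s deferred (current priority %d)\n", name, v->trap_priority);
            return 0;
        }
        v->old_trap_priority = v->trap_priority; /* save for iret */
        v->trap_priority = prio;
        printf("%s delivered\n", name);
        return 1;
    }

    int main(void)
    {
        struct vcpu_model v = { VCPU_TRAP_NONE, VCPU_TRAP_NONE };

        deliver(&v, VCPU_TRAP_NMI, "NMI"); /* delivered */
        deliver(&v, VCPU_TRAP_NMI, "NMI"); /* deferred: already at NMI */
        deliver(&v, VCPU_TRAP_MCE, "MCE"); /* delivered: MCE outranks NMI */
        return 0;
    }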
*/ - if (v->nmi_masked && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) + if ((v->trap_priority >= VCPU_TRAP_NMI) + && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) vcpu_set_affinity(v, &v->cpu_affinity_tmp); - /* No longer in NMI context. */ - v->nmi_masked = 0; + /* Restore previous trap priority */ + v->trap_priority = v->old_trap_priority; /* Restore upcall mask from supplied EFLAGS.IF. */ vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF); diff -r 959db3c01837 xen/arch/x86/x86_64/asm-offsets.c --- a/xen/arch/x86/x86_64/asm-offsets.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_64/asm-offsets.c Fri Jul 04 14:48:37 2008 +0200 @@ -92,7 +92,11 @@ void __dummy__(void) OFFSET(VCPU_kernel_ss, struct vcpu, arch.guest_context.kernel_ss); OFFSET(VCPU_guest_context_flags, struct vcpu, arch.guest_context.flags); OFFSET(VCPU_nmi_pending, struct vcpu, nmi_pending); - OFFSET(VCPU_nmi_masked, struct vcpu, nmi_masked); + OFFSET(VCPU_mce_pending, struct vcpu, mce_pending); + OFFSET(VCPU_old_trap_priority, struct vcpu, old_trap_priority); + OFFSET(VCPU_trap_priority, struct vcpu, trap_priority); + DEFINE(VCPU_TRAP_NMI, VCPU_TRAP_NMI); + DEFINE(VCPU_TRAP_MCE, VCPU_TRAP_MCE); DEFINE(_VGCF_failsafe_disables_events, _VGCF_failsafe_disables_events); DEFINE(_VGCF_syscall_disables_events, _VGCF_syscall_disables_events); BLANK(); diff -r 959db3c01837 xen/arch/x86/x86_64/compat/entry.S --- a/xen/arch/x86/x86_64/compat/entry.S Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_64/compat/entry.S Fri Jul 04 14:48:37 2008 +0200 @@ -101,6 +101,8 @@ ENTRY(compat_test_all_events) leaq irq_stat(%rip),%rcx testl $~0,(%rcx,%rax,1) jnz compat_process_softirqs + testb $1,VCPU_mce_pending(%rbx) + jnz compat_process_mce testb $1,VCPU_nmi_pending(%rbx) jnz compat_process_nmi compat_test_guest_events: @@ -129,15 +131,34 @@ compat_process_softirqs: ALIGN /* %rbx: struct vcpu */ +compat_process_mce: + cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx) + jae compat_test_guest_events + sti + movb $0,VCPU_mce_pending(%rbx) + call set_guest_machinecheck_trapbounce + testl %eax,%eax + jz compat_test_all_events + movw VCPU_trap_priority(%rbx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%rbx) # iret hypercall + movw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx) + jmp compat_process_trap + + ALIGN +/* %rbx: struct vcpu */ compat_process_nmi: - testb $1,VCPU_nmi_masked(%rbx) - jnz compat_test_guest_events + cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx) + jae compat_test_guest_events sti movb $0,VCPU_nmi_pending(%rbx) call set_guest_nmi_trapbounce testl %eax,%eax jz compat_test_all_events - movb $1,VCPU_nmi_masked(%rbx) + movw VCPU_trap_priority(%rbx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%rbx) # iret hypercall + movw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx) + /* FALLTHROUGH */ +compat_process_trap: leaq VCPU_trap_bounce(%rbx),%rdx call compat_create_bounce_frame jmp compat_test_all_events @@ -386,6 +407,10 @@ ENTRY(compat_hypercall_table) .quad do_sysctl /* 35 */ .quad do_domctl .quad compat_kexec_op + .rept __HYPERVISOR_arch_0-((.-compat_hypercall_table)/8) + .quad compat_ni_hypercall + .endr + .quad do_mca /* 48 */ .rept NR_hypercalls-((.-compat_hypercall_table)/8) .quad compat_ni_hypercall .endr @@ -429,6 +454,10 @@ ENTRY(compat_hypercall_args_table) .byte 1 /* do_sysctl */ /* 35 */ .byte 1 /* do_domctl */ .byte 2 /* compat_kexec_op */ + .rept __HYPERVISOR_arch_0-(.-compat_hypercall_args_table) + .byte 0 /* compat_ni_hypercall */ + .endr + .byte 1 /* do_mca */ .rept 
NR_hypercalls-(.-compat_hypercall_args_table) .byte 0 /* compat_ni_hypercall */ .endr diff -r 959db3c01837 xen/arch/x86/x86_64/compat/traps.c --- a/xen/arch/x86/x86_64/compat/traps.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_64/compat/traps.c Fri Jul 04 14:48:37 2008 +0200 @@ -122,11 +122,12 @@ unsigned int compat_iret(void) regs->_esp += 16; /* Restore affinity. */ - if (v->nmi_masked && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) + if ((v->trap_priority >= VCPU_TRAP_NMI) + && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) vcpu_set_affinity(v, &v->cpu_affinity_tmp); - /* No longer in NMI context. */ - v->nmi_masked = 0; + /* Restore previous trap priority */ + v->trap_priority = v->old_trap_priority; /* Restore upcall mask from supplied EFLAGS.IF. */ vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF); diff -r 959db3c01837 xen/arch/x86/x86_64/entry.S --- a/xen/arch/x86/x86_64/entry.S Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_64/entry.S Fri Jul 04 14:48:37 2008 +0200 @@ -205,6 +205,8 @@ test_all_events: leaq irq_stat(%rip),%rcx testl $~0,(%rcx,%rax,1) jnz process_softirqs + testb $1,VCPU_mce_pending(%rbx) + jnz process_mce testb $1,VCPU_nmi_pending(%rbx) jnz process_nmi test_guest_events: @@ -231,15 +233,34 @@ process_softirqs: ALIGN /* %rbx: struct vcpu */ +process_mce: + cmpw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx) + jae test_guest_events + sti + movb $0,VCPU_mce_pending(%rbx) + call set_guest_machinecheck_trapbounce + test %eax,%eax + jz test_all_events + movw VCPU_trap_priority(%rbx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%rbx) # iret hypercall + movw $VCPU_TRAP_MCE,VCPU_trap_priority(%rbx) + jmp process_trap + + ALIGN +/* %rbx: struct vcpu */ process_nmi: - testb $1,VCPU_nmi_masked(%rbx) - jnz test_guest_events + cmpw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx) + jae test_guest_events sti movb $0,VCPU_nmi_pending(%rbx) call set_guest_nmi_trapbounce test %eax,%eax jz test_all_events - movb $1,VCPU_nmi_masked(%rbx) + movw VCPU_trap_priority(%rbx),%dx # safe priority for the + movw %dx,VCPU_old_trap_priority(%rbx) # iret hypercall + movw $VCPU_TRAP_NMI,VCPU_trap_priority(%rbx) + /* FALLTHROUGH */ +process_trap: leaq VCPU_trap_bounce(%rbx),%rdx call create_bounce_frame jmp test_all_events @@ -671,6 +692,10 @@ ENTRY(hypercall_table) .quad do_sysctl /* 35 */ .quad do_domctl .quad do_kexec_op + .rept __HYPERVISOR_arch_0-((.-hypercall_table)/8) + .quad do_ni_hypercall + .endr + .quad do_mca /* 48 */ .rept NR_hypercalls-((.-hypercall_table)/8) .quad do_ni_hypercall .endr @@ -715,6 +740,10 @@ ENTRY(hypercall_args_table) .byte 1 /* do_domctl */ .byte 2 /* do_kexec */ .byte 1 /* do_xsm_op */ + .rept __HYPERVISOR_arch_0-(.-hypercall_args_table) + .byte 0 /* do_ni_hypercall */ + .endr + .byte 1 /* do_mca */ /* 48 */ .rept NR_hypercalls-(.-hypercall_args_table) .byte 0 /* do_ni_hypercall */ .endr diff -r 959db3c01837 xen/arch/x86/x86_64/traps.c --- a/xen/arch/x86/x86_64/traps.c Fri Jul 04 14:41:35 2008 +0200 +++ b/xen/arch/x86/x86_64/traps.c Fri Jul 04 14:48:37 2008 +0200 @@ -289,11 +289,12 @@ unsigned long do_iret(void) } /* Restore affinity. */ - if (v->nmi_masked && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) + if ((v->trap_priority >= VCPU_TRAP_NMI) + && !cpus_equal(v->cpu_affinity_tmp, v->cpu_affinity)) vcpu_set_affinity(v, &v->cpu_affinity_tmp); - /* No longer in NMI context. 
diff -r 959db3c01837 xen/common/domain.c
--- a/xen/common/domain.c Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/common/domain.c Fri Jul 04 14:48:37 2008 +0200
@@ -654,7 +654,9 @@ void vcpu_reset(struct vcpu *v)
     v->is_polling      = 0;
     v->is_initialised  = 0;
     v->nmi_pending     = 0;
-    v->nmi_masked      = 0;
+    v->mce_pending     = 0;
+    v->old_trap_priority = VCPU_TRAP_NONE;
+    v->trap_priority   = VCPU_TRAP_NONE;
     clear_bit(_VPF_blocked, &v->pause_flags);

     domain_unlock(v->domain);
diff -r 959db3c01837 xen/common/event_channel.c
--- a/xen/common/event_channel.c Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/common/event_channel.c Fri Jul 04 14:48:37 2008 +0200
@@ -587,6 +587,21 @@ void send_guest_vcpu_virq(struct vcpu *v
     evtchn_set_pending(v, port);
 }

+int guest_enabled_event(struct vcpu *v, int virq)
+{
+    int port;
+
+    if ( unlikely(v == NULL) )
+        return 0;
+
+    port = v->virq_to_evtchn[virq];
+    if ( port == 0 )
+        return 0;
+
+    /* virq is in use */
+    return 1;
+}
+
 void send_guest_global_virq(struct domain *d, int virq)
 {
     int port;
diff -r 959db3c01837 xen/include/Makefile
--- a/xen/include/Makefile Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/Makefile Fri Jul 04 14:48:37 2008 +0200
@@ -20,6 +20,7 @@ headers-y := \
     compat/xen.h \
     compat/xencomm.h \
     compat/xenoprof.h
+headers-$(CONFIG_X86) += compat/arch-x86/xen-mca.h
 headers-$(CONFIG_X86) += compat/arch-x86/xen.h
 headers-$(CONFIG_X86) += compat/arch-x86/xen-$(compat-arch-y).h
 headers-y += compat/arch-$(compat-arch-y).h compat/xlat.h
diff -r 959db3c01837 xen/include/asm-x86/event.h
--- a/xen/include/asm-x86/event.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/asm-x86/event.h Fri Jul 04 14:48:37 2008 +0200
@@ -69,7 +69,12 @@ static inline void local_event_delivery_
 /* No arch specific virq definition now. Default to global. */
 static inline int arch_virq_is_global(int virq)
 {
-    return 1;
+    switch (virq) {
+    case VIRQ_MCA:
+        return 1;
+    default:
+        return 1;
+    }
 }

 #endif
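guest_enabled_event() gives the machine check code a cheap way to ask whether
Dom0 ever bound VIRQ_MCA before raising it. A hypothetical caller (the actual
notification site lives in the mcheck code, not in this hunk) could look like:

    /* Illustrative caller: raise VIRQ_MCA only if Dom0 bound it. */
    static void notify_dom0_mca_sketch(struct domain *dom0)
    {
        if ( guest_enabled_event(dom0->vcpu[0], VIRQ_MCA) )
            send_guest_global_virq(dom0, VIRQ_MCA);
    }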
diff -r 959db3c01837 xen/include/asm-x86/mm.h
--- a/xen/include/asm-x86/mm.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/asm-x86/mm.h Fri Jul 04 14:48:37 2008 +0200
@@ -141,6 +141,9 @@ static inline u32 pickle_domptr(struct d
 #define page_get_owner(_p)    (unpickle_domptr((_p)->u.inuse._domain))
 #define page_set_owner(_p,_d) ((_p)->u.inuse._domain = pickle_domptr(_d))

+#define maddr_get_owner(ma)   (page_get_owner(maddr_to_page((ma))))
+#define vaddr_get_owner(va)   (page_get_owner(virt_to_page((va))))
+
 #define XENSHARE_writable 0
 #define XENSHARE_readonly 1
 extern void share_xen_page_with_guest(
diff -r 959db3c01837 xen/include/asm-x86/traps.h
--- a/xen/include/asm-x86/traps.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/asm-x86/traps.h Fri Jul 04 14:48:37 2008 +0200
@@ -26,6 +26,18 @@ struct softirq_trap {
     int processor;     /* physical cpu to inject trap */
 };

+struct cpu_user_regs;
+
+extern void (*machine_check_vector)(struct cpu_user_regs *regs, long error_code);
+
+/**
+ * guest_has_trap_callback
+ *
+ * returns true (non-zero) if guest registered a trap handler
+ */
+extern int guest_has_trap_callback(struct domain *d, uint16_t vcpuid,
+                                   unsigned int trap_nr);
+
 /**
  * send_guest_trap
  *
@@ -35,5 +47,4 @@ extern int send_guest_trap(struct domain
 extern int send_guest_trap(struct domain *d, uint16_t vcpuid,
                            unsigned int trap_nr);
-
 #endif /* ASM_TRAP_H */
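The two declarations above split the uncorrectable-error path into a query and
an action: check that the guest installed a machine check handler, then bounce
the trap into it. A hedged sketch of how a caller might combine them (the
function below is illustrative; TRAP_machine_check is Xen's existing vector 18
definition):

    /* Illustrative caller, not part of this patch: only bounce a machine
     * check into the guest if it actually installed a handler; otherwise
     * the caller must fall back to another recovery strategy. */
    static int deliver_mce_sketch(struct domain *d, uint16_t vcpuid)
    {
        if ( !guest_has_trap_callback(d, vcpuid, TRAP_machine_check) )
            return -1;

        return send_guest_trap(d, vcpuid, TRAP_machine_check);
    }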
diff -r 959db3c01837 xen/include/public/arch-x86/xen-mca.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/include/public/arch-x86/xen-mca.h Fri Jul 04 14:48:37 2008 +0200
@@ -0,0 +1,279 @@
+/******************************************************************************
+ * arch-x86/mca.h
+ *
+ * Contributed by Advanced Micro Devices, Inc.
+ * Author: Christoph Egger
+ *
+ * Guest OS machine check interface to x86 Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/* Full MCA functionality has the following usecases from the guest side:
+ *
+ * Must-haves:
+ * 1. Dom0 and DomU register machine check trap callback handlers
+ *    (already done via "set_trap_table" hypercall)
+ * 2. Dom0 registers machine check event callback handler
+ *    (doable via EVTCHNOP_bind_virq)
+ * 3. Dom0 and DomU fetch machine check data
+ * 4. Dom0 wants Xen to notify a DomU
+ * 5. Dom0 gets DomU ID from physical address
+ * 6. Dom0 wants Xen to kill DomU (already done for "xm destroy")
+ *
+ * Nice-to-haves:
+ * 7. Dom0 wants Xen to deactivate a physical CPU
+ *    This is better done as a separate task, physical CPU hotplugging,
+ *    and hypercall(s) should be sysctls
+ * 8. Page migration proposed from Xen NUMA work, where Dom0 can tell Xen to
+ *    move a DomU (or Dom0 itself) away from a malicious page
+ *    producing correctable errors.
+ * 9. Offlining physical page:
+ *    Xen frees and never re-uses a certain physical page.
+ * 10. Test facility: Allow Dom0 to write values into machine check MSRs
+ *     and tell Xen to trigger a machine check
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_MCA_H__
+#define __XEN_PUBLIC_ARCH_X86_MCA_H__
+
+/* Hypercall */
+#define __HYPERVISOR_mca __HYPERVISOR_arch_0
+
+#define XEN_MCA_INTERFACE_VERSION 0x03000001
+
+/* IN: Dom0 calls hypercall from MC event handler. */
+#define XEN_MC_CORRECTABLE  0x0
+/* IN: Dom0/DomU calls hypercall from MC trap handler. */
+#define XEN_MC_TRAP         0x1
+/* XEN_MC_CORRECTABLE and XEN_MC_TRAP are mutually exclusive. */
+
+/* OUT: All is ok */
+#define XEN_MC_OK           0x0
+/* OUT: Domain could not fetch data. */
+#define XEN_MC_FETCHFAILED  0x1
+/* OUT: There was no machine check data to fetch. */
+#define XEN_MC_NODATA       0x2
+/* OUT: Between notification time and this hypercall another
+ * (most likely correctable) error happened. The fetched data
+ * does not match the original machine check data. */
+#define XEN_MC_NOMATCH      0x4
+
+/* OUT: DomU did not register MC NMI handler. Try something else. */
+#define XEN_MC_CANNOTHANDLE 0x8
+/* OUT: Notifying DomU failed. Retry later or try something else. */
+#define XEN_MC_NOTDELIVERED 0x10
+/* Note, XEN_MC_CANNOTHANDLE and XEN_MC_NOTDELIVERED are mutually exclusive. */
+
+
+#ifndef __ASSEMBLY__
+
+#define VIRQ_MCA VIRQ_ARCH_0 /* G. (DOM0) Machine Check Architecture */
+
+/*
+ * Machine Check Architecture:
+ * structs are read-only and used to report all kinds of
+ * correctable and uncorrectable errors detected by the HW.
+ * Dom0 and DomU: register a handler to get notified.
+ * Dom0 only: Correctable errors are reported via VIRQ_MCA
+ * Dom0 and DomU: Uncorrectable errors are reported via nmi handlers
+ */
+#define MC_TYPE_GLOBAL   0
+#define MC_TYPE_BANK     1
+#define MC_TYPE_EXTENDED 2
+
+struct mcinfo_common {
+    uint16_t type; /* structure type */
+    uint16_t size; /* size of this struct in bytes */
+};
+
+
+#define MC_FLAG_CORRECTABLE   (1 << 0)
+#define MC_FLAG_UNCORRECTABLE (1 << 1)
+
+/* contains global x86 mc information */
+struct mcinfo_global {
+    struct mcinfo_common common;
+
+    /* running domain at the time in error (most likely the impacted one) */
+    uint16_t mc_domid;
+    uint32_t mc_socketid;      /* physical socket of the physical core */
+    uint16_t mc_coreid;        /* physical impacted core */
+    uint16_t mc_core_threadid; /* core thread of physical core */
+    uint16_t mc_vcpuid;        /* virtual cpu scheduled for mc_domid */
+    uint64_t mc_gstatus;       /* global status */
+    uint32_t mc_flags;
+};
+
+/* contains bank local x86 mc information */
+struct mcinfo_bank {
+    struct mcinfo_common common;
+
+    uint16_t mc_bank;   /* bank nr */
+    uint16_t mc_domid;  /* Usecase 5: domain referenced by mc_addr on dom0
+                         * and if mc_addr is valid. Never valid on DomU. */
+    uint64_t mc_status; /* bank status */
+    uint64_t mc_addr;   /* bank address, only valid
+                         * if addr bit is set in mc_status */
+    uint64_t mc_misc;
+};
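The mc_addr comment above follows the x86 MCA convention that the address
register is only architecturally valid when the ADDRV bit of the bank status
is set. A small illustrative decoder (the bit positions are the standard
MCi_STATUS layout, not definitions from this header):

    #include <stdint.h>

    #define MCi_STATUS_VAL   (1ULL << 63)  /* bank status valid */
    #define MCi_STATUS_ADDRV (1ULL << 58)  /* mc_addr valid */

    /* Returns 1 and stores the error address if mc_addr is valid. */
    static int bank_address(uint64_t mc_status, uint64_t mc_addr,
                            uint64_t *addr_out)
    {
        if (!(mc_status & MCi_STATUS_VAL) || !(mc_status & MCi_STATUS_ADDRV))
            return 0;
        *addr_out = mc_addr;
        return 1;
    }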
+
+struct mcinfo_msr {
+    uint64_t reg;   /* MSR */
+    uint64_t value; /* MSR value */
+};
+
+/* contains mc information from other
+ * or additional mc MSRs */
+struct mcinfo_extended {
+    struct mcinfo_common common;
+
+    /* You can fill up to five registers.
+     * If you need more, then use this structure
+     * multiple times. */
+
+    uint32_t mc_msrs; /* Number of MSRs with valid values. */
+    struct mcinfo_msr mc_msr[5];
+};
+
+#define MCINFO_HYPERCALLSIZE 1024
+#define MCINFO_MAXSIZE        768
+
+struct mc_info {
+    /* Number of mcinfo_* entries in mi_data */
+    uint32_t mi_nentries;
+
+    uint8_t mi_data[MCINFO_MAXSIZE - sizeof(uint32_t)];
+};
+typedef struct mc_info mc_info_t;
+
+
+
+/*
+ * OSes should use these instead of writing their own lookup function,
+ * each with its own bugs and drawbacks.
+ * We use macros instead of static inline functions to allow guests
+ * to include this header in assembly files (*.S).
+ */
+/* Prototype:
+ * uint32_t x86_mcinfo_nentries(struct mc_info *mi);
+ */
+#define x86_mcinfo_nentries(_mi)    \
+    (_mi)->mi_nentries
+/* Prototype:
+ * struct mcinfo_common *x86_mcinfo_first(struct mc_info *mi);
+ */
+#define x86_mcinfo_first(_mi)       \
+    (struct mcinfo_common *)((_mi)->mi_data)
+/* Prototype:
+ * struct mcinfo_common *x86_mcinfo_next(struct mcinfo_common *mic);
+ */
+#define x86_mcinfo_next(_mic)       \
+    (struct mcinfo_common *)((uint8_t *)(_mic) + (_mic)->size)
+
+/* Prototype:
+ * void x86_mcinfo_lookup(void *ret, struct mc_info *mi, uint16_t type);
+ */
+#define x86_mcinfo_lookup(_ret, _mi, _type)                 \
+    do {                                                    \
+        uint32_t found, i;                                  \
+        struct mcinfo_common *_mic;                         \
+                                                            \
+        found = 0;                                          \
+        (_ret) = NULL;                                      \
+        if (_mi == NULL) break;                             \
+        _mic = x86_mcinfo_first(_mi);                       \
+        for (i = 0; i < x86_mcinfo_nentries(_mi); i++) {    \
+            if (_mic->type == (_type)) {                    \
+                found = 1;                                  \
+                break;                                      \
+            }                                               \
+            _mic = x86_mcinfo_next(_mic);                   \
+        }                                                   \
+        (_ret) = found ? _mic : NULL;                       \
+    } while (0)
+
+
+/* Usecase 1
+ * Register machine check trap callback handler
+ * (already done via "set_trap_table" hypercall)
+ */
+
+/* Usecase 2
+ * Dom0 registers machine check event callback handler
+ * done by EVTCHNOP_bind_virq
+ */
+
+/* Usecase 3
+ * Fetch machine check data from hypervisor.
+ * Note: this hypercall is special, because both Dom0 and DomU must use it.
+ */
+#define XEN_MC_fetch 1
+struct xen_mc_fetch {
+    /* IN/OUT variables. */
+    uint32_t flags;
+
+/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
+/* OUT: XEN_MC_OK, XEN_MC_FETCHFAILED, XEN_MC_NODATA, XEN_MC_NOMATCH */
+
+    /* OUT variables. */
+    uint32_t fetch_idx; /* only useful for Dom0 for the notify hypercall */
+    struct mc_info mc_info;
+};
+typedef struct xen_mc_fetch xen_mc_fetch_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_fetch_t);
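As the comment says, guests should use the lookup macros rather than
open-coding the walk over mi_data. A hypothetical Dom0 fetch path, sketched
under the assumption of a HYPERVISOR_mca() hypercall wrapper in the guest
kernel (the wrapper and memset source are not defined by this header):

    /* Illustrative guest-side fetch, not part of the patch. */
    static void fetch_mc_sketch(void)
    {
        struct xen_mc mc;
        struct mcinfo_common *mic;
        struct mcinfo_global *mig;

        memset(&mc, 0, sizeof(mc));   /* guest's own string routines */
        mc.cmd = XEN_MC_fetch;
        mc.interface_version = XEN_MCA_INTERFACE_VERSION;
        mc.u.mc_fetch.flags = XEN_MC_CORRECTABLE;

        if (HYPERVISOR_mca(&mc))      /* assumed hypercall wrapper */
            return;
        if (mc.u.mc_fetch.flags & (XEN_MC_NODATA | XEN_MC_FETCHFAILED))
            return;

        x86_mcinfo_lookup(mic, &mc.u.mc_fetch.mc_info, MC_TYPE_GLOBAL);
        if (mic == NULL)
            return;
        mig = (struct mcinfo_global *)mic;
        /* mig->mc_domid, mig->mc_gstatus etc. now describe the event. */
        (void)mig;
    }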
+
+/* Usecase 4
+ * This tells the hypervisor to notify a DomU about the machine check error
+ */
+#define XEN_MC_notifydomain 2
+struct xen_mc_notifydomain {
+    /* IN variables. */
+    uint16_t mc_domid;  /* The unprivileged domain to notify. */
+    uint16_t mc_vcpuid; /* The vcpu in mc_domid to notify.
+                         * Usually echoed value from the fetch hypercall. */
+    uint32_t fetch_idx; /* echoed value from the fetch hypercall. */
+
+    /* IN/OUT variables. */
+    uint32_t flags;
+
+/* IN: XEN_MC_CORRECTABLE, XEN_MC_TRAP */
+/* OUT: XEN_MC_OK, XEN_MC_CANNOTHANDLE, XEN_MC_NOTDELIVERED, XEN_MC_NOMATCH */
+};
+typedef struct xen_mc_notifydomain xen_mc_notifydomain_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_notifydomain_t);
+
+
+struct xen_mc {
+    uint32_t cmd;
+    uint32_t interface_version; /* XEN_MCA_INTERFACE_VERSION */
+    union {
+        struct xen_mc_fetch        mc_fetch;
+        struct xen_mc_notifydomain mc_notifydomain;
+        uint8_t pad[MCINFO_HYPERCALLSIZE];
+    } u;
+};
+typedef struct xen_mc xen_mc_t;
+DEFINE_XEN_GUEST_HANDLE(xen_mc_t);
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_ARCH_X86_MCA_H__ */
diff -r 959db3c01837 xen/include/public/arch-x86/xen.h
--- a/xen/include/public/arch-x86/xen.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/public/arch-x86/xen.h Fri Jul 04 14:48:37 2008 +0200
@@ -76,6 +76,10 @@ typedef unsigned long xen_pfn_t;

 /* Maximum number of virtual CPUs in multi-processor guests. */
 #define MAX_VIRT_CPUS 32
+
+/* Machine check support */
+#include "xen-mca.h"
+
 #ifndef __ASSEMBLY__

 typedef unsigned long xen_ulong_t;
diff -r 959db3c01837 xen/include/xen/event.h
--- a/xen/include/xen/event.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/xen/event.h Fri Jul 04 14:48:37 2008 +0200
@@ -50,6 +50,9 @@ void free_xen_event_channel(
 void free_xen_event_channel(
     struct vcpu *local_vcpu, int port);

+/* Query if event channel is in use by the guest */
+int guest_enabled_event(struct vcpu *v, int virq);
+
 /* Notify remote end of a Xen-attached event channel.*/
 void notify_via_xen_event_channel(int lport);
diff -r 959db3c01837 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Fri Jul 04 14:41:35 2008 +0200
+++ b/xen/include/xen/sched.h Fri Jul 04 14:48:37 2008 +0200
@@ -112,10 +112,21 @@ struct vcpu
     bool_t           is_initialised;
     /* Currently running on a CPU? */
     bool_t           is_running;
+    /* MCE callback pending for this VCPU? */
+    bool_t           mce_pending;
     /* NMI callback pending for this VCPU? */
     bool_t           nmi_pending;
-    /* Avoid NMI reentry by allowing NMIs to be masked for short periods. */
-    bool_t           nmi_masked;
+
+    /* Higher-priority traps may interrupt lower-priority traps;
+     * lower-priority traps wait until higher-priority traps have finished.
+     * Note: This concept is known as "system priority level" (spl)
+     * in the UNIX world. */
+    uint16_t         old_trap_priority;
+    uint16_t         trap_priority;
+#define VCPU_TRAP_NONE 0
+#define VCPU_TRAP_NMI  1
+#define VCPU_TRAP_MCE  2
+
     /* Require shutdown to be deferred for some asynchronous operation? */
     bool_t           defer_shutdown;
     /* VCPU is paused following shutdown request (d->is_shutting_down)? */
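The fetch/notify pair is meant to be used back to back: Dom0 fetches, decides
which DomU is impacted, then asks Xen to notify it (usecase 4). A hypothetical
continuation of the fetch sketch above, again assuming a HYPERVISOR_mca()
wrapper:

    /* Illustrative Dom0 notify call, not part of the patch. */
    static int notify_domu_sketch(uint16_t domid, uint16_t vcpuid,
                                  uint32_t fetch_idx)
    {
        struct xen_mc mc;

        memset(&mc, 0, sizeof(mc));
        mc.cmd = XEN_MC_notifydomain;
        mc.interface_version = XEN_MCA_INTERFACE_VERSION;
        mc.u.mc_notifydomain.mc_domid = domid;
        mc.u.mc_notifydomain.mc_vcpuid = vcpuid;
        mc.u.mc_notifydomain.fetch_idx = fetch_idx;   /* echoed from fetch */
        mc.u.mc_notifydomain.flags = XEN_MC_TRAP;

        if (HYPERVISOR_mca(&mc))                      /* assumed wrapper */
            return -1;
        return (mc.u.mc_notifydomain.flags & XEN_MC_NOTDELIVERED) ? -1 : 0;
    }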