Sorry, I forgot the attachment.
--jyh
>-----Original Message-----
>From: xen-devel-bounces@xxxxxxxxxxxxxxxxxxx
>[mailto:xen-devel-bounces@xxxxxxxxxxxxxxxxxxx] On Behalf Of Jiang, Yunhong
>Sent: Friday, April 16, 2010 6:56 PM
>To: Keir Fraser; Frank.Vanderlinden@xxxxxxx; Christoph Egger
>Cc: xen-devel@xxxxxxxxxxxxxxxxxxx
>Subject: [Xen-devel] [PATCH] Clean-up on MCA MSR virtualization and vMCE
>injection
>
>Clean-up on MCA MSR virtualization and vMCE injection
>
>Move all virtual MCE related work into a separate file.
>It also tries to do some clean-up on the vMCE code, including:
>a) rename some functions, e.g. mce_init_msr/mce_rdmsr become
> vmce_init_msr/vmce_rdmsr, to make their purpose more obvious,
>b) make vmca_msrs a pointer in arch_domain,
> to decrease arch_domain's size,
>c) extract per-bank MCA MSR access into separate functions
> (bank_mce_wrmsr/bank_mce_rdmsr) to make the code a bit cleaner,
>d) add a new file, xen/include/asm-x86/mce.h, for vMCE related declarations.
>
>Signed-off-by: Jiang, Yunhong <yunhong.jiang@xxxxxxxxx>
>
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/cpu/mcheck/Makefile
>--- a/xen/arch/x86/cpu/mcheck/Makefile Thu Apr 15 19:11:16 2010 +0100
>+++ b/xen/arch/x86/cpu/mcheck/Makefile Fri Apr 16 18:55:03 2010 +0800
>@@ -7,3 +7,4 @@ obj-y += mce_intel.o
> obj-y += mce_intel.o
> obj-y += mce_amd_quirks.o
> obj-y += non-fatal.o
>+obj-y += vmce.o
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/cpu/mcheck/mce.c
>--- a/xen/arch/x86/cpu/mcheck/mce.c Thu Apr 15 19:11:16 2010 +0100
>+++ b/xen/arch/x86/cpu/mcheck/mce.c Fri Apr 16 18:55:03 2010 +0800
>@@ -31,11 +31,11 @@ unsigned int nr_mce_banks;
> unsigned int nr_mce_banks;
>
> int mce_broadcast = 0;
>-static uint64_t g_mcg_cap;
>+uint64_t g_mcg_cap;
>
> /* Real value in physical CTL MSR */
>-static uint64_t h_mcg_ctl = 0UL;
>-static uint64_t *h_mci_ctrl;
>+uint64_t h_mcg_ctl = 0UL;
>+uint64_t *h_mci_ctrl;
> int firstbank;
>
> static void intpose_init(void);
>@@ -752,234 +752,6 @@ u64 mce_cap_init(void)
> return value;
> }
>
>-/* Guest vMCE# MSRs virtualization ops (rdmsr/wrmsr) */
>-void mce_init_msr(struct domain *d)
>-{
>- d->arch.vmca_msrs.mcg_status = 0x0;
>- d->arch.vmca_msrs.mcg_cap = g_mcg_cap;
>- d->arch.vmca_msrs.mcg_ctl = ~(uint64_t)0x0;
>- d->arch.vmca_msrs.nr_injection = 0;
>- memset(d->arch.vmca_msrs.mci_ctl, ~0,
>- sizeof(d->arch.vmca_msrs.mci_ctl));
>- INIT_LIST_HEAD(&d->arch.vmca_msrs.impact_header);
>- spin_lock_init(&d->arch.vmca_msrs.lock);
>-}
>-
>-int mce_rdmsr(uint32_t msr, uint64_t *val)
>-{
>- struct domain *d = current->domain;
>- int ret = 1;
>- unsigned int bank;
>- struct bank_entry *entry = NULL;
>-
>- *val = 0;
>- spin_lock(&d->arch.vmca_msrs.lock);
>-
>- switch ( msr )
>- {
>- case MSR_IA32_MCG_STATUS:
>- *val = d->arch.vmca_msrs.mcg_status;
>- if (*val)
>- mce_printk(MCE_VERBOSE,
>- "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
>- break;
>- case MSR_IA32_MCG_CAP:
>- *val = d->arch.vmca_msrs.mcg_cap;
>- mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
>- *val);
>- break;
>- case MSR_IA32_MCG_CTL:
>- /* Always 0 if no CTL support */
>- *val = d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl;
>- mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
>- *val);
>- break;
>- case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
>- bank = (msr - MSR_IA32_MC0_CTL) / 4;
>- if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
>- {
>- mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
>- ret = 0;
>- break;
>- }
>- switch (msr & (MSR_IA32_MC0_CTL | 3))
>- {
>- case MSR_IA32_MC0_CTL:
>- *val = d->arch.vmca_msrs.mci_ctl[bank] &
>- (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
>- mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL
>0x%"PRIx64"\n",
>- bank, *val);
>- break;
>- case MSR_IA32_MC0_STATUS:
>- /* Only error bank is read. Non-error banks simply return. */
>- if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
>- {
>- entry = list_entry(d->arch.vmca_msrs.impact_header.next,
>- struct bank_entry, list);
>- if (entry->bank == bank) {
>- *val = entry->mci_status;
>- mce_printk(MCE_VERBOSE,
>- "MCE: rd MC%u_STATUS in vMCE# context "
>- "value 0x%"PRIx64"\n", bank, *val);
>- }
>- else
>- entry = NULL;
>- }
>- break;
>- case MSR_IA32_MC0_ADDR:
>- if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
>- {
>- entry = list_entry(d->arch.vmca_msrs.impact_header.next,
>- struct bank_entry, list);
>- if ( entry->bank == bank )
>- {
>- *val = entry->mci_addr;
>- mce_printk(MCE_VERBOSE,
>- "MCE: rdmsr MC%u_ADDR in vMCE# context "
>- "0x%"PRIx64"\n", bank, *val);
>- }
>- }
>- break;
>- case MSR_IA32_MC0_MISC:
>- if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
>- {
>- entry = list_entry(d->arch.vmca_msrs.impact_header.next,
>- struct bank_entry, list);
>- if ( entry->bank == bank )
>- {
>- *val = entry->mci_misc;
>- mce_printk(MCE_VERBOSE,
>- "MCE: rd MC%u_MISC in vMCE# context "
>- "0x%"PRIx64"\n", bank, *val);
>- }
>- }
>- break;
>- }
>- break;
>- default:
>- switch ( boot_cpu_data.x86_vendor )
>- {
>- case X86_VENDOR_INTEL:
>- ret = intel_mce_rdmsr(msr, val);
>- break;
>- default:
>- ret = 0;
>- break;
>- }
>- break;
>- }
>-
>- spin_unlock(&d->arch.vmca_msrs.lock);
>- return ret;
>-}
>-
>-int mce_wrmsr(u32 msr, u64 val)
>-{
>- struct domain *d = current->domain;
>- struct bank_entry *entry = NULL;
>- unsigned int bank;
>- int ret = 1;
>-
>- if ( !g_mcg_cap )
>- return 0;
>-
>- spin_lock(&d->arch.vmca_msrs.lock);
>-
>- switch ( msr )
>- {
>- case MSR_IA32_MCG_CTL:
>- d->arch.vmca_msrs.mcg_ctl = val;
>- break;
>- case MSR_IA32_MCG_STATUS:
>- d->arch.vmca_msrs.mcg_status = val;
>- mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n",
>val);
>- /* For HVM guest, this is the point for deleting vMCE injection node
>*/
>- if ( d->is_hvm && (d->arch.vmca_msrs.nr_injection > 0) )
>- {
>- d->arch.vmca_msrs.nr_injection--; /* Should be 0 */
>- if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
>- {
>- entry = list_entry(d->arch.vmca_msrs.impact_header.next,
>- struct bank_entry, list);
>- if ( entry->mci_status & MCi_STATUS_VAL )
>- mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should
>have "
>- "been cleared before write MCG_STATUS
>MSR\n");
>-
>- mce_printk(MCE_QUIET, "MCE: Delete HVM last injection "
>- "Node, nr_injection %u\n",
>- d->arch.vmca_msrs.nr_injection);
>- list_del(&entry->list);
>- xfree(entry);
>- }
>- else
>- mce_printk(MCE_QUIET, "MCE: Not found HVM guest"
>- " last injection Node, something Wrong!\n");
>- }
>- break;
>- case MSR_IA32_MCG_CAP:
>- mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n");
>- ret = -1;
>- break;
>- case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * MAX_NR_BANKS - 1:
>- bank = (msr - MSR_IA32_MC0_CTL) / 4;
>- if ( bank >= (d->arch.vmca_msrs.mcg_cap & MCG_CAP_COUNT) )
>- {
>- mce_printk(MCE_QUIET, "MCE: MSR %x is not MCA MSR\n", msr);
>- ret = 0;
>- break;
>- }
>- switch ( msr & (MSR_IA32_MC0_CTL | 3) )
>- {
>- case MSR_IA32_MC0_CTL:
>- d->arch.vmca_msrs.mci_ctl[bank] = val;
>- break;
>- case MSR_IA32_MC0_STATUS:
>- /* Give the first entry of the list, it corresponds to current
>- * vMCE# injection. When vMCE# is finished processing by the
>- * the guest, this node will be deleted.
>- * Only error bank is written. Non-error banks simply return.
>- */
>- if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
>- {
>- entry = list_entry(d->arch.vmca_msrs.impact_header.next,
>- struct bank_entry, list);
>- if ( entry->bank == bank )
>- entry->mci_status = val;
>- mce_printk(MCE_VERBOSE,
>- "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n",
>- bank, val);
>- }
>- else
>- mce_printk(MCE_VERBOSE,
>- "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val);
>- break;
>- case MSR_IA32_MC0_ADDR:
>- mce_printk(MCE_QUIET, "MCE: MC%u_ADDR is read-only\n", bank);
>- ret = -1;
>- break;
>- case MSR_IA32_MC0_MISC:
>- mce_printk(MCE_QUIET, "MCE: MC%u_MISC is read-only\n", bank);
>- ret = -1;
>- break;
>- }
>- break;
>- default:
>- switch ( boot_cpu_data.x86_vendor )
>- {
>- case X86_VENDOR_INTEL:
>- ret = intel_mce_wrmsr(msr, val);
>- break;
>- default:
>- ret = 0;
>- break;
>- }
>- break;
>- }
>-
>- spin_unlock(&d->arch.vmca_msrs.lock);
>- return ret;
>-}
>-
> static void mcinfo_clear(struct mc_info *mi)
> {
> memset(mi, 0, sizeof(struct mc_info));
>@@ -1238,11 +1010,11 @@ int mca_ctl_conflict(struct mcinfo_bank
> return 1;
>
> /* Will MCE happen in host if If host mcg_ctl is 0? */
>- if ( ~d->arch.vmca_msrs.mcg_ctl & h_mcg_ctl )
>+ if ( ~d->arch.vmca_msrs->mcg_ctl & h_mcg_ctl )
> return 1;
>
> bank_nr = bank->mc_bank;
>- if (~d->arch.vmca_msrs.mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] )
>+ if (~d->arch.vmca_msrs->mci_ctl[bank_nr] & h_mci_ctrl[bank_nr] )
> return 1;
> return 0;
> }
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/cpu/mcheck/mce.h
>--- a/xen/arch/x86/cpu/mcheck/mce.h Thu Apr 15 19:11:16 2010 +0100
>+++ b/xen/arch/x86/cpu/mcheck/mce.h Fri Apr 16 18:55:03 2010 +0800
>@@ -164,4 +164,32 @@ int x86_mcinfo_add(struct mc_info *mi, v
> int x86_mcinfo_add(struct mc_info *mi, void *mcinfo);
> void x86_mcinfo_dump(struct mc_info *mi);
>
>+int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
>+ uint64_t gstatus);
>+int inject_vmce(struct domain *d);
>+int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct
>mcinfo_global *global);
>+
>+extern uint64_t g_mcg_cap;
>+/* Real value in physical CTL MSR */
>+extern uint64_t h_mcg_ctl;
>+extern uint64_t *h_mci_ctrl;
>+
>+extern unsigned int nr_mce_banks;
>+
>+/*
>+ * Return 1 if @msr is a vendor-specific per-bank MSR, 0 otherwise.
>+ * Only the Intel range of nr_mce_banks consecutive MSRs starting at
>+ * MSR_IA32_MC0_CTL2 is recognised.
>+ */
>+static inline int mce_vendor_bank_msr(uint32_t msr)
>+{
>+    /* Inclusive lower bound: bank 0's register is part of the range. */
>+    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
>+         msr >= MSR_IA32_MC0_CTL2 &&
>+         msr < (MSR_IA32_MC0_CTL2 + nr_mce_banks) )
>+        return 1;
>+    return 0;
>+}
>+
>+/*
>+ * Return 1 if @msr is a per-bank MCA MSR, 0 otherwise: either one of the
>+ * MCi_CTL/STATUS/ADDR/MISC registers (four consecutive MSRs per bank,
>+ * starting at MSR_IA32_MC0_CTL) or a vendor-specific bank MSR.
>+ */
>+static inline int mce_bank_msr(uint32_t msr)
>+{
>+    /*
>+     * The range is [MC0_CTL, MC0_CTL + 4 * nr_mce_banks): it must start at
>+     * MC0_CTL itself and include the last bank's MISC register.
>+     */
>+    if ( (msr >= MSR_IA32_MC0_CTL &&
>+          msr < (MSR_IA32_MC0_CTL + 4 * nr_mce_banks)) ||
>+         mce_vendor_bank_msr(msr) )
>+        return 1;
>+    return 0;
>+}
> #endif /* _MCE_H */
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/cpu/mcheck/mce_intel.c
>--- a/xen/arch/x86/cpu/mcheck/mce_intel.c Thu Apr 15 19:11:16 2010 +0100
>+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Fri Apr 16 18:55:03 2010
>+0800
>@@ -11,6 +11,7 @@
> #include <asm/system.h>
> #include <asm/msr.h>
> #include <asm/p2m.h>
>+#include <asm/mce.h>
> #include "mce.h"
> #include "x86_mca.h"
>
>@@ -199,126 +200,6 @@ intel_get_extended_msrs(struct mc_info *
> return MCA_EXTINFO_GLOBAL;
> }
>
>-/* This node list records errors impacting a domain. when one
>- * MCE# happens, one error bank impacts a domain. This error node
>- * will be inserted to the tail of the per_dom data for vMCE# MSR
>- * virtualization. When one vMCE# injection is finished processing
>- * processed by guest, the corresponding node will be deleted.
>- * This node list is for GUEST vMCE# MSRS virtualization.
>- */
>-static struct bank_entry* alloc_bank_entry(void) {
>- struct bank_entry *entry;
>-
>- entry = xmalloc(struct bank_entry);
>- if (!entry) {
>- printk(KERN_ERR "MCE: malloc bank_entry failed\n");
>- return NULL;
>- }
>- memset(entry, 0x0, sizeof(entry));
>- INIT_LIST_HEAD(&entry->list);
>- return entry;
>-}
>-
>-/* Fill error bank info for #vMCE injection and GUEST vMCE#
>- * MSR virtualization data
>- * 1) Log down how many nr_injections of the impacted.
>- * 2) Copy MCE# error bank to impacted DOM node list,
>- for vMCE# MSRs virtualization
>-*/
>-
>-static int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
>- uint64_t gstatus) {
>- struct bank_entry *entry;
>-
>- /* This error bank impacts one domain, we need to fill domain related
>- * data for vMCE MSRs virtualization and vMCE# injection */
>- if (mc_bank->mc_domid != (uint16_t)~0) {
>- /* For HVM guest, Only when first vMCE is consumed by HVM guest
>successfully,
>- * will we generete another node and inject another vMCE
>- */
>- if ( (d->is_hvm) && (d->arch.vmca_msrs.nr_injection > 0) )
>- {
>- mce_printk(MCE_QUIET, "MCE: HVM guest has not handled
>previous"
>- " vMCE yet!\n");
>- return -1;
>- }
>- entry = alloc_bank_entry();
>- if (entry == NULL)
>- return -1;
>-
>- entry->mci_status = mc_bank->mc_status;
>- entry->mci_addr = mc_bank->mc_addr;
>- entry->mci_misc = mc_bank->mc_misc;
>- entry->bank = mc_bank->mc_bank;
>-
>- spin_lock(&d->arch.vmca_msrs.lock);
>- /* New error Node, insert to the tail of the per_dom data */
>- list_add_tail(&entry->list, &d->arch.vmca_msrs.impact_header);
>- /* Fill MSR global status */
>- d->arch.vmca_msrs.mcg_status = gstatus;
>- /* New node impact the domain, need another vMCE# injection*/
>- d->arch.vmca_msrs.nr_injection++;
>- spin_unlock(&d->arch.vmca_msrs.lock);
>-
>- mce_printk(MCE_VERBOSE,"MCE: Found error @[BANK%d "
>- "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
>- mc_bank->mc_bank, mc_bank->mc_status,
>mc_bank->mc_addr,
>- mc_bank->mc_domid);
>- }
>- return 0;
>-}
>-
>-static int inject_mce(struct domain *d)
>-{
>- int cpu = smp_processor_id();
>- cpumask_t affinity;
>-
>- /* PV guest and HVM guest have different vMCE# injection
>- * methods*/
>-
>- if ( !test_and_set_bool(d->vcpu[0]->mce_pending) )
>- {
>- if (d->is_hvm)
>- {
>- mce_printk(MCE_VERBOSE, "MCE: inject vMCE to HVM DOM %d\n",
>- d->domain_id);
>- vcpu_kick(d->vcpu[0]);
>- }
>- /* PV guest including DOM0 */
>- else
>- {
>- mce_printk(MCE_VERBOSE, "MCE: inject vMCE to PV DOM%d\n",
>- d->domain_id);
>- if (guest_has_trap_callback
>- (d, 0, TRAP_machine_check))
>- {
>- d->vcpu[0]->cpu_affinity_tmp =
>- d->vcpu[0]->cpu_affinity;
>- cpus_clear(affinity);
>- cpu_set(cpu, affinity);
>- mce_printk(MCE_VERBOSE, "MCE: CPU%d set affinity,
>old %d\n", cpu,
>- d->vcpu[0]->processor);
>- vcpu_set_affinity(d->vcpu[0], &affinity);
>- vcpu_kick(d->vcpu[0]);
>- }
>- else
>- {
>- mce_printk(MCE_VERBOSE, "MCE: Kill PV guest with No MCE
>handler\n");
>- domain_crash(d);
>- }
>- }
>- }
>- else {
>- /* new vMCE comes while first one has not been injected yet,
>- * in this case, inject fail. [We can't lose this vMCE for
>- * the mce node's consistency].
>- */
>- mce_printk(MCE_QUIET, "There's a pending vMCE waiting to be injected
>"
>- " to this DOM%d!\n", d->domain_id);
>- return -1;
>- }
>- return 0;
>-}
>
> static void intel_UCR_handler(struct mcinfo_bank *bank,
> struct mcinfo_global *global,
>@@ -377,7 +258,7 @@ static void intel_UCR_handler(struct mci
> return;
> }
> /* We will inject vMCE to DOMU*/
>- if ( inject_mce(d) < 0 )
>+ if ( inject_vmce(d) < 0 )
> {
> mce_printk(MCE_QUIET, "inject vMCE to
>DOM%d"
> " failed\n", d->domain_id);
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/cpu/mcheck/vmce.c
>--- /dev/null Thu Jan 01 00:00:00 1970 +0000
>+++ b/xen/arch/x86/cpu/mcheck/vmce.c Fri Apr 16 18:55:03 2010 +0800
>@@ -0,0 +1,451 @@
>+/*
>+ * vmce.c - virtual MCE support
>+ */
>+
>+#include <xen/init.h>
>+#include <xen/types.h>
>+#include <xen/irq.h>
>+#include <xen/event.h>
>+#include <xen/kernel.h>
>+#include <xen/delay.h>
>+#include <xen/smp.h>
>+#include <xen/mm.h>
>+#include <asm/processor.h>
>+#include <public/sysctl.h>
>+#include <asm/system.h>
>+#include <asm/msr.h>
>+#include <asm/p2m.h>
>+#include "mce.h"
>+#include "x86_mca.h"
>+
>+/*
>+ * Allocate and initialise the per-domain virtual MCA MSR state.
>+ *
>+ * Returns 0 on success (or if the state already exists) and -ENOMEM if an
>+ * allocation fails, in which case dom_vmce(d) is left untouched (NULL).
>+ */
>+int vmce_init_msr(struct domain *d)
>+{
>+    struct domain_mca_msrs *vmce;
>+
>+    if ( dom_vmce(d) )
>+    {
>+        dprintk(XENLOG_G_WARNING, "Domain %d has inited vMCE\n", d->domain_id);
>+        return 0;
>+    }
>+
>+    /* Allocate the vmca_msrs and mci_ctl together */
>+    vmce = xmalloc(struct domain_mca_msrs);
>+    if ( !vmce )
>+        return -ENOMEM;
>+
>+    vmce->mci_ctl = xmalloc_array(uint64_t, nr_mce_banks);
>+    if ( !vmce->mci_ctl )
>+    {
>+        /* Nothing has been stored in the domain yet, so no stale pointer. */
>+        xfree(vmce);
>+        return -ENOMEM;
>+    }
>+    /* mci_ctl is a pointer: size the fill by element count, not sizeof(ptr). */
>+    memset(vmce->mci_ctl, ~0, nr_mce_banks * sizeof(*vmce->mci_ctl));
>+
>+    vmce->mcg_status = 0x0;
>+    vmce->mcg_cap = g_mcg_cap;
>+    vmce->mcg_ctl = ~(uint64_t)0x0;
>+    vmce->nr_injection = 0;
>+
>+    INIT_LIST_HEAD(&vmce->impact_header);
>+    spin_lock_init(&vmce->lock);
>+
>+    /* Store the pointer only once the state is fully initialised. */
>+    dom_vmce(d) = vmce;
>+
>+    return 0;
>+}
>+
>+/*
>+ * Return the entry at the head of @vmce's impact list if it describes
>+ * @bank, or NULL if the list is empty or the head belongs to another bank.
>+ */
>+static struct bank_entry *vmce_head_entry(struct domain_mca_msrs *vmce,
>+                                          unsigned int bank)
>+{
>+    struct bank_entry *entry;
>+
>+    if ( list_empty(&vmce->impact_header) )
>+        return NULL;
>+
>+    entry = list_entry(vmce->impact_header.next, struct bank_entry, list);
>+    return (entry->bank == bank) ? entry : NULL;
>+}
>+
>+/*
>+ * Read a per-bank MCA MSR on behalf of domain @d.
>+ * Caller should make sure msr is bank msr.
>+ *
>+ * Returns -EINVAL if @d is NULL, -1 if the MSR decodes to a bank number
>+ * >= nr_mce_banks, the vendor handler's result for vendor-specific bank
>+ * MSRs, and 1 otherwise.  *val is only written when a value is available;
>+ * STATUS/ADDR/MISC are only supplied for the bank at the head of the
>+ * impact list.
>+ */
>+static int bank_mce_rdmsr(struct domain *d, uint32_t msr, uint64_t *val)
>+{
>+    unsigned int bank;
>+    int ret = 1;
>+    struct domain_mca_msrs *vmce;
>+    struct bank_entry *entry;
>+
>+    if ( !d )
>+        return -EINVAL;
>+    vmce = dom_vmce(d);
>+    ASSERT(vmce);
>+
>+    /*
>+     * Vendor MSRs are not laid out four-per-bank from MC0_CTL, so they must
>+     * be dispatched before the bank number is decoded; otherwise the range
>+     * check below rejects every one of them.
>+     */
>+    if ( mce_vendor_bank_msr(msr) )
>+    {
>+        switch ( boot_cpu_data.x86_vendor )
>+        {
>+        case X86_VENDOR_INTEL:
>+            return intel_mce_rdmsr(msr, val);
>+        default:
>+            return 0;
>+        }
>+    }
>+
>+    bank = (msr - MSR_IA32_MC0_CTL) / 4;
>+    if ( bank >= nr_mce_banks )
>+        return -1;
>+
>+    switch ( msr & (MSR_IA32_MC0_CTL | 3) )
>+    {
>+    case MSR_IA32_MC0_CTL:
>+        *val = vmce->mci_ctl[bank] &
>+            (h_mci_ctrl ? h_mci_ctrl[bank] : ~0UL);
>+        mce_printk(MCE_VERBOSE, "MCE: rdmsr MC%u_CTL 0x%"PRIx64"\n",
>+                   bank, *val);
>+        break;
>+    case MSR_IA32_MC0_STATUS:
>+        /* Only error bank is read. Non-error banks simply return. */
>+        entry = vmce_head_entry(vmce, bank);
>+        if ( entry )
>+        {
>+            *val = entry->mci_status;
>+            mce_printk(MCE_VERBOSE,
>+                       "MCE: rd MC%u_STATUS in vMCE# context "
>+                       "value 0x%"PRIx64"\n", bank, *val);
>+        }
>+        break;
>+    case MSR_IA32_MC0_ADDR:
>+        entry = vmce_head_entry(vmce, bank);
>+        if ( entry )
>+        {
>+            *val = entry->mci_addr;
>+            mce_printk(MCE_VERBOSE,
>+                       "MCE: rdmsr MC%u_ADDR in vMCE# context "
>+                       "0x%"PRIx64"\n", bank, *val);
>+        }
>+        break;
>+    case MSR_IA32_MC0_MISC:
>+        entry = vmce_head_entry(vmce, bank);
>+        if ( entry )
>+        {
>+            *val = entry->mci_misc;
>+            mce_printk(MCE_VERBOSE,
>+                       "MCE: rd MC%u_MISC in vMCE# context "
>+                       "0x%"PRIx64"\n", bank, *val);
>+        }
>+        break;
>+    default:
>+        /* Not reachable for an in-range bank MSR; report "not handled". */
>+        ret = 0;
>+        break;
>+    }
>+
>+    return ret;
>+}
>+
>+/*
>+ * Emulate a guest read of an MCA MSR for the current domain.
>+ *
>+ * Return value:
>+ * < 0: Unsupported and will #GP fault to guest
>+ * = 0: Not handled, should be handled by other components
>+ * > 0: Success
>+ *
>+ * *val is zeroed first, so it reads as 0 unless a handler stores a value.
>+ */
>+int vmce_rdmsr(uint32_t msr, uint64_t *val)
>+{
>+    struct domain *currd = current->domain;
>+    struct domain_mca_msrs *state = dom_vmce(currd);
>+    int rc = 1;
>+
>+    *val = 0;
>+
>+    /* XXX more handle here */
>+    if ( state == NULL )
>+        return 0;
>+
>+    spin_lock(&state->lock);
>+
>+    if ( msr == MSR_IA32_MCG_STATUS )
>+    {
>+        *val = state->mcg_status;
>+        if ( *val != 0 )
>+            mce_printk(MCE_VERBOSE,
>+                       "MCE: rdmsr MCG_STATUS 0x%"PRIx64"\n", *val);
>+    }
>+    else if ( msr == MSR_IA32_MCG_CAP )
>+    {
>+        *val = state->mcg_cap;
>+        mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CAP 0x%"PRIx64"\n",
>+                   *val);
>+    }
>+    else if ( msr == MSR_IA32_MCG_CTL )
>+    {
>+        /* Always 0 if no CTL support */
>+        *val = state->mcg_ctl & h_mcg_ctl;
>+        mce_printk(MCE_VERBOSE, "MCE: rdmsr MCG_CTL 0x%"PRIx64"\n",
>+                   *val);
>+    }
>+    else if ( mce_bank_msr(msr) )
>+        rc = bank_mce_rdmsr(currd, msr, val);
>+    else
>+        rc = 0;
>+
>+    spin_unlock(&state->lock);
>+    return rc;
>+}
>+
>+/*
>+ * Write a per-bank MCA MSR on behalf of domain @d.
>+ * Caller should make sure msr is bank msr; the visible caller (vmce_wrmsr)
>+ * invokes this with the domain's vMCE lock held.
>+ *
>+ * Returns -EINVAL if @d is NULL or the MSR decodes to a bank number
>+ * >= nr_mce_banks, -1 for the read-only ADDR/MISC registers, the vendor
>+ * handler's result for vendor-specific bank MSRs, and 1 otherwise.
>+ *
>+ * NOTE(review): no prototype for this function is visible in this patch;
>+ * consider making it static like bank_mce_rdmsr.
>+ */
>+int bank_mce_wrmsr(struct domain *d, u32 msr, u64 val)
>+{
>+    unsigned int bank;
>+    int ret = 1;
>+    struct domain_mca_msrs *vmce;
>+    struct bank_entry *entry;
>+
>+    if ( !d )
>+        return -EINVAL;
>+    vmce = dom_vmce(d);
>+    ASSERT(vmce && vmce->mci_ctl);
>+
>+    /*
>+     * Vendor MSRs are not laid out four-per-bank from MC0_CTL, so they must
>+     * be dispatched before the bank number is decoded; otherwise the range
>+     * check below rejects every one of them.
>+     */
>+    if ( mce_vendor_bank_msr(msr) )
>+    {
>+        switch ( boot_cpu_data.x86_vendor )
>+        {
>+        case X86_VENDOR_INTEL:
>+            return intel_mce_wrmsr(msr, val);
>+        default:
>+            return 0;
>+        }
>+    }
>+
>+    bank = (msr - MSR_IA32_MC0_CTL) / 4;
>+    if ( bank >= nr_mce_banks )
>+        return -EINVAL;
>+
>+    switch ( msr & (MSR_IA32_MC0_CTL | 3) )
>+    {
>+    case MSR_IA32_MC0_CTL:
>+        vmce->mci_ctl[bank] = val;
>+        break;
>+    case MSR_IA32_MC0_STATUS:
>+        /* Give the first entry of the list, it corresponds to current
>+         * vMCE# injection. When vMCE# is finished processing by the
>+         * the guest, this node will be deleted.
>+         * Only error bank is written. Non-error banks simply return.
>+         */
>+        if ( !list_empty(&vmce->impact_header) )
>+        {
>+            entry = list_entry(vmce->impact_header.next,
>+                               struct bank_entry, list);
>+            if ( entry->bank == bank )
>+                entry->mci_status = val;
>+            mce_printk(MCE_VERBOSE,
>+                       "MCE: wr MC%u_STATUS %"PRIx64" in vMCE#\n",
>+                       bank, val);
>+        }
>+        else
>+            mce_printk(MCE_VERBOSE,
>+                       "MCE: wr MC%u_STATUS %"PRIx64"\n", bank, val);
>+        break;
>+    case MSR_IA32_MC0_ADDR:
>+        mce_printk(MCE_QUIET, "MCE: MC%u_ADDR is read-only\n", bank);
>+        ret = -1;
>+        break;
>+    case MSR_IA32_MC0_MISC:
>+        mce_printk(MCE_QUIET, "MCE: MC%u_MISC is read-only\n", bank);
>+        ret = -1;
>+        break;
>+    default:
>+        /* Not reachable for an in-range bank MSR; report "not handled". */
>+        ret = 0;
>+        break;
>+    }
>+
>+    return ret;
>+}
>+
>+/*
>+ * Emulate a guest write of an MCA MSR for the current domain.
>+ *
>+ * Return value:
>+ * < 0: Unsupported and will #GP fault to guest
>+ * = 0: Not handled, should be handled by other components
>+ * > 0: Success
>+ *
>+ * Returns 0 without touching any state when the host reports no MCA
>+ * capability (g_mcg_cap == 0) or the domain has no vMCE state allocated.
>+ */
>+int vmce_wrmsr(u32 msr, u64 val)
>+{
>+    struct domain *d = current->domain;
>+    struct bank_entry *entry = NULL;
>+    struct domain_mca_msrs *vmce;
>+    int ret = 1;
>+
>+    if ( !g_mcg_cap )
>+        return 0;
>+
>+    /*
>+     * vmce_init_msr() can fail, leaving the pointer NULL; mirror the check
>+     * in vmce_rdmsr() instead of dereferencing it for the lock.
>+     */
>+    vmce = dom_vmce(d);
>+    if ( !vmce )
>+        return 0;
>+
>+    spin_lock(&vmce->lock);
>+
>+    switch ( msr )
>+    {
>+    case MSR_IA32_MCG_CTL:
>+        vmce->mcg_ctl = val;
>+        break;
>+    case MSR_IA32_MCG_STATUS:
>+        vmce->mcg_status = val;
>+        mce_printk(MCE_VERBOSE, "MCE: wrmsr MCG_STATUS %"PRIx64"\n", val);
>+        /* For HVM guest, this is the point for deleting vMCE injection node */
>+        if ( d->is_hvm && (vmce->nr_injection > 0) )
>+        {
>+            vmce->nr_injection--; /* Should be 0 */
>+            if ( !list_empty(&vmce->impact_header) )
>+            {
>+                entry = list_entry(vmce->impact_header.next,
>+                                   struct bank_entry, list);
>+                if ( entry->mci_status & MCi_STATUS_VAL )
>+                    mce_printk(MCE_QUIET, "MCE: MCi_STATUS MSR should have "
>+                               "been cleared before write MCG_STATUS MSR\n");
>+
>+                mce_printk(MCE_QUIET, "MCE: Delete HVM last injection "
>+                           "Node, nr_injection %u\n",
>+                           vmce->nr_injection);
>+                list_del(&entry->list);
>+                xfree(entry);
>+            }
>+            else
>+                mce_printk(MCE_QUIET, "MCE: Not found HVM guest"
>+                           " last injection Node, something Wrong!\n");
>+        }
>+        break;
>+    case MSR_IA32_MCG_CAP:
>+        mce_printk(MCE_QUIET, "MCE: MCG_CAP is read-only\n");
>+        ret = -1;
>+        break;
>+    default:
>+        if ( mce_bank_msr(msr) )
>+            ret = bank_mce_wrmsr(d, msr, val);
>+        else
>+            ret = 0;
>+        break;
>+    }
>+
>+    spin_unlock(&vmce->lock);
>+    return ret;
>+}
>+
>+/*
>+ * Mark a vMCE as pending on vcpu 0 of domain @d and kick that vcpu.
>+ *
>+ * Returns -1 if a vMCE was already pending on vcpu 0, 0 otherwise
>+ * (including the case where a PV guest without a machine-check callback
>+ * is crashed).
>+ */
>+int inject_vmce(struct domain *d)
>+{
>+    int this_cpu = smp_processor_id();
>+    cpumask_t mask;
>+    struct vcpu *v = d->vcpu[0];
>+
>+    if ( test_and_set_bool(v->mce_pending) )
>+    {
>+        /* new vMCE comes while first one has not been injected yet,
>+         * in this case, inject fail. [We can't lose this vMCE for
>+         * the mce node's consistency].
>+         */
>+        mce_printk(MCE_QUIET, "There's a pending vMCE waiting to be injected "
>+                   " to this DOM%d!\n", d->domain_id);
>+        return -1;
>+    }
>+
>+    /* PV guest and HVM guest have different vMCE# injection methods */
>+    if ( d->is_hvm )
>+    {
>+        mce_printk(MCE_VERBOSE, "MCE: inject vMCE to HVM DOM %d\n",
>+                   d->domain_id);
>+        vcpu_kick(v);
>+        return 0;
>+    }
>+
>+    /* PV guest including DOM0 */
>+    mce_printk(MCE_VERBOSE, "MCE: inject vMCE to PV DOM%d\n",
>+               d->domain_id);
>+
>+    if ( !guest_has_trap_callback(d, 0, TRAP_machine_check) )
>+    {
>+        mce_printk(MCE_VERBOSE, "MCE: Kill PV guest with No MCE handler\n");
>+        domain_crash(d);
>+        return 0;
>+    }
>+
>+    /* Save the current affinity, then restrict vcpu 0 to this CPU. */
>+    v->cpu_affinity_tmp = v->cpu_affinity;
>+    cpus_clear(mask);
>+    cpu_set(this_cpu, mask);
>+    mce_printk(MCE_VERBOSE, "MCE: CPU%d set affinity, old %d\n", this_cpu,
>+               v->processor);
>+    vcpu_set_affinity(v, &mask);
>+    vcpu_kick(v);
>+
>+    return 0;
>+}
>+
>+/* This node list records errors impacting a domain. When one
>+ * MCE# happens, one error bank impacts a domain. This error node
>+ * will be inserted to the tail of the per_dom data for vMCE# MSR
>+ * virtualization. When one vMCE# injection has been processed
>+ * by the guest, the corresponding node will be deleted.
>+ * This node list is for GUEST vMCE# MSRS virtualization.
>+ *
>+ * Returns a zeroed entry with an initialised list head, or NULL if the
>+ * allocation fails.
>+ */
>+static struct bank_entry* alloc_bank_entry(void) {
>+    struct bank_entry *entry;
>+
>+    entry = xmalloc(struct bank_entry);
>+    if (!entry) {
>+        printk(KERN_ERR "MCE: malloc bank_entry failed\n");
>+        return NULL;
>+    }
>+    /* Clear the whole structure: sizeof(entry) would only be pointer-sized. */
>+    memset(entry, 0x0, sizeof(*entry));
>+    INIT_LIST_HEAD(&entry->list);
>+    return entry;
>+}
>+
>+/* Fill error bank info for #vMCE injection and GUEST vMCE#
>+ * MSR virtualization data
>+ * 1) Log down how many nr_injections of the impacted.
>+ * 2) Copy MCE# error bank to impacted DOM node list,
>+ *    for vMCE# MSRs virtualization
>+ *
>+ * Returns 0 on success or when the bank has no owning domain
>+ * (mc_domid is all ones), -1 if the domain has no vMCE state, if an HVM
>+ * guest still has an unconsumed injection, or if allocation fails.
>+ */
>+
>+int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
>+                   uint64_t gstatus) {
>+    struct bank_entry *entry;
>+    struct domain_mca_msrs *vmce;
>+
>+    /* This error bank impacts one domain, we need to fill domain related
>+     * data for vMCE MSRs virtualization and vMCE# injection */
>+    if (mc_bank->mc_domid != (uint16_t)~0) {
>+        /* vmca_msrs is a pointer and may never have been allocated. */
>+        vmce = dom_vmce(d);
>+        if ( !vmce )
>+            return -1;
>+
>+        /* For HVM guest, Only when first vMCE is consumed by HVM guest
>+         * successfully, will we generate another node and inject another vMCE
>+         */
>+        if ( (d->is_hvm) && (vmce->nr_injection > 0) )
>+        {
>+            mce_printk(MCE_QUIET, "MCE: HVM guest has not handled previous"
>+                       " vMCE yet!\n");
>+            return -1;
>+        }
>+        entry = alloc_bank_entry();
>+        if (entry == NULL)
>+            return -1;
>+
>+        entry->mci_status = mc_bank->mc_status;
>+        entry->mci_addr = mc_bank->mc_addr;
>+        entry->mci_misc = mc_bank->mc_misc;
>+        entry->bank = mc_bank->mc_bank;
>+
>+        spin_lock(&vmce->lock);
>+        /* New error Node, insert to the tail of the per_dom data */
>+        list_add_tail(&entry->list, &vmce->impact_header);
>+        /* Fill MSR global status */
>+        vmce->mcg_status = gstatus;
>+        /* New node impact the domain, need another vMCE# injection*/
>+        vmce->nr_injection++;
>+        spin_unlock(&vmce->lock);
>+
>+        mce_printk(MCE_VERBOSE,"MCE: Found error @[BANK%d "
>+                   "status %"PRIx64" addr %"PRIx64" domid %d]\n ",
>+                   mc_bank->mc_bank, mc_bank->mc_status, mc_bank->mc_addr,
>+                   mc_bank->mc_domid);
>+    }
>+    return 0;
>+}
>+
>+/*
>+ * Record @bank's error data for domain @d and, if that succeeds, inject a
>+ * vMCE.  Returns the negative result of fill_vmsr_data() on failure,
>+ * otherwise whatever inject_vmce() returns.
>+ */
>+int vmce_domain_inject(struct mcinfo_bank *bank, struct domain *d, struct mcinfo_global *global)
>+{
>+    int rc = fill_vmsr_data(bank, d, global->mc_gstatus);
>+
>+    return (rc < 0) ? rc : inject_vmce(d);
>+}
>+
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/domain.c
>--- a/xen/arch/x86/domain.c Thu Apr 15 19:11:16 2010 +0100
>+++ b/xen/arch/x86/domain.c Fri Apr 16 18:55:03 2010 +0800
>@@ -49,6 +49,7 @@
> #include <asm/msr.h>
> #include <asm/traps.h>
> #include <asm/nmi.h>
>+#include <asm/mce.h>
> #include <xen/numa.h>
> #include <xen/iommu.h>
> #ifdef CONFIG_COMPAT
>@@ -501,7 +502,7 @@ int arch_domain_create(struct domain *d,
> goto fail;
>
> /* For Guest vMCE MSRs virtualization */
>- mce_init_msr(d);
>+ vmce_init_msr(d);
> }
>
> if ( is_hvm_domain(d) )
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/hvm/hvm.c
>--- a/xen/arch/x86/hvm/hvm.c Thu Apr 15 19:11:16 2010 +0100
>+++ b/xen/arch/x86/hvm/hvm.c Fri Apr 16 18:55:03 2010 +0800
>@@ -47,6 +47,7 @@
> #include <asm/traps.h>
> #include <asm/mc146818rtc.h>
> #include <asm/spinlock.h>
>+#include <asm/mce.h>
> #include <asm/hvm/hvm.h>
> #include <asm/hvm/vpt.h>
> #include <asm/hvm/support.h>
>@@ -2061,7 +2062,7 @@ int hvm_msr_read_intercept(struct cpu_us
> break;
>
> default:
>- ret = mce_rdmsr(ecx, &msr_content);
>+ ret = vmce_rdmsr(ecx, &msr_content);
> if ( ret < 0 )
> goto gp_fault;
> else if ( ret )
>@@ -2160,7 +2161,7 @@ int hvm_msr_write_intercept(struct cpu_u
> break;
>
> default:
>- ret = mce_wrmsr(ecx, msr_content);
>+ ret = vmce_wrmsr(ecx, msr_content);
> if ( ret < 0 )
> goto gp_fault;
> else if ( ret )
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/arch/x86/traps.c
>--- a/xen/arch/x86/traps.c Thu Apr 15 19:11:16 2010 +0100
>+++ b/xen/arch/x86/traps.c Fri Apr 16 18:55:03 2010 +0800
>@@ -65,6 +65,7 @@
> #include <asm/traps.h>
> #include <asm/hvm/vpt.h>
> #include <asm/hypercall.h>
>+#include <asm/mce.h>
> #include <public/arch-x86/cpuid.h>
>
> /*
>@@ -2295,7 +2296,7 @@ static int emulate_privileged_op(struct
> if ( wrmsr_hypervisor_regs(regs->ecx, val) )
> break;
>
>- rc = mce_wrmsr(regs->ecx, val);
>+ rc = vmce_wrmsr(regs->ecx, val);
> if ( rc < 0 )
> goto fail;
> if ( rc )
>@@ -2388,7 +2389,7 @@ static int emulate_privileged_op(struct
> break;
> }
>
>- rc = mce_rdmsr(regs->ecx, &val);
>+ rc = vmce_rdmsr(regs->ecx, &val);
> if ( rc < 0 )
> goto fail;
> if ( rc )
>@@ -2947,19 +2948,19 @@ void async_exception_cleanup(struct vcpu
> {
> struct domain *d = curr->domain;
>
>- if ( !d->arch.vmca_msrs.nr_injection )
>+ if ( !d->arch.vmca_msrs->nr_injection )
> {
> printk(XENLOG_WARNING "MCE: ret from vMCE#, "
> "no injection node\n");
> goto end;
> }
>
>- d->arch.vmca_msrs.nr_injection--;
>- if ( !list_empty(&d->arch.vmca_msrs.impact_header) )
>+ d->arch.vmca_msrs->nr_injection--;
>+ if ( !list_empty(&d->arch.vmca_msrs->impact_header) )
> {
> struct bank_entry *entry;
>
>- entry = list_entry(d->arch.vmca_msrs.impact_header.next,
>+ entry = list_entry(d->arch.vmca_msrs->impact_header.next,
> struct bank_entry, list);
> gdprintk(XENLOG_DEBUG, "MCE: delete last injection
>node\n");
> list_del(&entry->list);
>@@ -2968,7 +2969,7 @@ void async_exception_cleanup(struct vcpu
> printk(XENLOG_ERR "MCE: didn't found last injection node\n");
>
> /* further injection */
>- if ( d->arch.vmca_msrs.nr_injection > 0 &&
>+ if ( d->arch.vmca_msrs->nr_injection > 0 &&
> guest_has_trap_callback(d, 0, TRAP_machine_check) &&
> !test_and_set_bool(curr->mce_pending) )
> {
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/common/domain.c
>--- a/xen/common/domain.c Thu Apr 15 19:11:16 2010 +0100
>+++ b/xen/common/domain.c Fri Apr 16 18:55:03 2010 +0800
>@@ -616,6 +616,8 @@ static void complete_domain_destroy(stru
>
> xfree(d->pirq_mask);
> xfree(d->pirq_to_evtchn);
>+ xfree(dom_vmce(d)->mci_ctl);
>+ xfree(dom_vmce(d));
>
> xsm_free_security_domain(d);
> free_domain_struct(d);
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/include/asm-x86/domain.h
>--- a/xen/include/asm-x86/domain.h Thu Apr 15 19:11:16 2010 +0100
>+++ b/xen/include/asm-x86/domain.h Fri Apr 16 18:55:03 2010 +0800
>@@ -6,6 +6,7 @@
> #include <asm/hvm/vcpu.h>
> #include <asm/hvm/domain.h>
> #include <asm/e820.h>
>+#include <asm/mce.h>
> #include <public/vcpu.h>
>
> #define has_32bit_shinfo(d) ((d)->arch.has_32bit_shinfo)
>@@ -214,32 +215,6 @@ typedef xen_domctl_cpuid_t cpuid_input_t
> typedef xen_domctl_cpuid_t cpuid_input_t;
>
> struct p2m_domain;
>-
>-/* Define for GUEST MCA handling */
>-#define MAX_NR_BANKS 30
>-
>-/* This entry is for recording bank nodes for the impacted domain,
>- * put into impact_header list. */
>-struct bank_entry {
>- struct list_head list;
>- uint16_t bank;
>- uint64_t mci_status;
>- uint64_t mci_addr;
>- uint64_t mci_misc;
>-};
>-
>-struct domain_mca_msrs
>-{
>- /* Guest should not change below values after DOM boot up */
>- uint64_t mcg_cap;
>- uint64_t mcg_ctl;
>- uint64_t mcg_status;
>- uint64_t mci_ctl[MAX_NR_BANKS];
>- uint16_t nr_injection;
>- struct list_head impact_header;
>- spinlock_t lock;
>-};
>-
> struct time_scale {
> int shift;
> u32 mul_frac;
>@@ -311,7 +286,7 @@ struct arch_domain
> cpuid_input_t cpuids[MAX_CPUID_INPUT];
>
> /* For Guest vMCA handling */
>- struct domain_mca_msrs vmca_msrs;
>+ struct domain_mca_msrs *vmca_msrs;
>
> /* TSC management (emulation, pv, scaling, stats) */
> int tsc_mode; /* see include/asm-x86/time.h */
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/include/asm-x86/mce.h
>--- /dev/null Thu Jan 01 00:00:00 1970 +0000
>+++ b/xen/include/asm-x86/mce.h Fri Apr 16 18:55:03 2010 +0800
>@@ -0,0 +1,36 @@
>+#include <xen/types.h>
>+#include <public/arch-x86/xen-mca.h>
>+#ifndef _XEN_X86_MCE_H
>+#define _XEN_X86_MCE_H
>+/* Define for GUEST MCA handling */
>+#define MAX_NR_BANKS 30
>+
>+/* This entry is for recording bank nodes for the impacted domain,
>+ * put into impact_header list. One entry holds a copy of a single
>+ * error bank's number and its status/addr/misc values. */
>+struct bank_entry {
>+ /* Linkage on domain_mca_msrs.impact_header */
>+ struct list_head list;
>+ /* Number of the bank this entry describes */
>+ uint16_t bank;
>+ /* Values returned for guest reads of this bank's STATUS/ADDR/MISC */
>+ uint64_t mci_status;
>+ uint64_t mci_addr;
>+ uint64_t mci_misc;
>+};
>+
>+/* Per-domain virtual MCA MSR state, reached through dom_vmce(d). */
>+struct domain_mca_msrs
>+{
>+ /* Guest should not change below values after DOM boot up */
>+ uint64_t mcg_cap;
>+ uint64_t mcg_ctl;
>+ uint64_t mcg_status;
>+ /* Per-bank CTL values; array of nr_mce_banks entries allocated
>+ * by vmce_init_msr() */
>+ uint64_t *mci_ctl;
>+ /* Count of recorded injections not yet consumed by the guest */
>+ uint16_t nr_injection;
>+ /* List of struct bank_entry; the head is the one guest reads see */
>+ struct list_head impact_header;
>+ /* Taken around the MSR read/write emulation and list updates */
>+ spinlock_t lock;
>+};
>+
>+#define dom_vmce(x) ((x)->arch.vmca_msrs)
>+
>+/* Guest vMCE MSRs virtualization */
>+extern int vmce_init_msr(struct domain *d);
>+extern int vmce_wrmsr(uint32_t msr, uint64_t val);
>+extern int vmce_rdmsr(uint32_t msr, uint64_t *val);
>+#endif
>diff -r 7ee8bb40200a -r b4fd50c22d9c xen/include/asm-x86/traps.h
>--- a/xen/include/asm-x86/traps.h Thu Apr 15 19:11:16 2010 +0100
>+++ b/xen/include/asm-x86/traps.h Fri Apr 16 18:55:03 2010 +0800
>@@ -49,9 +49,4 @@ extern int send_guest_trap(struct domain
> extern int send_guest_trap(struct domain *d, uint16_t vcpuid,
> unsigned int trap_nr);
>
>-/* Guest vMCE MSRs virtualization */
>-extern void mce_init_msr(struct domain *d);
>-extern int mce_wrmsr(uint32_t msr, uint64_t val);
>-extern int mce_rdmsr(uint32_t msr, uint64_t *val);
>-
> #endif /* ASM_TRAP_H */
>
>
>
>_______________________________________________
>Xen-devel mailing list
>Xen-devel@xxxxxxxxxxxxxxxxxxx
>http://lists.xensource.com/xen-devel
vmce_seperate_file.patch
Description: vmce_seperate_file.patch
_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
|