# HG changeset patch
# User Keir Fraser <keir@xxxxxxxxxxxxx>
# Date 1193059162 -3600
# Node ID 3e7c86602c70d2d587aafbef957d644f48cd6da9
# Parent 42d8dadb5864eac0140262b9475a7b1ed150b607
hvm/x86: MTRR/PAT virtualisation.
Signed-off-by: Disheng Su <disheng.su@xxxxxxxxx>
---
xen/arch/x86/cpu/common.c | 9
xen/arch/x86/cpu/mtrr/generic.c | 16
xen/arch/x86/cpu/mtrr/main.c | 7
xen/arch/x86/cpu/mtrr/mtrr.h | 11
xen/arch/x86/hvm/Makefile | 1
xen/arch/x86/hvm/hvm.c | 88 ++++
xen/arch/x86/hvm/mtrr.c | 687 ++++++++++++++++++++++++++++++++++++++
xen/arch/x86/hvm/vmx/vmcs.c | 17
xen/arch/x86/hvm/vmx/vmx.c | 70 +++
xen/arch/x86/mm.c | 9
xen/arch/x86/mm/shadow/common.c | 8
xen/arch/x86/mm/shadow/multi.c | 28 +
xen/include/asm-x86/cpufeature.h | 2
xen/include/asm-x86/hvm/domain.h | 6
xen/include/asm-x86/hvm/support.h | 1
xen/include/asm-x86/hvm/vcpu.h | 7
xen/include/asm-x86/msr-index.h | 19 +
xen/include/asm-x86/mtrr.h | 49 ++
18 files changed, 999 insertions(+), 36 deletions(-)
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/cpu/common.c
--- a/xen/arch/x86/cpu/common.c Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/cpu/common.c Mon Oct 22 14:19:22 2007 +0100
@@ -22,6 +22,12 @@ static int disable_x86_serial_nr __devin
static int disable_x86_serial_nr __devinitdata = 0;
struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
+
+/*
+ * Default host IA32_CR_PAT value to cover all memory types.
+ * BIOS usually sets it to 0x07040600070406.
+ */
+u64 host_pat = 0x050100070406;
static void default_init(struct cpuinfo_x86 * c)
{
@@ -557,6 +563,9 @@ void __devinit cpu_init(void)
}
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+ if (cpu_has_pat)
+ wrmsrl(MSR_IA32_CR_PAT, host_pat);
+
*(unsigned short *)(&gdt_load[0]) = LAST_RESERVED_GDT_BYTE;
*(unsigned long *)(&gdt_load[2]) = GDT_VIRT_START(current);
asm volatile ( "lgdt %0" : "=m" (gdt_load) );
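
The default host_pat above is chosen so that every architectural memory type appears in some PAT entry. A minimal stand-alone sketch (plain C, not part of the patch) that decodes the eight entries; byte n of the MSR holds PAn:

#include <stdint.h>
#include <stdio.h>

static const char *pat_name(uint8_t t)
{
    switch ( t )
    {
    case 0: return "UC";
    case 1: return "WC";
    case 4: return "WT";
    case 5: return "WP";
    case 6: return "WB";
    case 7: return "UC-";
    default: return "reserved";
    }
}

int main(void)
{
    uint64_t host_pat = 0x050100070406ULL;   /* default picked by this patch */
    int i;

    /* Prints WB, WT, UC-, UC, WC, WP, UC, UC: all six valid types appear. */
    for ( i = 0; i < 8; i++ )
        printf("PA%d = %s\n", i, pat_name((host_pat >> (i * 8)) & 0xff));
    return 0;
}
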
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/cpu/mtrr/generic.c
--- a/xen/arch/x86/cpu/mtrr/generic.c Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/cpu/mtrr/generic.c Mon Oct 22 14:19:22 2007 +0100
@@ -11,14 +11,6 @@
#include <asm/cpufeature.h>
#include "mtrr.h"
-struct mtrr_state {
- struct mtrr_var_range *var_ranges;
- mtrr_type fixed_ranges[NUM_FIXED_RANGES];
- unsigned char enabled;
- unsigned char have_fixed;
- mtrr_type def_type;
-};
-
struct fixed_range_block {
int base_msr; /* start address of an MTRR block */
int ranges; /* number of MTRRs in this block */
@@ -32,7 +24,7 @@ static struct fixed_range_block fixed_ra
};
static unsigned long smp_changes_mask;
-static struct mtrr_state mtrr_state = {};
+struct mtrr_state mtrr_state = {};
/* Get the MSR pair relating to a var range */
static void
@@ -88,6 +80,9 @@ void __init get_mtrr_state(void)
rdmsr(MTRRdefType_MSR, lo, dummy);
mtrr_state.def_type = (lo & 0xff);
mtrr_state.enabled = (lo & 0xc00) >> 10;
+
+ /* Store mtrr_cap for HVM MTRR virtualisation. */
+ rdmsrl(MTRRcap_MSR, mtrr_state.mtrr_cap);
}
/* Some BIOS's are fucked and don't set all MTRRs the same! */
@@ -107,6 +102,7 @@ void __init mtrr_state_warn(void)
printk(KERN_INFO "mtrr: corrected configuration.\n");
}
+extern bool_t is_var_mtrr_overlapped(struct mtrr_state *m);
/* Doesn't attempt to pass an error out to MTRR users
because it's quite complicated in some cases and probably not
worth it because the best error handling is to ignore it. */
@@ -116,6 +112,8 @@ void mtrr_wrmsr(unsigned msr, unsigned a
printk(KERN_ERR
"MTRR: CPU %u: Writing MSR %x to %x:%x failed\n",
smp_processor_id(), msr, a, b);
+ /* Cache overlap status for efficient HVM MTRR virtualisation. */
+ mtrr_state.overlapped = is_var_mtrr_overlapped(&mtrr_state);
}
/**
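
mtrr_state.mtrr_cap is stored verbatim so that later code (e.g. is_var_mtrr_overlapped()) can take the variable-range count straight from the low byte. A minimal sketch of the MTRRcap field layout this relies on, using the value 0x508 that reset_mtrr() later exposes to guests (real hardware may of course report a different VCNT):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t mtrr_cap = 0x508;                      /* same value reset_mtrr() uses */
    unsigned int vcnt = mtrr_cap & 0xff;            /* number of variable ranges */
    int          fix  = !!(mtrr_cap & (1 << 8));    /* fixed-range MTRRs supported */
    int          wc   = !!(mtrr_cap & (1 << 10));   /* write-combining supported */

    /* Prints "VCNT=8 FIX=1 WC=1". */
    printf("VCNT=%u FIX=%d WC=%d\n", vcnt, fix, wc);
    return 0;
}
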
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/cpu/mtrr/main.c
--- a/xen/arch/x86/cpu/mtrr/main.c Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/cpu/mtrr/main.c Mon Oct 22 14:19:22 2007 +0100
@@ -588,6 +588,8 @@ struct mtrr_value {
unsigned long lsize;
};
+extern void global_init_mtrr_pat(void);
+
/**
* mtrr_bp_init - initialize mtrrs on the boot CPU
*
@@ -654,8 +656,11 @@ void __init mtrr_bp_init(void)
if (mtrr_if) {
set_num_var_ranges();
init_table();
- if (use_intel())
+ if (use_intel()) {
get_mtrr_state();
+			/* Initialise some global data for MTRR/PAT virtualisation. */
+ global_init_mtrr_pat();
+ }
}
}
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/cpu/mtrr/mtrr.h
--- a/xen/arch/x86/cpu/mtrr/mtrr.h Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/cpu/mtrr/mtrr.h Mon Oct 22 14:19:22 2007 +0100
@@ -13,7 +13,6 @@
#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
-#define NUM_FIXED_RANGES 88
#define MTRRfix64K_00000_MSR 0x250
#define MTRRfix16K_80000_MSR 0x258
#define MTRRfix16K_A0000_MSR 0x259
@@ -30,9 +29,6 @@
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
-/* In the Intel processor's MTRR interface, the MTRR type is always held in
- an 8 bit field: */
-typedef u8 mtrr_type;
struct mtrr_ops {
u32 vendor;
@@ -69,13 +65,6 @@ struct set_mtrr_context {
u32 ccr3;
};
-struct mtrr_var_range {
- u32 base_lo;
- u32 base_hi;
- u32 mask_lo;
- u32 mask_hi;
-};
-
void set_mtrr_done(struct set_mtrr_context *ctxt);
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/hvm/Makefile
--- a/xen/arch/x86/hvm/Makefile Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/hvm/Makefile Mon Oct 22 14:19:22 2007 +0100
@@ -7,6 +7,7 @@ obj-y += intercept.o
obj-y += intercept.o
obj-y += io.o
obj-y += irq.o
+obj-y += mtrr.o
obj-y += platform.o
obj-y += pmtimer.o
obj-y += rtc.o
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/hvm/hvm.c
--- a/xen/arch/x86/hvm/hvm.c Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/hvm/hvm.c Mon Oct 22 14:19:22 2007 +0100
@@ -226,6 +226,7 @@ int hvm_domain_initialise(struct domain
spin_lock_init(&d->arch.hvm_domain.pbuf_lock);
spin_lock_init(&d->arch.hvm_domain.irq_lock);
+ spin_lock_init(&d->arch.hvm_domain.uc_lock);
rc = paging_enable(d, PG_refcounts|PG_translate|PG_external);
if ( rc != 0 )
@@ -417,27 +418,22 @@ HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_
HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt,
1, HVMSR_PER_VCPU);
+extern int reset_vmsr(struct mtrr_state *m, u64 *p);
+
int hvm_vcpu_initialise(struct vcpu *v)
{
int rc;
if ( (rc = vlapic_init(v)) != 0 )
- return rc;
+ goto fail1;
if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 )
- {
- vlapic_destroy(v);
- return rc;
- }
+ goto fail2;
/* Create ioreq event channel. */
rc = alloc_unbound_xen_event_channel(v, 0);
if ( rc < 0 )
- {
- hvm_funcs.vcpu_destroy(v);
- vlapic_destroy(v);
- return rc;
- }
+ goto fail3;
/* Register ioreq event channel. */
v->arch.hvm_vcpu.xen_port = rc;
@@ -448,6 +444,10 @@ int hvm_vcpu_initialise(struct vcpu *v)
spin_lock_init(&v->arch.hvm_vcpu.tm_lock);
INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list);
+
+ rc = reset_vmsr(&v->arch.hvm_vcpu.mtrr, &v->arch.hvm_vcpu.pat_cr);
+ if ( rc != 0 )
+ goto fail3;
v->arch.guest_context.user_regs.eflags = 2;
@@ -468,6 +468,13 @@ int hvm_vcpu_initialise(struct vcpu *v)
}
return 0;
+
+ fail3:
+ hvm_funcs.vcpu_destroy(v);
+ fail2:
+ vlapic_destroy(v);
+ fail1:
+ return rc;
}
void hvm_vcpu_destroy(struct vcpu *v)
@@ -604,6 +611,32 @@ int hvm_set_efer(uint64_t value)
hvm_update_guest_efer(v);
return 1;
+}
+
+extern void shadow_blow_tables_per_domain(struct domain *d);
+extern bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs);
+
+/* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. */
+static bool_t domain_exit_uc_mode(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ struct vcpu *vs;
+
+ for_each_vcpu ( d, vs )
+ {
+ if ( (vs == v) || !vs->is_initialised )
+ continue;
+ if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ||
+ mtrr_pat_not_equal(vs, v) )
+ return 0;
+ }
+
+ return 1;
+}
+
+static void local_flush_cache(void *info)
+{
+ wbinvd();
}
int hvm_set_cr0(unsigned long value)
@@ -683,6 +716,41 @@ int hvm_set_cr0(unsigned long value)
{
put_page(pagetable_get_page(v->arch.guest_table));
v->arch.guest_table = pagetable_null();
+ }
+ }
+
+ if ( !list_empty(&(domain_hvm_iommu(v->domain)->pdev_list)) )
+ {
+ if ( (value & X86_CR0_CD) && !(value & X86_CR0_NW) )
+ {
+ /* Entering no fill cache mode. */
+ spin_lock(&v->domain->arch.hvm_domain.uc_lock);
+ v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE;
+
+ if ( !v->domain->arch.hvm_domain.is_in_uc_mode )
+ {
+ /* Flush physical caches. */
+ on_each_cpu(local_flush_cache, NULL, 1, 1);
+ /* Shadow pagetables must recognise UC mode. */
+ v->domain->arch.hvm_domain.is_in_uc_mode = 1;
+ shadow_blow_tables_per_domain(v->domain);
+ }
+ spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
+ }
+ else if ( !(value & (X86_CR0_CD | X86_CR0_NW)) &&
+ (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) )
+ {
+ /* Exit from no fill cache mode. */
+ spin_lock(&v->domain->arch.hvm_domain.uc_lock);
+ v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE;
+
+ if ( domain_exit_uc_mode(v) )
+ {
+ /* Shadow pagetables must recognise normal caching mode. */
+ v->domain->arch.hvm_domain.is_in_uc_mode = 0;
+ shadow_blow_tables_per_domain(v->domain);
+ }
+ spin_unlock(&v->domain->arch.hvm_domain.uc_lock);
}
}
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/hvm/mtrr.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/arch/x86/hvm/mtrr.c Mon Oct 22 14:19:22 2007 +0100
@@ -0,0 +1,687 @@
+/*
+ * mtrr.c: MTRR/PAT virtualization
+ *
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <public/hvm/e820.h>
+#include <xen/types.h>
+#include <asm/e820.h>
+#include <asm/paging.h>
+#include <asm/p2m.h>
+#include <xen/domain_page.h>
+#include <stdbool.h>
+#include <asm/mtrr.h>
+#include <asm/hvm/support.h>
+
+/* Xen holds the native MTRR MSRs */
+extern struct mtrr_state mtrr_state;
+
+static u64 phys_base_msr_mask;
+static u64 phys_mask_msr_mask;
+static u32 size_or_mask;
+static u32 size_and_mask;
+
+static void init_pat_entry_tbl(u64 pat);
+static void init_mtrr_epat_tbl(void);
+static unsigned char get_mtrr_type(struct mtrr_state *m, paddr_t pa);
+/* get page attribute fields (PAn) from PAT MSR */
+#define pat_cr_2_paf(pat_cr,n) ((((u64)pat_cr) >> ((n)<<3)) & 0xff)
+/* pat entry to PTE flags (PAT, PCD, PWT bits) */
+static unsigned char pat_entry_2_pte_flags[8] = {
+ 0, _PAGE_PWT,
+ _PAGE_PCD, _PAGE_PCD | _PAGE_PWT,
+ _PAGE_PAT, _PAGE_PAT | _PAGE_PWT,
+ _PAGE_PAT | _PAGE_PCD, _PAGE_PAT | _PAGE_PCD | _PAGE_PWT };
+
+/* effective mm type lookup table, according to MTRR and PAT */
+static u8 mm_type_tbl[MTRR_NUM_TYPES][PAT_TYPE_NUMS] = {
+/********PAT(UC,WC,RS,RS,WT,WP,WB,UC-)*/
+/* RS means the reserved types (2 and 3); the type values are hardcoded here */
+ /*MTRR(UC):(UC,WC,RS,RS,UC,UC,UC,UC)*/
+ {0, 1, 2, 2, 0, 0, 0, 0},
+ /*MTRR(WC):(UC,WC,RS,RS,UC,UC,WC,WC)*/
+ {0, 1, 2, 2, 0, 0, 1, 1},
+ /*MTRR(RS):(RS,RS,RS,RS,RS,RS,RS,RS)*/
+ {2, 2, 2, 2, 2, 2, 2, 2},
+ /*MTRR(RS):(RS,RS,RS,RS,RS,RS,RS,RS)*/
+ {2, 2, 2, 2, 2, 2, 2, 2},
+ /*MTRR(WT):(UC,WC,RS,RS,WT,WP,WT,UC)*/
+ {0, 1, 2, 2, 4, 5, 4, 0},
+ /*MTRR(WP):(UC,WC,RS,RS,WT,WP,WP,WC)*/
+ {0, 1, 2, 2, 4, 5, 5, 1},
+ /*MTRR(WB):(UC,WC,RS,RS,WT,WP,WB,UC)*/
+ {0, 1, 2, 2, 4, 5, 6, 0}
+};
+
+/* Reverse lookup table: find a PAT type from an MTRR type and an effective
+ * memory type. This table is generated dynamically.
+ */
+static u8 mtrr_epat_tbl[MTRR_NUM_TYPES][MEMORY_NUM_TYPES];
+
+/* Lookup table giving the PAT entry that holds a given memory type in the
+ * host PAT. */
+static u8 pat_entry_tbl[PAT_TYPE_NUMS];
+
+static void get_mtrr_range(uint64_t base_msr, uint64_t mask_msr,
+ uint64_t *base, uint64_t *end)
+{
+ uint32_t mask_lo = (uint32_t)mask_msr;
+ uint32_t mask_hi = (uint32_t)(mask_msr >> 32);
+ uint32_t base_lo = (uint32_t)base_msr;
+ uint32_t base_hi = (uint32_t)(base_msr >> 32);
+ uint32_t size;
+
+ if ( (mask_lo & 0x800) == 0 )
+ {
+ /* Invalid (i.e. free) range */
+ *base = 0;
+ *end = 0;
+ return;
+ }
+
+ /* Work out the shifted address mask. */
+ mask_lo = (size_or_mask | (mask_hi << (32 - PAGE_SHIFT)) |
+ (mask_lo >> PAGE_SHIFT));
+
+ /* This works correctly if size is a power of two (a contiguous range). */
+ size = -mask_lo;
+ *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
+ *end = *base + size - 1;
+}
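
A worked example of the decode above, assuming a machine with 36 physical address bits (so size_or_mask is 0xff000000 once global_init_mtrr_pat() has run): a hypothetical variable range describing 1GB of WB memory at 3GB decodes to page frames 0xc0000-0xfffff. A minimal stand-alone sketch:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
    /* Hypothetical pair: base 3GB, type WB (6); mask for a 1GB power-of-two
     * range with the valid bit (bit 11) set. */
    uint64_t base_msr = 0xC0000006ULL;
    uint64_t mask_msr = 0xFC0000800ULL;
    uint32_t size_or_mask = 0xff000000;        /* assumes 36 address bits */

    uint32_t mask_lo = (uint32_t)mask_msr, mask_hi = (uint32_t)(mask_msr >> 32);
    uint32_t base_lo = (uint32_t)base_msr, base_hi = (uint32_t)(base_msr >> 32);
    uint32_t size;
    uint64_t base, end;

    mask_lo = size_or_mask | (mask_hi << (32 - PAGE_SHIFT)) | (mask_lo >> PAGE_SHIFT);
    size = -mask_lo;                            /* power of two => contiguous */
    base = ((uint64_t)base_hi << (32 - PAGE_SHIFT)) | (base_lo >> PAGE_SHIFT);
    end  = base + size - 1;

    /* Prints "pfns 0xc0000..0xfffff", i.e. 3GB up to but not including 4GB. */
    printf("pfns %#llx..%#llx\n", (unsigned long long)base, (unsigned long long)end);
    return 0;
}
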
+
+bool_t is_var_mtrr_overlapped(struct mtrr_state *m)
+{
+ int seg, i;
+ uint64_t phys_base, phys_mask, phys_base_pre, phys_mask_pre;
+ uint64_t base_pre, end_pre, base, end;
+ uint8_t num_var_ranges = (u8)m->mtrr_cap;
+
+ for ( i = 0; i < num_var_ranges; i++ )
+ {
+ phys_base_pre = ((u64*)m->var_ranges)[i*2];
+ phys_mask_pre = ((u64*)m->var_ranges)[i*2 + 1];
+
+ get_mtrr_range(phys_base_pre, phys_mask_pre,
+ &base_pre, &end_pre);
+
+ for ( seg = i + 1; seg < num_var_ranges; seg ++ )
+ {
+ phys_base = ((u64*)m->var_ranges)[seg*2];
+ phys_mask = ((u64*)m->var_ranges)[seg*2 + 1];
+
+ get_mtrr_range(phys_base, phys_mask,
+ &base, &end);
+
+ if ( ((base_pre != end_pre) && (base != end))
+ || ((base >= base_pre) && (base <= end_pre))
+ || ((end >= base_pre) && (end <= end_pre))
+ || ((base_pre >= base) && (base_pre <= end))
+ || ((end_pre >= base) && (end_pre <= end)) )
+ {
+ /* MTRR is overlapped. */
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
+
+/* Number of variable MTRRs kept free for the guest OS's own use. */
+#define RESERVED_MTRR 2
+#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
+#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
+bool mtrr_var_range_msr_set(struct mtrr_state *m, u32 msr, u64 msr_content);
+bool mtrr_def_type_msr_set(struct mtrr_state *m, u64 msr_content);
+bool mtrr_fix_range_msr_set(struct mtrr_state *m, int row, u64 msr_content);
+static void set_var_mtrr(unsigned int reg, struct mtrr_state *m,
+ unsigned int base, unsigned int size,
+ unsigned int type)
+{
+ struct mtrr_var_range *vr;
+
+ vr = &m->var_ranges[reg];
+
+ if ( size == 0 )
+ {
+ /* The invalid bit is kept in the mask, so we simply clear the
+ * relevant mask register to disable a range.
+ */
+ mtrr_var_range_msr_set(m, MTRRphysMask_MSR(reg), 0);
+ }
+ else
+ {
+ vr->base_lo = base << PAGE_SHIFT | type;
+ vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT);
+ vr->mask_lo = -size << PAGE_SHIFT | 0x800;
+ vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT);
+
+ mtrr_var_range_msr_set(m, MTRRphysBase_MSR(reg), *(unsigned long *)vr);
+ mtrr_var_range_msr_set(m, MTRRphysMask_MSR(reg),
+ *((unsigned long *)vr + 1));
+ }
+}
+/* From the Intel SDM (Vol. 3, Section 10.11.4), range size and base
+ * alignment must satisfy the following requirements:
+ * 1. The range size must be 2^N bytes for N >= 12 (i.e. 4KB minimum).
+ * 2. The base address must be aligned to 2^N, with the same N as in the
+ *    previous requirement. So an 8KB range must be 8KB aligned, not 4KB
+ *    aligned.
+ */
+static unsigned int range_to_mtrr(unsigned int reg, struct mtrr_state *m,
+ unsigned int range_startk, unsigned int range_sizek, unsigned char type)
+{
+ if ( !range_sizek || (reg >= ((m->mtrr_cap & 0xff) - RESERVED_MTRR)) )
+ return reg;
+
+ while ( range_sizek )
+ {
+ unsigned int max_align, align, sizek;
+
+ max_align = (range_startk == 0) ? 32 : ffs(range_startk);
+ align = min_t(unsigned int, fls(range_sizek), max_align);
+ sizek = 1 << (align - 1);
+
+ set_var_mtrr(reg++, m, range_startk, sizek, type);
+
+ range_startk += sizek;
+ range_sizek -= sizek;
+
+ if ( reg >= ((m->mtrr_cap & 0xff) - RESERVED_MTRR) )
+ break;
+ }
+
+ return reg;
+}
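
A minimal trace of the decomposition range_to_mtrr() performs, using the compiler builtins in place of Xen's ffs()/fls() (both 1-based): 3GB of RAM starting at 0 splits into a naturally aligned 2GB chunk followed by a 1GB chunk, so two variable MTRRs suffice. A stand-alone sketch:

#include <stdio.h>

static int fls_(unsigned int x) { return x ? 32 - __builtin_clz(x) : 0; }

int main(void)
{
    unsigned int startk = 0, sizek = 3u << 20;   /* 3GB, expressed in KB */

    while ( sizek )
    {
        unsigned int max_align = (startk == 0) ? 32 : __builtin_ffs(startk);
        unsigned int align = (fls_(sizek) < max_align) ? fls_(sizek) : max_align;
        unsigned int chunk = 1u << (align - 1);

        /* Prints "2097152KB at 0KB" then "1048576KB at 2097152KB". */
        printf("MTRR chunk: %uKB at %uKB\n", chunk, startk);
        startk += chunk;
        sizek  -= chunk;
    }
    return 0;
}
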
+
+static void setup_fixed_mtrrs(struct vcpu *v)
+{
+ uint64_t content;
+ int i;
+ struct mtrr_state *m = &v->arch.hvm_vcpu.mtrr;
+
+ /* 1. Map (0~A0000) as WB */
+ content = 0x0606060606060606ull;
+ mtrr_fix_range_msr_set(m, 0, content);
+ mtrr_fix_range_msr_set(m, 1, content);
+ /* 2. Map VRAM(A0000~C0000) as WC */
+ content = 0x0101010101010101;
+ mtrr_fix_range_msr_set(m, 2, content);
+ /* 3. Map (C0000~100000) as UC */
+ for ( i = 3; i < 11; i++)
+ mtrr_fix_range_msr_set(m, i, 0);
+}
+
+static void setup_var_mtrrs(struct vcpu *v)
+{
+ p2m_type_t p2m;
+ unsigned long e820_mfn;
+ char *p = NULL;
+ unsigned char nr = 0;
+ int i;
+ unsigned int reg = 0;
+ unsigned long size = 0;
+ unsigned long addr = 0;
+ struct e820entry *e820_table;
+
+ e820_mfn = mfn_x(gfn_to_mfn(v->domain,
+ HVM_E820_PAGE >> PAGE_SHIFT, &p2m));
+
+ p = (char *)map_domain_page(e820_mfn);
+
+ nr = *(unsigned char*)(p + HVM_E820_NR_OFFSET);
+ e820_table = (struct e820entry*)(p + HVM_E820_OFFSET);
+    /* Search the E820 table and set MTRRs for the RAM regions. */
+ for ( i = 0; i < nr; i++)
+ {
+ if ( (e820_table[i].addr >= 0x100000) &&
+ (e820_table[i].type == E820_RAM) )
+ {
+ if ( e820_table[i].addr == 0x100000 )
+ {
+ size = e820_table[i].size + 0x100000 + PAGE_SIZE * 3;
+ addr = 0;
+ }
+ else
+ {
+ /* Larger than 4G */
+ size = e820_table[i].size;
+ addr = e820_table[i].addr;
+ }
+
+ reg = range_to_mtrr(reg, &v->arch.hvm_vcpu.mtrr,
+ addr >> PAGE_SHIFT, size >> PAGE_SHIFT,
+ MTRR_TYPE_WRBACK);
+ }
+ }
+}
+
+void init_mtrr_in_hyper(struct vcpu *v)
+{
+    /* TODO: MTRRs should be initialised by the BIOS or elsewhere;
+     * as a workaround we do it here.
+     */
+ if ( v->arch.hvm_vcpu.mtrr.is_initialized )
+ return;
+
+ setup_fixed_mtrrs(v);
+ setup_var_mtrrs(v);
+    /* Enable MTRRs (E and FE bits of MTRRdefType). */
+ mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, 0xc00);
+
+ v->arch.hvm_vcpu.mtrr.is_initialized = 1;
+}
+
+static int reset_mtrr(struct mtrr_state *m)
+{
+ m->var_ranges = xmalloc_array(struct mtrr_var_range, MTRR_VCNT);
+ if ( m->var_ranges == NULL )
+ return -ENOMEM;
+ memset(m->var_ranges, 0, MTRR_VCNT * sizeof(struct mtrr_var_range));
+ memset(m->fixed_ranges, 0, sizeof(m->fixed_ranges));
+ m->enabled = 0;
+    m->def_type = 0;                  /* MTRRs disabled. */
+    m->mtrr_cap = (0x5<<8)|MTRR_VCNT; /* WC and fixed ranges supported; VCNT = 8. */
+ m->overlapped = 0;
+ return 0;
+}
+
+/* init global variables for MTRR and PAT */
+void global_init_mtrr_pat(void)
+{
+ extern u64 host_pat;
+ u32 phys_addr;
+
+ init_mtrr_epat_tbl();
+ init_pat_entry_tbl(host_pat);
+    /* Get the maximum physical address width and derive the global masks. */
+ if ( cpuid_eax(0x80000000) < 0x80000008 )
+ phys_addr = 36;
+ else
+ phys_addr = cpuid_eax(0x80000008);
+
+ phys_base_msr_mask = ~((((u64)1) << phys_addr) - 1) | 0xf00UL;
+ phys_mask_msr_mask = ~((((u64)1) << phys_addr) - 1) | 0x7ffUL;
+
+ size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1);
+ size_and_mask = ~size_or_mask & 0xfff00000;
+}
+
+static void init_pat_entry_tbl(u64 pat)
+{
+ int i, j;
+
+ memset(&pat_entry_tbl, INVALID_MEM_TYPE,
+ PAT_TYPE_NUMS * sizeof(pat_entry_tbl[0]));
+
+ for ( i = 0; i < PAT_TYPE_NUMS; i++ )
+ {
+ for ( j = 0; j < PAT_TYPE_NUMS; j++ )
+ {
+ if ( pat_cr_2_paf(pat, j) == i )
+ {
+ pat_entry_tbl[i] = j;
+ break;
+ }
+ }
+ }
+}
+
+unsigned char pat_type_2_pte_flags(unsigned char pat_type)
+{
+ int pat_entry = pat_entry_tbl[pat_type];
+
+    /* INVALID_MEM_TYPE means no PAT entry holding the given pat_type was
+     * found in the host PAT. This cannot happen if the host PAT covers all
+     * the PAT types.
+     */
+ if ( likely(pat_entry != INVALID_MEM_TYPE) )
+ return pat_entry_2_pte_flags[pat_entry];
+
+ return pat_entry_2_pte_flags[pat_entry_tbl[PAT_TYPE_UNCACHABLE]];
+}
+
+int reset_vmsr(struct mtrr_state *m, u64 *pat_ptr)
+{
+ int rc;
+
+ rc = reset_mtrr(m);
+ if ( rc != 0 )
+ return rc;
+
+ *pat_ptr = ( (u64)PAT_TYPE_WRBACK) | /* PAT0: WB */
+ ( (u64)PAT_TYPE_WRTHROUGH << 8 ) | /* PAT1: WT */
+ ( (u64)PAT_TYPE_UC_MINUS << 16 ) | /* PAT2: UC- */
+ ( (u64)PAT_TYPE_UNCACHABLE << 24 ) | /* PAT3: UC */
+ ( (u64)PAT_TYPE_WRBACK << 32 ) | /* PAT4: WB */
+ ( (u64)PAT_TYPE_WRTHROUGH << 40 ) | /* PAT5: WT */
+ ( (u64)PAT_TYPE_UC_MINUS << 48 ) | /* PAT6: UC- */
+ ( (u64)PAT_TYPE_UNCACHABLE << 56 ); /* PAT7: UC */
+
+ return 0;
+}
+
+/*
+ * Get MTRR memory type for physical address pa.
+ */
+static unsigned char get_mtrr_type(struct mtrr_state *m, paddr_t pa)
+{
+ int addr, seg, index;
+ u8 overlap_mtrr = 0;
+ u8 overlap_mtrr_pos = 0;
+ u64 phys_base;
+ u64 phys_mask;
+ u8 num_var_ranges = m->mtrr_cap & 0xff;
+
+ if ( unlikely(!(m->enabled & 0x2)) )
+ return MTRR_TYPE_UNCACHABLE;
+
+ if ( (pa < 0x100000) && (m->enabled & 1) )
+ {
+        /* The fixed-range MTRRs take effect. */
+ addr = (unsigned int) pa;
+ if ( addr < 0x80000 )
+ {
+ seg = (addr >> 16);
+ return m->fixed_ranges[seg];
+ }
+ else if ( addr < 0xc0000 )
+ {
+ seg = (addr - 0x80000) >> 14;
+ index = (seg >> 3) + 1;
+ seg &= 7; /* select 0-7 segments */
+ return m->fixed_ranges[index*8 + seg];
+ }
+ else
+ {
+ /* 0xC0000 --- 0x100000 */
+ seg = (addr - 0xc0000) >> 12;
+ index = (seg >> 3) + 3;
+ seg &= 7; /* select 0-7 segments */
+ return m->fixed_ranges[index*8 + seg];
+ }
+ }
+
+ /* Match with variable MTRRs. */
+ for ( seg = 0; seg < num_var_ranges; seg++ )
+ {
+ phys_base = ((u64*)m->var_ranges)[seg*2];
+ phys_mask = ((u64*)m->var_ranges)[seg*2 + 1];
+ if ( phys_mask & (1 << MTRR_PHYSMASK_VALID_BIT) )
+ {
+ if ( ((u64) pa & phys_mask) >> MTRR_PHYSMASK_SHIFT ==
+ (phys_base & phys_mask) >> MTRR_PHYSMASK_SHIFT )
+ {
+ if ( unlikely(m->overlapped) )
+ {
+ overlap_mtrr |= 1 << (phys_base & MTRR_PHYSBASE_TYPE_MASK);
+ overlap_mtrr_pos = phys_base & MTRR_PHYSBASE_TYPE_MASK;
+ }
+ else
+ {
+ /* If no overlap, return the found one */
+ return (phys_base & MTRR_PHYSBASE_TYPE_MASK);
+ }
+ }
+ }
+ }
+
+ /* Overlapped or not found. */
+ if ( unlikely(overlap_mtrr == 0) )
+ return m->def_type;
+
+    if ( likely(!(overlap_mtrr & ~( ((u8)1) << overlap_mtrr_pos ))) )
+        /* Covers both a single variable-range match and two or more
+         * matches of the same type.
+         */
+        return overlap_mtrr_pos;
+
+    if ( overlap_mtrr & 0x1 )
+        /* Two or more matches, one of which is UC. */
+        return MTRR_TYPE_UNCACHABLE;
+
+    if ( !(overlap_mtrr & 0xaf) )
+        /* Two or more matches, all of type WT or WB: WT wins. */
+        return MTRR_TYPE_WRTHROUGH;
+
+ /* Behaviour is undefined, but return the last overlapped type. */
+ return overlap_mtrr_pos;
+}
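
The fixed-range indexing above mirrors the MSR layout: row 0 of fixed_ranges[] is MTRRfix64K_00000 (eight 64KB entries covering 0-0x80000), rows 1-2 are the two 16KB MSRs (0x80000-0xC0000), and rows 3-10 are the eight 4KB MSRs (0xC0000-0x100000). A minimal sketch of the same arithmetic, checked against the legacy VGA text buffer at 0xB8000:

#include <stdio.h>

/* Returns the flat index into fixed_ranges[88] for an address below 1MB. */
static int fixed_range_index(unsigned int addr)
{
    int seg, index;

    if ( addr < 0x80000 )
        return addr >> 16;                      /* 64KB granularity, row 0 */
    if ( addr < 0xc0000 )
    {
        seg = (addr - 0x80000) >> 14;           /* 16KB granularity */
        index = (seg >> 3) + 1;
        return index * 8 + (seg & 7);
    }
    seg = (addr - 0xc0000) >> 12;               /* 4KB granularity */
    index = (seg >> 3) + 3;
    return index * 8 + (seg & 7);
}

int main(void)
{
    /* 0xB8000 lands in MTRRfix16K_A0000 (row 2), entry 6, i.e. index 22. */
    printf("index for 0xB8000 = %d\n", fixed_range_index(0xB8000));
    return 0;
}
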
+
+/*
+ * Return the memory type indicated by the PAT for the given PTE flags.
+ * NOTE: valid only when paging is enabled.
+ *       Only 4KB page PTEs are handled at present.
+ */
+static unsigned char page_pat_type(u64 pat_cr, unsigned long pte_flags)
+{
+ int pat_entry;
+
+ /* PCD/PWT -> bit 1/0 of PAT entry */
+ pat_entry = ( pte_flags >> 3 ) & 0x3;
+ /* PAT bits as bit 2 of PAT entry */
+ if ( pte_flags & _PAGE_PAT )
+ pat_entry |= 4;
+
+ return (unsigned char)pat_cr_2_paf(pat_cr, pat_entry);
+}
+
+/*
+ * Effective memory type for leaf page.
+ */
+static u8 effective_mm_type(
+ struct mtrr_state *m,
+ u64 pat,
+ paddr_t gpa,
+ unsigned long pte_flags)
+{
+ unsigned char mtrr_mtype, pat_value, effective;
+
+ mtrr_mtype = get_mtrr_type(m, gpa);
+
+ pat_value = page_pat_type(pat, pte_flags);
+
+ effective = mm_type_tbl[mtrr_mtype][pat_value];
+
+ return effective;
+}
+
+static void init_mtrr_epat_tbl(void)
+{
+ int i, j;
+    /* Default every entry to an invalid type so that conflicts can be detected. */
+ memset(&mtrr_epat_tbl, INVALID_MEM_TYPE, sizeof(mtrr_epat_tbl));
+
+ for ( i = 0; i < MTRR_NUM_TYPES; i++ )
+ {
+ for ( j = 0; j < PAT_TYPE_NUMS; j++ )
+ {
+ int tmp = mm_type_tbl[i][j];
+ if ( (tmp >= 0) && (tmp < MEMORY_NUM_TYPES) )
+ mtrr_epat_tbl[i][tmp] = j;
+ }
+ }
+}
+
+u32 get_pat_flags(struct vcpu *v,
+ u32 gl1e_flags,
+ paddr_t gpaddr,
+ paddr_t spaddr)
+{
+ u8 guest_eff_mm_type;
+ u8 shadow_mtrr_type;
+ u8 pat_entry_value;
+ u64 pat = v->arch.hvm_vcpu.pat_cr;
+ struct mtrr_state *g = &v->arch.hvm_vcpu.mtrr;
+
+    /* 1. Get the effective memory type of the guest physical address,
+     * using the guest MTRRs and the guest PAT.
+     */
+ guest_eff_mm_type = effective_mm_type(g, pat, gpaddr, gl1e_flags);
+    /* 2. Get the memory type of the host physical address, from the host MTRRs. */
+ shadow_mtrr_type = get_mtrr_type(&mtrr_state, spaddr);
+
+    /* 3. Find the memory type in the PAT, from the host MTRR memory type
+     * and the guest effective memory type.
+     */
+ pat_entry_value = mtrr_epat_tbl[shadow_mtrr_type][guest_eff_mm_type];
+    /* If a conflict occurs (e.g. the host MTRR type is UC but the guest
+     * memory type is WB), set UC as the effective memory type. Returning
+     * PAT_TYPE_UNCACHABLE here always forces the effective memory type to UC.
+     */
+ if ( pat_entry_value == INVALID_MEM_TYPE )
+ {
+ gdprintk(XENLOG_WARNING,
+ "Conflict occurs for a given guest l1e flags:%x "
+ "at %"PRIx64" (the effective mm type:%d), "
+ "because the host mtrr type is:%d\n",
+ gl1e_flags, (uint64_t)gpaddr, guest_eff_mm_type,
+ shadow_mtrr_type);
+ pat_entry_value = PAT_TYPE_UNCACHABLE;
+ }
+ /* 4. Get the pte flags */
+ return pat_type_2_pte_flags(pat_entry_value);
+}
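
To make step 1 concrete, here is a minimal sketch of the PTE-flags-to-PAT-entry decode used by page_pat_type() above, run against the default guest PAT installed by reset_vmsr() (WB, WT, UC-, UC repeated); a hypothetical guest PTE with PCD and PWT clear selects entry 0 (WB = 6), while one with PCD|PWT set selects entry 3 (UC = 0):

#include <stdint.h>
#include <stdio.h>

#define _PAGE_PWT 0x008
#define _PAGE_PCD 0x010
#define _PAGE_PAT 0x080

/* Byte n of the PAT MSR holds the type for PAT entry n. */
#define pat_cr_2_paf(pat, n) (((uint64_t)(pat) >> ((n) << 3)) & 0xff)

static unsigned char page_pat_type(uint64_t pat_cr, unsigned long pte_flags)
{
    int pat_entry = (pte_flags >> 3) & 0x3;     /* PCD/PWT -> bits 1/0 */
    if ( pte_flags & _PAGE_PAT )
        pat_entry |= 4;                         /* PAT bit -> bit 2 */
    return pat_cr_2_paf(pat_cr, pat_entry);
}

int main(void)
{
    uint64_t guest_pat = 0x0007040600070406ULL;  /* default from reset_vmsr() */

    printf("%d\n", page_pat_type(guest_pat, 0));                     /* 6 = WB */
    printf("%d\n", page_pat_type(guest_pat, _PAGE_PCD | _PAGE_PWT)); /* 0 = UC */
    return 0;
}
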
+
+/* Helper functions for setting MTRR/PAT MSRs. */
+bool pat_msr_set(u64 *pat, u64 msr_content)
+{
+ u8 *value = (u8*)&msr_content;
+ int i;
+
+ if ( *pat != msr_content )
+ {
+ for ( i = 0; i < 8; i++ )
+ if ( unlikely(!(value[i] == 0 || value[i] == 1 ||
+ value[i] == 4 || value[i] == 5 ||
+ value[i] == 6 || value[i] == 7)) )
+ return 0;
+
+ *pat = msr_content;
+ }
+
+ return 1;
+}
+
+bool mtrr_def_type_msr_set(struct mtrr_state *m, u64 msr_content)
+{
+ u8 def_type = msr_content & 0xff;
+ u8 enabled = (msr_content >> 10) & 0x3;
+
+ if ( unlikely(!(def_type == 0 || def_type == 1 || def_type == 4 ||
+ def_type == 5 || def_type == 6)) )
+ {
+ HVM_DBG_LOG(DBG_LEVEL_MSR, "invalid MTRR def type:%x\n", def_type);
+ return 0;
+ }
+
+ if ( unlikely(msr_content && (msr_content & ~0xcffUL)) )
+ {
+ HVM_DBG_LOG(DBG_LEVEL_MSR, "invalid msr content:%"PRIx64"\n",
+ msr_content);
+ return 0;
+ }
+
+ m->enabled = enabled;
+ m->def_type = def_type;
+
+ return 1;
+}
+
+bool mtrr_fix_range_msr_set(struct mtrr_state *m, int row, u64 msr_content)
+{
+ u64 *fixed_range_base = (u64 *)m->fixed_ranges;
+
+ if ( fixed_range_base[row] != msr_content )
+ {
+ u8 *range = (u8*)&msr_content;
+ int i, type;
+
+ for ( i = 0; i < 8; i++ )
+ {
+ type = range[i];
+ if ( unlikely(!(type == 0 || type == 1 ||
+ type == 4 || type == 5 || type == 6)) )
+ return 0;
+ }
+
+ fixed_range_base[row] = msr_content;
+ }
+
+ return 1;
+}
+
+bool mtrr_var_range_msr_set(struct mtrr_state *m, u32 msr, u64 msr_content)
+{
+ u32 index;
+ u64 msr_mask;
+ u64 *var_range_base = (u64*)m->var_ranges;
+
+ index = msr - MSR_IA32_MTRR_PHYSBASE0;
+
+ if ( var_range_base[index] != msr_content )
+ {
+ u32 type = msr_content & 0xff;
+
+ msr_mask = (index & 1) ? phys_mask_msr_mask : phys_base_msr_mask;
+
+ if ( unlikely(!(type == 0 || type == 1 ||
+ type == 4 || type == 5 || type == 6)) )
+ return 0;
+
+ if ( unlikely(msr_content && (msr_content & msr_mask)) )
+ {
+ HVM_DBG_LOG(DBG_LEVEL_MSR, "invalid msr content:%"PRIx64"\n",
+ msr_content);
+ return 0;
+ }
+
+ var_range_base[index] = msr_content;
+ }
+
+ m->overlapped = is_var_mtrr_overlapped(m);
+
+ return 1;
+}
+
+bool_t mtrr_pat_not_equal(struct vcpu *vd, struct vcpu *vs)
+{
+ struct mtrr_state *md = &vd->arch.hvm_vcpu.mtrr;
+ struct mtrr_state *ms = &vs->arch.hvm_vcpu.mtrr;
+ int res;
+ u8 num_var_ranges = (u8)md->mtrr_cap;
+
+ /* Test fixed ranges. */
+ res = memcmp(md->fixed_ranges, ms->fixed_ranges,
+ NUM_FIXED_RANGES*sizeof(mtrr_type));
+ if ( res )
+ return 1;
+
+ /* Test var ranges. */
+ res = memcmp(md->var_ranges, ms->var_ranges,
+ num_var_ranges*sizeof(struct mtrr_var_range));
+ if ( res )
+ return 1;
+
+ /* Test default type MSR. */
+ if ( (md->def_type != ms->def_type)
+ && (md->enabled != ms->enabled) )
+ return 1;
+
+ /* Test PAT. */
+ if ( vd->arch.hvm_vcpu.pat_cr != vs->arch.hvm_vcpu.pat_cr )
+ return 1;
+
+ return 0;
+}
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/hvm/vmx/vmcs.c
--- a/xen/arch/x86/hvm/vmx/vmcs.c Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/hvm/vmx/vmcs.c Mon Oct 22 14:19:22 2007 +0100
@@ -756,6 +756,11 @@ void vm_resume_fail(unsigned long eflags
domain_crash_synchronous();
}
+static void flush_cache(void *info)
+{
+ wbinvd();
+}
+
void vmx_do_resume(struct vcpu *v)
{
bool_t debug_state;
@@ -767,6 +772,18 @@ void vmx_do_resume(struct vcpu *v)
}
else
{
+        /* For a pass-through domain, the guest PCI-E device driver may use
+         * non-snooped I/O and rely on an explicit WBINVD or CLFLUSH to make
+         * a RAM region consistent. If the VCPU migrates to another physical
+         * CPU before that WBINVD/CLFLUSH, we must flush the old CPU's cache
+         * to maintain data consistency.
+         */
+ if ( !list_empty(&(domain_hvm_iommu(v->domain)->pdev_list)) )
+ {
+ int cpu = v->arch.hvm_vmx.active_cpu;
+ if ( cpu != -1 )
+ on_selected_cpus(cpumask_of_cpu(cpu), flush_cache, NULL, 1, 1);
+ }
+
vmx_clear_vmcs(v);
vmx_load_vmcs(v);
hvm_migrate_timers(v);
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/hvm/vmx/vmx.c
--- a/xen/arch/x86/hvm/vmx/vmx.c Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/hvm/vmx/vmx.c Mon Oct 22 14:19:22 2007 +0100
@@ -50,6 +50,7 @@
#include <asm/hvm/vpt.h>
#include <public/hvm/save.h>
#include <asm/hvm/trace.h>
+#include <stdbool.h>
enum handler_return { HNDL_done, HNDL_unhandled, HNDL_exception_raised };
@@ -2285,6 +2286,9 @@ static int vmx_do_msr_read(struct cpu_us
u64 msr_content = 0;
u32 ecx = regs->ecx, eax, edx;
struct vcpu *v = current;
+ int index;
+ u64 *var_range_base = (u64*)v->arch.hvm_vcpu.mtrr.var_ranges;
+ u64 *fixed_range_base = (u64*)v->arch.hvm_vcpu.mtrr.fixed_ranges;
HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x", ecx);
@@ -2304,6 +2308,32 @@ static int vmx_do_msr_read(struct cpu_us
break;
case MSR_IA32_APICBASE:
msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
+ break;
+ case MSR_IA32_CR_PAT:
+ msr_content = v->arch.hvm_vcpu.pat_cr;
+ break;
+ case MSR_MTRRcap:
+ msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap;
+ break;
+ case MSR_MTRRdefType:
+ msr_content = v->arch.hvm_vcpu.mtrr.def_type
+ | (v->arch.hvm_vcpu.mtrr.enabled << 10);
+ break;
+ case MSR_MTRRfix64K_00000:
+ msr_content = fixed_range_base[0];
+ break;
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ index = regs->ecx - MSR_MTRRfix16K_80000;
+ msr_content = fixed_range_base[index + 1];
+ break;
+ case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
+ index = regs->ecx - MSR_MTRRfix4K_C0000;
+ msr_content = fixed_range_base[index + 3];
+ break;
+ case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
+ index = regs->ecx - MSR_IA32_MTRR_PHYSBASE0;
+ msr_content = var_range_base[index];
break;
case MSR_IA32_DEBUGCTLMSR:
if ( vmx_read_guest_msr(v, ecx, &msr_content) != 0 )
@@ -2428,11 +2458,19 @@ void vmx_vlapic_msr_changed(struct vcpu
vmx_vmcs_exit(v);
}
+extern bool mtrr_var_range_msr_set(struct mtrr_state *v,
+ u32 msr, u64 msr_content);
+extern bool mtrr_fix_range_msr_set(struct mtrr_state *v,
+ int row, u64 msr_content);
+extern bool mtrr_def_type_msr_set(struct mtrr_state *v, u64 msr_content);
+extern bool pat_msr_set(u64 *pat, u64 msr);
+
static int vmx_do_msr_write(struct cpu_user_regs *regs)
{
u32 ecx = regs->ecx;
u64 msr_content;
struct vcpu *v = current;
+ int index;
HVM_DBG_LOG(DBG_LEVEL_1, "ecx=%x, eax=%x, edx=%x",
ecx, (u32)regs->eax, (u32)regs->edx);
@@ -2459,6 +2497,38 @@ static int vmx_do_msr_write(struct cpu_u
case MSR_IA32_APICBASE:
vlapic_msr_set(vcpu_vlapic(v), msr_content);
break;
+ case MSR_IA32_CR_PAT:
+ if ( !pat_msr_set(&v->arch.hvm_vcpu.pat_cr, msr_content) )
+ goto gp_fault;
+ break;
+ case MSR_MTRRdefType:
+ if ( !mtrr_def_type_msr_set(&v->arch.hvm_vcpu.mtrr, msr_content) )
+ goto gp_fault;
+ break;
+ case MSR_MTRRfix64K_00000:
+ if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr, 0, msr_content) )
+ goto gp_fault;
+ break;
+ case MSR_MTRRfix16K_80000:
+ case MSR_MTRRfix16K_A0000:
+ index = regs->ecx - MSR_MTRRfix16K_80000 + 1;
+ if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
+ index, msr_content) )
+ goto gp_fault;
+ break;
+ case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000:
+ index = regs->ecx - MSR_MTRRfix4K_C0000 + 3;
+ if ( !mtrr_fix_range_msr_set(&v->arch.hvm_vcpu.mtrr,
+ index, msr_content) )
+ goto gp_fault;
+ break;
+ case MSR_IA32_MTRR_PHYSBASE0...MSR_IA32_MTRR_PHYSMASK7:
+ if ( !mtrr_var_range_msr_set(&v->arch.hvm_vcpu.mtrr,
+ regs->ecx, msr_content) )
+ goto gp_fault;
+ break;
+ case MSR_MTRRcap:
+ goto gp_fault;
case MSR_IA32_DEBUGCTLMSR: {
int i, rc = 0;
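
The +1 and +3 offsets in the fixed-range MSR cases above come from treating fixed_ranges[] as eleven 64-bit rows: row 0 for MTRRfix64K_00000, rows 1-2 for the 16KB MSRs, rows 3-10 for the 4KB MSRs. A minimal sketch of that MSR-to-row mapping, using the architectural MSR numbers (0x250, 0x258-0x259, 0x268-0x26f):

#include <stdio.h>

#define MSR_MTRRfix64K_00000 0x250
#define MSR_MTRRfix16K_80000 0x258
#define MSR_MTRRfix4K_C0000  0x268
#define MSR_MTRRfix4K_F8000  0x26f

/* Map a fixed-range MTRR MSR to its row in fixed_ranges[] (11 x 8 bytes). */
static int fixed_msr_to_row(unsigned int msr)
{
    if ( msr == MSR_MTRRfix64K_00000 )
        return 0;
    if ( msr >= MSR_MTRRfix16K_80000 && msr <= MSR_MTRRfix16K_80000 + 1 )
        return msr - MSR_MTRRfix16K_80000 + 1;      /* rows 1-2 */
    if ( msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000 )
        return msr - MSR_MTRRfix4K_C0000 + 3;       /* rows 3-10 */
    return -1;
}

int main(void)
{
    /* Prints "0 2 10". */
    printf("%d %d %d\n",
           fixed_msr_to_row(0x250),
           fixed_msr_to_row(0x259),
           fixed_msr_to_row(0x26f));
    return 0;
}
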
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/mm.c
--- a/xen/arch/x86/mm.c Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/mm.c Mon Oct 22 14:19:22 2007 +0100
@@ -3115,6 +3115,15 @@ long arch_memory_op(int op, XEN_GUEST_HA
case XENMAPSPACE_shared_info:
if ( xatp.idx == 0 )
mfn = virt_to_mfn(d->shared_info);
+        /* XXX: this assumes we are called after the E820 table has been
+         * built, since the E820 table is needed to initialise the MTRRs.
+         */
+ if ( is_hvm_domain(d) ) {
+ extern void init_mtrr_in_hyper(struct vcpu *);
+ struct vcpu *vs;
+ for_each_vcpu(d, vs)
+ init_mtrr_in_hyper(vs);
+ }
break;
case XENMAPSPACE_grant_table:
spin_lock(&d->grant_table->lock);
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/mm/shadow/common.c
--- a/xen/arch/x86/mm/shadow/common.c Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/mm/shadow/common.c Mon Oct 22 14:19:22 2007 +0100
@@ -880,6 +880,14 @@ static void shadow_blow_tables(struct do
flush_tlb_mask(d->domain_dirty_cpumask);
}
+void shadow_blow_tables_per_domain(struct domain *d)
+{
+ if ( shadow_mode_enabled(d) && d->vcpu[0] != NULL ) {
+ shadow_lock(d);
+ shadow_blow_tables(d);
+ shadow_unlock(d);
+ }
+}
#ifndef NDEBUG
/* Blow all shadows of all shadowed domains: this can be used to cause the
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/arch/x86/mm/shadow/multi.c
--- a/xen/arch/x86/mm/shadow/multi.c Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/arch/x86/mm/shadow/multi.c Mon Oct 22 14:19:22 2007 +0100
@@ -33,6 +33,7 @@
#include <asm/shadow.h>
#include <asm/flushtlb.h>
#include <asm/hvm/hvm.h>
+#include <asm/mtrr.h>
#include "private.h"
#include "types.h"
@@ -267,6 +268,11 @@ guest_walk_tables(struct vcpu *v, unsign
* us reflect l2 changes later without touching the l1s. */
int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
_PAGE_ACCESSED|_PAGE_DIRTY);
+        /* Propagate PWT and PCD to level 1 for PSE mappings. */
+ if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PWT) )
+ flags |= _PAGE_PWT;
+ if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PCD) )
+ flags |= _PAGE_PCD;
/* PSE level 2 entries use bit 12 for PAT; propagate it to bit 7
* of the level 1 */
if ( (guest_l2e_get_flags(*gw->l2e) & _PAGE_PSE_PAT) )
@@ -614,7 +620,12 @@ shadow_l4_index(mfn_t *smfn, u32 guest_i
#endif // GUEST_PAGING_LEVELS >= 4
-
+extern u32 get_pat_flags(struct vcpu *v,
+ u32 gl1e_flags,
+ paddr_t gpaddr,
+ paddr_t spaddr);
+
+unsigned char pat_type_2_pte_flags(unsigned char pat_type);
/**************************************************************************/
/* Function which computes shadow entries from their corresponding guest
* entries. This is the "heart" of the shadow code. It operates using
@@ -703,6 +714,17 @@ _sh_propagate(struct vcpu *v,
pass_thru_flags |= _PAGE_NX_BIT;
sflags = gflags & pass_thru_flags;
+ /* Only change memory caching type for pass-through domain */
+ if ( (level == 1) && !list_empty(&(domain_hvm_iommu(d)->pdev_list)) ) {
+ if ( v->domain->arch.hvm_domain.is_in_uc_mode )
+ sflags |= pat_type_2_pte_flags(PAT_TYPE_UNCACHABLE);
+ else
+ sflags |= get_pat_flags(v,
+ gflags,
+ guest_l1e_get_paddr(*gp),
+ mfn_x(target_mfn) << PAGE_SHIFT);
+ }
+
// Set the A&D bits for higher level shadows.
// Higher level entries do not, strictly speaking, have dirty bits, but
// since we use shadow linear tables, each of these entries may, at some
@@ -773,10 +795,6 @@ _sh_propagate(struct vcpu *v,
{
sflags |= _PAGE_USER;
}
-
- /* MMIO addresses should never be cached */
- if ( p2m_is_mmio(p2mt) )
- sflags |= _PAGE_PCD;
*sp = shadow_l1e_from_mfn(target_mfn, sflags);
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/include/asm-x86/cpufeature.h
--- a/xen/include/asm-x86/cpufeature.h Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/include/asm-x86/cpufeature.h Mon Oct 22 14:19:22 2007 +0100
@@ -128,6 +128,7 @@
#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE)
#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
+#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT)
#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR)
@@ -152,6 +153,7 @@
#define cpu_has_tsc 1
#define cpu_has_pae 1
#define cpu_has_pge 1
+#define cpu_has_pat 1
#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
#define cpu_has_sep 0
#define cpu_has_mtrr 1
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/include/asm-x86/hvm/domain.h
--- a/xen/include/asm-x86/hvm/domain.h Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/include/asm-x86/hvm/domain.h Mon Oct 22 14:19:22 2007 +0100
@@ -61,6 +61,12 @@ struct hvm_domain {
unsigned long vmx_apic_access_mfn;
+    /* is_in_uc_mode is set if any VCPU of this domain is in no-fill cache
+     * mode, or if MTRR/PAT state differs between VCPUs.
+     */
+ spinlock_t uc_lock;
+ bool_t is_in_uc_mode;
+
/* Pass-through */
struct hvm_iommu hvm_iommu;
};
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/include/asm-x86/hvm/support.h
--- a/xen/include/asm-x86/hvm/support.h Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/include/asm-x86/hvm/support.h Mon Oct 22 14:19:22 2007 +0100
@@ -64,6 +64,7 @@ static inline vcpu_iodata_t *get_ioreq(s
#define DBG_LEVEL_VLAPIC_INTERRUPT (1 << 8)
#define DBG_LEVEL_IOAPIC (1 << 9)
#define DBG_LEVEL_HCALL (1 << 10)
+#define DBG_LEVEL_MSR (1 << 11)
extern unsigned int opt_hvm_debug_level;
#define HVM_DBG_LOG(level, _f, _a...) \
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/include/asm-x86/hvm/vcpu.h
--- a/xen/include/asm-x86/hvm/vcpu.h Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/include/asm-x86/hvm/vcpu.h Mon Oct 22 14:19:22 2007 +0100
@@ -24,6 +24,7 @@
#include <asm/hvm/vlapic.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/hvm/svm/vmcb.h>
+#include <asm/mtrr.h>
#define HVM_VCPU_INIT_SIPI_SIPI_STATE_NORM 0
#define HVM_VCPU_INIT_SIPI_SIPI_STATE_WAIT_SIPI 1
@@ -62,6 +63,12 @@ struct hvm_vcpu {
struct arch_vmx_struct vmx;
struct arch_svm_struct svm;
} u;
+
+ struct mtrr_state mtrr;
+ u64 pat_cr;
+
+ /* Which cache mode is this VCPU in (CR0:CD/NW)? */
+ u8 cache_mode;
};
#define ARCH_HVM_IO_WAIT 1 /* Waiting for I/O completion */
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/include/asm-x86/msr-index.h
--- a/xen/include/asm-x86/msr-index.h Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/include/asm-x86/msr-index.h Mon Oct 22 14:19:22 2007 +0100
@@ -68,6 +68,25 @@
#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
#define MSR_IA32_LASTINTFROMIP 0x000001dd
#define MSR_IA32_LASTINTTOIP 0x000001de
+
+#define MSR_IA32_MTRR_PHYSBASE0 0x00000200
+#define MSR_IA32_MTRR_PHYSMASK0 0x00000201
+#define MSR_IA32_MTRR_PHYSBASE1 0x00000202
+#define MSR_IA32_MTRR_PHYSMASK1 0x00000203
+#define MSR_IA32_MTRR_PHYSBASE2 0x00000204
+#define MSR_IA32_MTRR_PHYSMASK2 0x00000205
+#define MSR_IA32_MTRR_PHYSBASE3 0x00000206
+#define MSR_IA32_MTRR_PHYSMASK3 0x00000207
+#define MSR_IA32_MTRR_PHYSBASE4 0x00000208
+#define MSR_IA32_MTRR_PHYSMASK4 0x00000209
+#define MSR_IA32_MTRR_PHYSBASE5 0x0000020a
+#define MSR_IA32_MTRR_PHYSMASK5 0x0000020b
+#define MSR_IA32_MTRR_PHYSBASE6 0x0000020c
+#define MSR_IA32_MTRR_PHYSMASK6 0x0000020d
+#define MSR_IA32_MTRR_PHYSBASE7 0x0000020e
+#define MSR_IA32_MTRR_PHYSMASK7 0x0000020f
+
+#define MSR_IA32_CR_PAT 0x00000277
#define MSR_IA32_MC0_CTL 0x00000400
#define MSR_IA32_MC0_STATUS 0x00000401
diff -r 42d8dadb5864 -r 3e7c86602c70 xen/include/asm-x86/mtrr.h
--- a/xen/include/asm-x86/mtrr.h Mon Oct 22 13:04:32 2007 +0100
+++ b/xen/include/asm-x86/mtrr.h Mon Oct 22 14:19:22 2007 +0100
@@ -10,6 +10,55 @@
#define MTRR_TYPE_WRPROT 5
#define MTRR_TYPE_WRBACK 6
#define MTRR_NUM_TYPES 7
+#define MEMORY_NUM_TYPES MTRR_NUM_TYPES
+
+#define MTRR_PHYSMASK_VALID_BIT 11
+#define MTRR_PHYSMASK_SHIFT 12
+
+#define MTRR_PHYSBASE_TYPE_MASK 0xff /* lowest 8 bits */
+#define MTRR_PHYSBASE_SHIFT 12
+#define MTRR_VCNT 8
+
+#define NORMAL_CACHE_MODE 0
+#define NO_FILL_CACHE_MODE 2
+
+enum {
+ PAT_TYPE_UNCACHABLE=0,
+ PAT_TYPE_WRCOMB=1,
+ PAT_TYPE_RESERVED=2,
+ PAT_TYPE_WRTHROUGH=4,
+ PAT_TYPE_WRPROT=5,
+ PAT_TYPE_WRBACK=6,
+ PAT_TYPE_UC_MINUS=7,
+ PAT_TYPE_NUMS
+};
+
+#define INVALID_MEM_TYPE PAT_TYPE_NUMS
+
+/* In the Intel processor's MTRR interface, the MTRR type is always held in
+ an 8 bit field: */
+typedef u8 mtrr_type;
+
+struct mtrr_var_range {
+ u32 base_lo;
+ u32 base_hi;
+ u32 mask_lo;
+ u32 mask_hi;
+};
+
+#define NUM_FIXED_RANGES 88
+struct mtrr_state {
+ struct mtrr_var_range *var_ranges;
+ mtrr_type fixed_ranges[NUM_FIXED_RANGES];
+ unsigned char enabled;
+ unsigned char have_fixed;
+ mtrr_type def_type;
+
+ u64 mtrr_cap;
+    /* Whether any ranges in the variable MSRs overlap (0 = no overlap). */
+ bool_t overlapped;
+ bool_t is_initialized;
+};
extern void mtrr_save_fixed_ranges(void *);
extern void mtrr_save_state(void);