[Xen-devel] [patch 16/21] Xen-paravirt: Add code into head.S to

This adds a second entry point to head.S, which is jumped to when
booted by Xen.  This allows startup under Xen to be easily detected.

Because Xen starts the kernel in a fairly sane state, very little
setup is needed here; it just needs to jump into xen_start_kernel to
init the paravirt_ops structure, and then jump into start_kernel
proper.

This also makes a few small adjustments to the gdt tables to make them
properly suited to Xen.

One warty thing in this patch is the requirement to hard-code the
location of the Xen entrypoint and hypervisor page, rather than
letting the assembler/linker choose an appropriate place.  This is
because these addresses must be converted into a string at compile
time, so the address must be known at compile rather than link time.

Subject: [patch 16/21] Xen-paravirt: Add outline of Xen paravirt interface 
code, plus boot-time init.

Create a new arch/i386/xen/ directory for all the
Xen-specific paravirt code; I'd expect there would be parallel
paravirt-vmi, etc directories.

This also contains an initial set of paravirt ops for Xen, mostly ones
which can just be implemented with the generic native_* version.  At
boot time, the global paravirt_ops structure is populated with the Xen
pointers in xen_start_kernel, which then jumps to the standard
start_kernel.

Hook Xen entrypoint into common paravirt entrypoint.

Register Xen-specific architecture and memory setup functions.

Signed-off-by: Jeremy Fitzhardinge <jeremy@xxxxxxxxxxxxx>

---
 arch/i386/Makefile             |    3 
 arch/i386/kernel/cpu/common.c  |    3 
 arch/i386/kernel/entry.S       |   77 +++
 arch/i386/kernel/head.S        |   12 
 arch/i386/kernel/paravirt.c    |   48 +-
 arch/i386/kernel/vmlinux.lds.S |    1 
 arch/i386/mm/pgtable.c         |    1 
 arch/i386/xen/Makefile         |    2 
 arch/i386/xen/enlighten.c      |  807 +++++++++++++++++++++++++++++++++++++++
 arch/i386/xen/events.c         |  473 ++++++++++++++++++++++
 arch/i386/xen/features.c       |   29 +
 arch/i386/xen/mmu.c            |  419 ++++++++++++++++++++
 arch/i386/xen/mmu.h            |   51 ++
 arch/i386/xen/multicalls.c     |   62 ++
 arch/i386/xen/multicalls.h     |   13 
 arch/i386/xen/setup.c          |   95 ++++
 arch/i386/xen/time.c           |  452 +++++++++++++++++++++
 arch/i386/xen/xen-head.S       |   29 +
 arch/i386/xen/xen-ops.h        |   20 
 include/asm-i386/hypercall.h   |   21 -
 include/asm-i386/irq.h         |    1 
 include/asm-i386/paravirt.h    |   42 ++
 include/asm-i386/pda.h         |   11 
 include/xen/events.h           |   28 +
 include/xen/features.h         |   26 +
 include/xen/page.h             |  175 ++++++++
 26 files changed, 2872 insertions(+), 29 deletions(-)

===================================================================
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -93,6 +93,9 @@ mcore-$(CONFIG_X86_ES7000)    := mach-defau
 mcore-$(CONFIG_X86_ES7000)     := mach-default
 core-$(CONFIG_X86_ES7000)      := arch/i386/mach-es7000/
 
+# Xen paravirtualization support
+core-$(CONFIG_XEN)             += arch/i386/xen/
+
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
===================================================================
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -19,6 +19,7 @@
 #include <mach_apic.h>
 #endif
 #include <asm/pda.h>
+#include <asm/paravirt.h>
 
 #include "cpu.h"
 
@@ -707,6 +708,8 @@ __cpuinit int init_gdt(int cpu, struct t
        pda->cpu_number = cpu;
        pda->pcurrent = idle;
 
+       paravirt_init_pda(pda, cpu);
+
        return 1;
 }
 
===================================================================
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -1001,6 +1001,83 @@ ENTRY(kernel_thread_helper)
        CFI_ENDPROC
 ENDPROC(kernel_thread_helper)
 
+#ifdef CONFIG_XEN
+/* Xen only supports sysenter/sysexit in ring0 guests,
+   and only if the guest asks for it.  So for now,
+   this should never be used. */
+ENTRY(xen_sti_sysexit)
+       CFI_STARTPROC
+       ud2                     /* deliberately fault if this is ever reached */
+       CFI_ENDPROC
+       
+/* Entry point for Xen event upcalls: build a register frame, pass a
+   pointer to it to xen_evtchn_do_upcall, then leave through the
+   normal interrupt return path. */
+ENTRY(xen_hypervisor_callback)
+       CFI_STARTPROC
+       pushl $0                /* zero word ahead of SAVE_ALL (presumably the orig_eax slot - confirm) */
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       mov %esp, %eax          /* first argument: pointer to the saved frame */
+       call xen_evtchn_do_upcall
+       jmp  ret_from_intr
+       CFI_ENDPROC
+       
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we fix up by reattempting the load, and zeroing the segment
+# register if the load fails.
+# Category 2 we fix up by jumping to do_iret_error. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by maintaining a status value in EAX.
+# EAX starts out as 1; every .fixup handler below clears it (together with
+# the saved selector that failed to load) before retrying the load.
+ENTRY(xen_failsafe_callback)
+       CFI_STARTPROC
+       pushl %eax
+       CFI_ADJUST_CFA_OFFSET 4
+       movl $1,%eax
+1:     mov 4(%esp),%ds
+2:     mov 8(%esp),%es
+3:     mov 12(%esp),%fs
+4:     mov 16(%esp),%gs
+       testl %eax,%eax
+       popl %eax
+       CFI_ADJUST_CFA_OFFSET -4
+       jz 5f
+       addl $16,%esp           # EAX != 0 => Category 2 (Bad IRET)
+       CFI_ADJUST_CFA_OFFSET -16
+       jmp iret_exc
+5:     addl $16,%esp           # EAX == 0 => Category 1 (Bad segment)
+       CFI_ADJUST_CFA_OFFSET -16
+       pushl $0
+       CFI_ADJUST_CFA_OFFSET 4
+       SAVE_ALL
+       jmp ret_from_exception
+       CFI_ENDPROC
+       
+# Handlers 6-9 recover from a fault at loads 1-4 respectively: zero EAX
+# and the saved selector on the stack, then jump back to retry the load.
+.section .fixup,"ax"
+6:     xorl %eax,%eax
+       movl %eax,4(%esp)
+       jmp 1b
+7:     xorl %eax,%eax
+       movl %eax,8(%esp)
+       jmp 2b
+8:     xorl %eax,%eax
+       movl %eax,12(%esp)
+       jmp 3b
+9:     xorl %eax,%eax
+       movl %eax,16(%esp)
+       jmp 4b
+.previous
+# Each entry pairs a load that may fault with its fixup handler.
+.section __ex_table,"a"
+       .align 4
+       .long 1b,6b
+       .long 2b,7b
+       .long 3b,8b
+       .long 4b,9b
+.previous
+               
+#endif /* CONFIG_XEN */
+       
 .section .rodata,"a"
 #include "syscall_table.S"
 
===================================================================
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -519,6 +519,10 @@ 1:
        jmp     1b
 #endif
 
+#ifdef CONFIG_XEN
+#include "../xen/xen-head.S"
+#endif
+       
 /*
  * Real beginning of normal "text" segment
  */
@@ -528,7 +532,7 @@ ENTRY(_stext)
 /*
  * BSS section
  */
-.section ".bss.page_aligned","w"
+.section ".bss.page_aligned"
 ENTRY(swapper_pg_dir)
        .fill 1024,4,0
 ENTRY(empty_zero_page)
@@ -598,7 +602,8 @@ ENTRY(boot_gdt_table)
 /*
  * The Global Descriptor Table contains 28 quadwords, per-CPU.
  */
-       .align L1_CACHE_BYTES
+       .section ".data.page_aligned"
+       .align PAGE_SIZE_asm
 ENTRY(cpu_gdt_table)
        .quad 0x0000000000000000        /* NULL descriptor */
        .quad 0x0000000000000000        /* 0x0b reserved */
@@ -647,3 +652,6 @@ ENTRY(cpu_gdt_table)
        .quad 0x0000000000000000        /* 0xf0 - unused */
        .quad 0x0000000000000000        /* 0xf8 - GDT entry 31: double-fault 
TSS */
 
+       /* Be sure this is zeroed to avoid false validations in Xen */
+       .fill PAGE_SIZE_asm / 8 - GDT_ENTRIES,8,0
+       .previous
===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -146,55 +146,55 @@ void init_IRQ(void)
        paravirt_ops.init_IRQ();
 }
 
-static fastcall void native_clts(void)
+fastcall void native_clts(void)
 {
        asm volatile ("clts");
 }
 
-static fastcall unsigned long native_read_cr0(void)
+fastcall unsigned long native_read_cr0(void)
 {
        unsigned long val;
        asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
        return val;
 }
 
-static fastcall void native_write_cr0(unsigned long val)
+fastcall void native_write_cr0(unsigned long val)
 {
        asm volatile("movl %0,%%cr0": :"r" (val));
 }
 
-static fastcall unsigned long native_read_cr2(void)
+fastcall unsigned long native_read_cr2(void)
 {
        unsigned long val;
        asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
        return val;
 }
 
-static fastcall void native_write_cr2(unsigned long val)
+fastcall void native_write_cr2(unsigned long val)
 {
        asm volatile("movl %0,%%cr2": :"r" (val));
 }
 
-static fastcall unsigned long native_read_cr3(void)
+fastcall unsigned long native_read_cr3(void)
 {
        unsigned long val;
        asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
        return val;
 }
 
-static fastcall void native_write_cr3(unsigned long val)
+fastcall void native_write_cr3(unsigned long val)
 {
        asm volatile("movl %0,%%cr3": :"r" (val));
 }
 
-static fastcall unsigned long native_read_cr4(void)
+fastcall unsigned long native_read_cr4(void)
 {
        unsigned long val;
        asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
        return val;
 }
 
-static fastcall unsigned long native_read_cr4_safe(void)
+fastcall unsigned long native_read_cr4_safe(void)
 {
        unsigned long val;
        /* This could fault if %cr4 does not exist */
@@ -207,7 +207,7 @@ static fastcall unsigned long native_rea
        return val;
 }
 
-static fastcall void native_write_cr4(unsigned long val)
+fastcall void native_write_cr4(unsigned long val)
 {
        asm volatile("movl %0,%%cr4": :"r" (val));
 }
@@ -246,12 +246,12 @@ static fastcall void native_halt(void)
        asm volatile("hlt": : :"memory");
 }
 
-static fastcall void native_wbinvd(void)
+fastcall void native_wbinvd(void)
 {
        asm volatile("wbinvd": : :"memory");
 }
 
-static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
+fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
 {
        unsigned long long val;
 
@@ -270,7 +270,7 @@ static fastcall unsigned long long nativ
        return val;
 }
 
-static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
+fastcall int native_write_msr(unsigned int msr, unsigned long long val)
 {
        int err;
        asm volatile("2: wrmsr ; xorl %0,%0\n"
@@ -288,14 +288,14 @@ static fastcall int native_write_msr(uns
        return err;
 }
 
-static fastcall unsigned long long native_read_tsc(void)
+fastcall unsigned long long native_read_tsc(void)
 {
        unsigned long long val;
        asm volatile("rdtsc" : "=A" (val));
        return val;
 }
 
-static fastcall unsigned long long native_read_pmc(void)
+fastcall unsigned long long native_read_pmc(void)
 {
        unsigned long long val;
        asm volatile("rdpmc" : "=A" (val));
@@ -317,17 +317,17 @@ static fastcall void native_load_idt(con
        asm volatile("lidt %0"::"m" (*dtr));
 }
 
-static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
+fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
 {
        asm ("sgdt %0":"=m" (*dtr));
 }
 
-static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
+fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
 {
        asm ("sidt %0":"=m" (*dtr));
 }
 
-static fastcall unsigned long native_store_tr(void)
+fastcall unsigned long native_store_tr(void)
 {
        unsigned long tr;
        asm ("str %0":"=r" (tr));
@@ -336,9 +336,9 @@ static fastcall unsigned long native_sto
 
 static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
 {
-#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
-       C(0); C(1); C(2);
-#undef C
+       get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 0] = t->tls_array[0];
+       get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 1] = t->tls_array[1];
+       get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + 2] = t->tls_array[2];
 }
 
 static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, 
u32 entry_high)
@@ -348,17 +348,17 @@ static inline void native_write_dt_entry
        lp[1] = entry_high;
 }
 
-static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, 
u32 high)
+fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
 {
        native_write_dt_entry(dt, entrynum, low, high);
 }
 
-static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, 
u32 high)
+fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
 {
        native_write_dt_entry(dt, entrynum, low, high);
 }
 
-static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, 
u32 high)
+fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
 {
        native_write_dt_entry(dt, entrynum, low, high);
 }
===================================================================
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -93,6 +93,7 @@ SECTIONS
 
   . = ALIGN(4096);
   .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+       *(.data.page_aligned)
        *(.data.idt)
   }
 
===================================================================
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -267,6 +267,7 @@ static void pgd_ctor(pgd_t *pgd)
                                        swapper_pg_dir + USER_PTRS_PER_PGD,
                                        KERNEL_PGD_PTRS);
                } else {
+                       memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
                        spin_lock_irqsave(&pgd_lock, flags);
                        pgd_list_add(pgd);
                        spin_unlock_irqrestore(&pgd_lock, flags);
===================================================================
--- /dev/null
+++ b/arch/i386/xen/Makefile
@@ -0,0 +1,2 @@
+obj-y          := enlighten.o setup.o events.o time.o \
+                       features.o mmu.o multicalls.o
===================================================================
--- /dev/null
+++ b/arch/i386/xen/enlighten.c
@@ -0,0 +1,807 @@
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/preempt.h>
+#include <linux/percpu.h>
+#include <linux/delay.h>
+#include <linux/start_kernel.h>
+#include <linux/sched.h>
+#include <linux/bootmem.h>
+
+#include <xen/interface/xen.h>
+#include <xen/features.h>
+#include <xen/page.h>
+
+#include <asm/paravirt.h>
+#include <asm/page.h>
+#include <asm/hypercall.h>
+#include <asm/hypervisor.h>
+#include <asm/fixmap.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+
+#include "xen-ops.h"
+#include "mmu.h"
+#include "multicalls.h"
+
+extern struct Xgt_desc_struct cpu_gdt_descr;
+extern struct i386_pda boot_pda;
+extern unsigned long init_pg_tables_end;
+
+static DEFINE_PER_CPU(unsigned, lazy_mode);
+
+/* Code defined in entry.S (not a function) */
+extern const char xen_sti_sysexit[];
+
+struct start_info *xen_start_info;
+
+/* Paravirt patch hook for Xen.  Nothing is patched yet: the type,
+   clobber and firstinsn arguments are ignored and len is returned
+   unchanged. */
+static unsigned xen_patch(u8 type, u16 clobber, void *firstinsn, unsigned len)
+{
+       /* Xen will require relocations to patch calls and jmps, and
+          perhaps chunks of inline code */
+       return len;
+}
+
+/* Print the boot banner: the paravirt_ops name followed by the magic
+   string from the Xen start_info structure. */
+static void __init xen_banner(void)
+{
+       printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+              paravirt_ops.name);
+       printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic);
+}
+
+/* Initialise the Xen-specific part of a CPU's PDA: point xen.vcpu at
+   that CPU's vcpu_info slot in the shared info page and reset the
+   cached cr3 to 0. */
+static void xen_init_pda(struct i386_pda *pda, int cpu)
+{
+       /* Don't re-init boot CPU; we do it once very early in boot,
+          and then cpu_init tries to do it again. If so, just
+          reuse the stuff we already set up. */
+       if (cpu == 0 && pda != &boot_pda) {
+               /* boot_pda must already have been initialised */
+               BUG_ON(boot_pda.xen.vcpu == NULL);
+               pda->xen = boot_pda.xen;
+               return;
+       }
+
+       pda->xen.vcpu = &HYPERVISOR_shared_info->vcpu_info[cpu];
+       pda->xen.cr3 = 0;
+}
+
+/* Run cpuid through XEN_EMULATE_PREFIX.  *eax and *ecx are inputs;
+   all four registers are written back.  For leaf 1 the
+   X86_FEATURE_APIC bit is cleared from the returned EDX. */
+static fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx,
+                              unsigned int *ecx, unsigned int *edx)
+{
+       /* decided before the asm, which overwrites *eax */
+       const unsigned edx_keep = (*eax == 1) ? ~(1 << X86_FEATURE_APIC) : ~0;
+
+       asm(XEN_EMULATE_PREFIX "cpuid"
+           : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
+           : "0" (*eax), "2" (*ecx));
+
+       *edx = *edx & edx_keep;
+}
+
+/* Set debug register reg to val via the set_debugreg hypercall; the
+   hypercall's result is not checked. */
+static fastcall void xen_set_debugreg(int reg, unsigned long val)
+{
+       HYPERVISOR_set_debugreg(reg, val);
+}
+
+/* Return whatever the get_debugreg hypercall reports for register reg. */
+static fastcall unsigned long xen_get_debugreg(int reg)
+{
+       return HYPERVISOR_get_debugreg(reg);
+}
+
+/* Return an EFLAGS-style value derived from this vcpu's event mask:
+   X86_EFLAGS_IF when evtchn_upcall_mask is clear, 0 when it is set.
+   No other flag bits are reported. */
+static fastcall unsigned long xen_save_fl(void)
+{
+       struct vcpu_info *vcpu;
+       unsigned long flags;
+
+       /* keep the PDA lookup and the read of the mask together */
+       preempt_disable();
+       vcpu = read_pda(xen.vcpu);
+       /* flag has opposite sense of mask */
+       flags = !vcpu->evtchn_upcall_mask;
+       preempt_enable();
+
+       /* convert to IF type flag
+          -0 -> 0x00000000
+          -1 -> 0xffffffff
+       */
+       return (-flags) & X86_EFLAGS_IF;
+}
+
+/* Set this vcpu's event mask from an EFLAGS-style value: the mask is
+   cleared when X86_EFLAGS_IF is set in flags and set otherwise.
+   When unmasking, any event already pending is delivered by forcing
+   a callback. */
+static fastcall void xen_restore_fl(unsigned long flags)
+{
+       struct vcpu_info *vcpu;
+
+       preempt_disable();
+
+       /* convert from IF type flag */
+       flags = !(flags & X86_EFLAGS_IF);
+       vcpu = read_pda(xen.vcpu);
+       vcpu->evtchn_upcall_mask = flags;
+       if (flags == 0) {
+               barrier(); /* unmask then check (avoid races) */
+               if (unlikely(vcpu->evtchn_upcall_pending))
+                       force_evtchn_callback();
+               preempt_enable();
+       } else
+               /* events are now masked: drop the preempt count
+                  without a reschedule check */
+               preempt_enable_no_resched();
+}
+
+/* Mask event delivery for this vcpu by setting evtchn_upcall_mask. */
+static fastcall void xen_irq_disable(void)
+{
+       struct vcpu_info *vcpu;
+       preempt_disable();
+       vcpu = read_pda(xen.vcpu);
+       vcpu->evtchn_upcall_mask = 1;
+       /* no reschedule check on the way out */
+       preempt_enable_no_resched();
+}
+
+/* Unmask event delivery for this vcpu, and force a callback if an
+   event is already pending once the mask has been cleared. */
+static fastcall void xen_irq_enable(void)
+{
+       struct vcpu_info *vcpu;
+
+       preempt_disable();
+       vcpu = read_pda(xen.vcpu);
+       vcpu->evtchn_upcall_mask = 0;
+       barrier(); /* unmask then check (avoid races) */
+       if (unlikely(vcpu->evtchn_upcall_pending))
+               force_evtchn_callback();
+       preempt_enable();
+}
+
+/* Idle by blocking in the hypervisor (SCHEDOP_block), bracketed by
+   stop_hz_timer()/start_hz_timer().  BUGs if the hypercall reports
+   failure. */
+static fastcall void xen_safe_halt(void)
+{
+       stop_hz_timer();
+       /* Blocking includes an implicit local_irq_enable(). */
+       if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0)
+               BUG();
+       start_hz_timer();
+}
+
+/* Halt hook.  Currently a no-op: the VCPUOP_down call below is
+   compiled out. */
+static fastcall void xen_halt(void)
+{
+#if 0
+       if (irqs_disabled())
+               HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+#endif
+}
+
+/* Switch this CPU's lazy mode to mode.  Queued multicalls are flushed
+   before the mode changes; a failed flush is a BUG. */
+static void xen_set_lazy_mode(int mode)
+{
+       unsigned *lazy = &get_cpu_var(lazy_mode);
+
+       if (xen_mc_flush())
+               BUG();
+
+       *lazy = mode;
+
+       put_cpu_var(lazy_mode);
+}
+
+/* Return this CPU's current lazy mode, as last stored by
+   xen_set_lazy_mode. */
+static unsigned xen_get_lazy_mode(void)
+{
+       unsigned ret = get_cpu_var(lazy_mode);
+       put_cpu_var(lazy_mode);
+
+       return ret;
+}
+
+/* load_tr_desc hook: intentionally empty under Xen. */
+static fastcall void xen_load_tr_desc(void)
+{
+       /* do nothing */
+}
+
+/* store_tr hook: always reports 0. */
+static fastcall unsigned long xen_store_tr(void)
+{
+       return 0;
+}
+
+/* Queue an MMUEXT_SET_LDT operation describing an LDT at addr with
+   the given number of entries, and flush the multicall batch unless
+   this CPU is in PARAVIRT_LAZY_CPU mode.
+
+   NOTE(review): when addr is non-NULL, arg1.linear_addr is overwritten
+   with a machine address - confirm that is what MMUEXT_SET_LDT expects.
+   NOTE(review): the result of xen_mc_flush() is ignored here, whereas
+   xen_load_tls and the TLB flush helpers BUG on failure. */
+static fastcall void xen_set_ldt(const void *addr, unsigned entries)
+{
+       struct mmuext_op *op;
+       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+       op = mcs.args;
+       op->cmd = MMUEXT_SET_LDT;
+       op->arg1.linear_addr = (unsigned long)addr;
+       if (addr)
+               /* ldt may be vmalloced, use arbitrary_virt_to_machine */
+               op->arg1.linear_addr = arbitrary_virt_to_machine((unsigned 
long)addr).maddr;
+       op->arg2.nr_ents = entries;
+
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU)
+               xen_mc_flush();
+}
+
+/* Install the GDT described by dtr.  Each page the table occupies is
+   made read-only and its machine frame number collected, then the
+   frame list and entry count (size/8) are passed to
+   HYPERVISOR_set_gdt; failure is a BUG.  dtr->size is a limit, so
+   the byte size is dtr->size + 1, and at most 16 pages are accepted. */
+static fastcall void xen_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+        unsigned long va;
+        int f;
+       unsigned size = dtr->size + 1;
+       unsigned long frames[16];
+
+       /* frames[] has room for 16 pages only */
+       BUG_ON(size > 16*PAGE_SIZE);
+
+        for (va = dtr->address, f = 0;
+             va < dtr->address + size;
+             va += PAGE_SIZE, f++) {
+                frames[f] = virt_to_mfn(va);
+               make_lowmem_page_readonly((void *)va);
+        }
+
+       /* This is used very early, so we can't rely on per-cpu data
+          being set up, so no multicalls */
+       if (HYPERVISOR_set_gdt(frames, size/8))
+               BUG();
+}
+
+/* Queue a multicall that writes t->tls_array[i] into TLS slot i
+   (GDT_ENTRY_TLS_MIN + i) of cpu's GDT, addressed by machine address.
+   The caller decides when the batch is flushed. */
+static void load_TLS_descriptor(struct thread_struct *t,
+                               unsigned int cpu, unsigned int i)
+{
+       xmaddr_t maddr = 
virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN+i]);
+       struct multicall_space mc = xen_mc_entry(0);
+
+       MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
+}
+
+/* Queue updates for the three TLS descriptors of thread t in cpu's
+   GDT.  Outside PARAVIRT_LAZY_CPU mode the batch is flushed at once,
+   and a failed flush is a BUG. */
+static fastcall void xen_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+       unsigned int slot;
+
+       for (slot = 0; slot < 3; slot++)
+               load_TLS_descriptor(t, cpu, slot);
+
+       /* in lazy CPU mode the batch is left queued */
+       if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+               return;
+
+       if (xen_mc_flush())
+               BUG();
+}
+
+/* Write the 8-byte descriptor high:low into entry entrynum of the LDT
+   at dt via HYPERVISOR_update_descriptor; failure is a BUG.  Pending
+   multicalls are flushed first (the flush result is not checked). */
+static fastcall void xen_write_ldt_entry(void *dt, int entrynum, u32 low, u32 
high)
+{
+        unsigned long lp = (unsigned long)dt + entrynum * 8;
+        xmaddr_t mach_lp = virt_to_machine(lp);
+       u64 entry = (u64)high << 32 | low;
+
+       xen_mc_flush();
+        if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
+               BUG();
+}
+
+/* Convert the IDT gate low/high for the given vector into *info.
+   Only gate types 0xe and 0xf are converted; for anything else *info
+   is left untouched and 0 is returned.  Returns 1 on success. */
+static int cvt_gate_to_trap(int vector, u32 low, u32 high, struct trap_info *info)
+{
+       const u8 gate_type = (high >> 8) & 0x1f;
+       const int is_intr_gate = (gate_type == 0xe);
+
+       if (!is_intr_gate && gate_type != 0xf)
+               return 0;
+
+       info->vector = vector;
+       /* handler offset is split across the two words */
+       info->address = (low & 0x0000ffff) | (high & 0xffff0000);
+       info->cs = low >> 16;
+       /* low flag bits carry the DPL */
+       info->flags = (high >> 13) & 3;
+       /* interrupt gates clear IF */
+       if (is_intr_gate)
+               info->flags |= 4;
+
+       return 1;
+}
+
+#if 0
+/* Compiled out: splits a descriptor's low/high words into base,
+   limit, type and flags fields. */
+static void unpack_desc(u32 low, u32 high,
+                       unsigned long *base, unsigned long *limit,
+                       unsigned char *type, unsigned char *flags)
+{
+       *base = (high & 0xff000000) | ((high << 16) & 0x00ff0000) | ((low >> 
16) & 0xffff);
+       *limit = (high & 0x000f0000) | (low & 0xffff);
+       *type = (high >> 8) & 0xff;
+       *flags = (high >> 20) & 0xf;
+}
+#endif
+
+/* Locations of each CPU's IDT */
+static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
+
+/* Set an IDT entry.  If the entry is part of the current IDT, then
+   also update Xen.
+
+   The entry is always written into the table at dt.  "Current IDT"
+   means the range recorded for this CPU in idt_desc by xen_load_idt.
+   Xen is told only when the gate converts to a trap (see
+   cvt_gate_to_trap); a failed hypercall is a BUG. */
+static fastcall void xen_write_idt_entry(void *dt, int entrynum, u32 low, u32 
high)
+{
+
+       int cpu = smp_processor_id();
+       unsigned long p = (unsigned long)dt + entrynum * 8;
+       unsigned long start = per_cpu(idt_desc, cpu).address;
+       /* .size is a limit, so the table ends at address + size + 1 */
+       unsigned long end = start + per_cpu(idt_desc, cpu).size + 1;
+
+       xen_mc_flush();
+
+       native_write_idt_entry(dt, entrynum, low, high);
+
+       if (p >= start && (p + 8) <= end) {
+               struct trap_info info[2];
+
+               /* second element, address 0, follows the single entry
+                  (presumably the end-of-list marker) */
+               info[1].address = 0;
+
+               if (cvt_gate_to_trap(entrynum, low, high, &info[0]))
+                       if (HYPERVISOR_set_trap_table(info))
+                               BUG();
+       }
+}
+
+/* Load a new IDT into Xen.
+
+   Records *desc as this CPU's current IDT (consulted later by
+   xen_write_idt_entry), converts every convertible gate in the table
+   to a trap_info entry, appends an entry whose address is 0
+   (presumably the end-of-list marker Xen expects) and passes the
+   list to HYPERVISOR_set_trap_table.  A failed hypercall is a BUG.
+
+   In principle this can be per-CPU, so we hold a spinlock to protect
+   the static traps[] array (static because it avoids allocation, and
+   saves stack space). */
+static fastcall void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+       static DEFINE_SPINLOCK(lock);
+       /* up to 256 gates plus the terminating entry */
+       static struct trap_info traps[257];
+
+       int cpu = smp_processor_id();
+       unsigned in, out, count;
+
+       per_cpu(idt_desc, cpu) = *desc;
+
+       /* desc->size is a limit (byte size minus one), as xen_load_gdt
+          and xen_write_idt_entry also treat it; add one before
+          dividing so the last gate is not dropped. */
+       count = (desc->size + 1) / 8;
+       BUG_ON(count > 256);
+
+       spin_lock(&lock);
+       for(in = out = 0; in < count; in++) {
+               const u32 *entry = (u32 *)(desc->address + in * 8);
+
+               /* gates that do not convert are simply left out */
+               if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+                       out++;
+       }
+       traps[out].address = 0;
+
+       xen_mc_flush();
+       if (HYPERVISOR_set_trap_table(traps))
+               BUG();
+
+       spin_unlock(&lock);
+}
+
+/* Update entry number entry of the GDT at dt with the descriptor
+   high:low.  LDT and TSS descriptors are skipped without touching
+   anything; everything else goes through
+   HYPERVISOR_update_descriptor after pending multicalls have been
+   flushed.  A failed hypercall is a BUG. */
+static fastcall void xen_write_gdt_entry(void *dt, int entry, u32 low, u32 high)
+{
+       const u32 desc_type = (high >> 8) & 0xff;
+
+       /* these descriptor types are handled differently */
+       if (desc_type == DESCTYPE_LDT || desc_type == DESCTYPE_TSS)
+               return;
+
+       xen_mc_flush();
+       if (HYPERVISOR_update_descriptor(virt_to_machine(dt + entry*8).maddr,
+                                        (u64)high << 32 | low))
+               BUG();
+}
+
+/* Tell Xen about the kernel stack for thread (__KERNEL_DS:esp0).
+   In PARAVIRT_LAZY_CPU mode the switch is queued as a multicall;
+   otherwise it is issued immediately and a failure is a BUG.  The
+   tss argument is not used. */
+static fastcall void xen_load_esp0(struct tss_struct *tss,
+                                  struct thread_struct *thread)
+{
+       if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+               struct multicall_space batch = xen_mc_entry(0);
+
+               MULTI_stack_switch(batch.mc, __KERNEL_DS, thread->esp0);
+               return;
+       }
+
+       if (HYPERVISOR_stack_switch(__KERNEL_DS, thread->esp0))
+               BUG();
+}
+
+/* set_iopl_mask hook.  Currently a no-op: the PHYSDEVOP_set_iopl call
+   below is compiled out, so mask is ignored. */
+static fastcall void xen_set_iopl_mask(unsigned mask)
+{
+#if 0
+       struct physdev_set_iopl set_iopl;
+
+       /* Force the change at ring 0. */
+       set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
+       HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+#endif
+}
+
+/* io_delay hook: intentionally empty, no delay is performed. */
+static fastcall void xen_io_delay(void)
+{
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/* APIC write stub: the write is discarded. */
+static fastcall void xen_apic_write(unsigned long reg, unsigned long v)
+{
+}
+
+/* Atomic APIC write stub: the write is discarded. */
+static fastcall void xen_apic_write_atomic(unsigned long reg, unsigned long v)
+{
+}
+
+/* APIC read stub: every register reads back as 0. */
+static fastcall unsigned long xen_apic_read(unsigned long reg)
+{
+       return 0;
+}
+#endif
+
+/* Queue an MMUEXT_TLB_FLUSH_LOCAL operation.  Unless this CPU is in
+   PARAVIRT_LAZY_CPU mode the multicall batch is flushed immediately,
+   and a failed flush is a BUG. */
+static fastcall void xen_flush_tlb(void)
+{
+       struct mmuext_op *op;
+       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+       op = mcs.args;
+       op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush())
+               BUG();
+}
+
+/* Queue an MMUEXT_TLB_FLUSH_ALL operation.  Unless this CPU is in
+   PARAVIRT_LAZY_CPU mode the multicall batch is flushed immediately,
+   and a failed flush is a BUG. */
+static fastcall void xen_flush_tlb_global(void)
+{
+       struct mmuext_op *op;
+       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+       op = mcs.args;
+       op->cmd = MMUEXT_TLB_FLUSH_ALL;
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush())
+               BUG();
+}
+
+/* Queue an MMUEXT_INVLPG_LOCAL operation for the page containing
+   addr.  Unless this CPU is in PARAVIRT_LAZY_CPU mode the multicall
+   batch is flushed immediately, and a failed flush is a BUG. */
+static fastcall void xen_flush_tlb_single(u32 addr)
+{
+       struct mmuext_op *op;
+       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+
+       op = mcs.args;
+       op->cmd = MMUEXT_INVLPG_LOCAL;
+       /* round down to the start of the page */
+       op->arg1.linear_addr = addr & PAGE_MASK;
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU && xen_mc_flush())
+               BUG();
+}
+
+/* Return the cr2 value stored in this vcpu's vcpu_info (arch.cr2)
+   rather than reading the hardware register. */
+static fastcall unsigned long xen_read_cr2(void)
+{
+       return read_pda(xen.vcpu)->arch.cr2;
+}
+
+/* Write cr4 with the X86_CR4_TSD bit forced clear. */
+static fastcall void xen_write_cr4(unsigned long cr4)
+{
+       /* never allow TSC to be disabled */
+       native_write_cr4(cr4 & ~X86_CR4_TSD);
+}
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+/* Return the cr3 value cached in the PDA by xen_write_cr3 (0 until
+   the first write after xen_init_pda). */
+static fastcall unsigned long xen_read_cr3(void)
+{
+       return read_pda(xen.cr3);
+}
+
+/* Switch to the page table whose address is cr3.  Writing the value
+   already cached in the PDA only flushes the local TLB.  Otherwise
+   the cache is updated and an MMUEXT_NEW_BASEPTR operation is queued
+   for the matching machine frame.  Outside PARAVIRT_LAZY_CPU mode the
+   batch is flushed immediately, and a failed flush is a BUG. */
+static fastcall void xen_write_cr3(unsigned long cr3)
+{
+       struct mmuext_op *op;
+       struct multicall_space batch;
+       unsigned long base_mfn;
+
+       if (read_pda(xen.cr3) == cr3) {
+               /* just a simple tlb flush */
+               xen_flush_tlb();
+               return;
+       }
+
+       write_pda(xen.cr3, cr3);
+
+       batch = xen_mc_entry(sizeof(*op));
+       base_mfn = pfn_to_mfn(PFN_DOWN(cr3));
+
+       op = batch.args;
+       op->cmd = MMUEXT_NEW_BASEPTR;
+       op->arg1.mfn = base_mfn;
+
+       MULTI_mmuext_op(batch.mc, op, 1, NULL, DOMID_SELF);
+
+       /* in lazy CPU mode the operation stays queued */
+       if (xen_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+               return;
+
+       if (xen_mc_flush())
+               BUG();
+}
+
+/* alloc_pt hook: make the lowmem mapping of page frame pfn read-only. */
+static fastcall void xen_alloc_pt(u32 pfn)
+{
+       /* XXX pfn isn't necessarily a lowmem page */
+       make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* alloc_pd hook: make the lowmem mapping of page frame pfn read-only. */
+static fastcall void xen_alloc_pd(u32 pfn)
+{
+       make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* release_pd hook: make the lowmem mapping of page frame pfn
+   writable again, then zero the page. */
+static fastcall void xen_release_pd(u32 pfn)
+{
+       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+       /* make sure next person to allocate this page gets a clean
+          pmd */
+       clear_page(__va(PFN_PHYS(pfn)));
+}
+
+/* release_pt hook: make the lowmem mapping of page frame pfn
+   writable again.  Unlike xen_release_pd, the page is not cleared. */
+static fastcall void xen_release_pt(u32 pfn)
+{
+       make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+/* alloc_pd_clone hook: treated exactly like xen_alloc_pd(pfn); the
+   clonepfn, start and count arguments are ignored. */
+static fastcall void xen_alloc_pd_clone(u32 pfn, u32 clonepfn,
+                                       u32 start, u32 count)
+{
+       xen_alloc_pd(pfn);
+}
+
+/* First half of page table setup: make base the pgd of init_mm, fill
+   it from the page table Xen supplied (xen_start_info->pt_base), and
+   switch to it with xen_write_cr3 before the generic pagetable_init
+   work runs. */
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+       pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
+
+       init_mm.pgd = base;
+
+       /* copy top-level of Xen-supplied pagetable into place.  For
+          !PAE we can use this as-is, but for PAE it is a stand-in
+          while we copy the pmd pages. */
+       memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
+
+       if (PTRS_PER_PMD > 1) {
+               int i;
+
+               /* For PAE, need to allocate new pmds, rather than
+                  share Xen's, since Xen doesn't like pmd's being
+                  shared between address spaces, even though in this
+                  case they're effectively the same address space. */
+               for(i = 0; i < PTRS_PER_PGD; i++) {
+                       if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
+                               pmd_t *pmd = (pmd_t 
*)alloc_bootmem_low_pages(PAGE_SIZE);
+
+                               memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
+                                      PAGE_SIZE);
+
+                               /* the copy must be read-only before it
+                                  is hooked into the pgd */
+                               xen_alloc_pd(PFN_DOWN(__pa(pmd)));
+
+                               /* the "1 +" presumably sets
+                                  _PAGE_PRESENT - confirm */
+                               set_pgd(&base[i], __pgd(1 + __pa(pmd)));
+                       } else
+                               pgd_clear(&base[i]);
+               }
+       }
+
+       /* make sure the zero_page is mapped RO so we
+          can use it in pagetables */
+       make_lowmem_page_readonly(empty_zero_page);
+       make_lowmem_page_readonly(base);
+
+       /* Switch to new pagetable.  This is done before
+          pagetable_init has done anything so that the new pages
+          added to the table can be prepared properly for Xen.  */
+       printk("about to switch to new pagetable %p...\n", base);
+       xen_write_cr3(__pa(base));
+       printk("done\n");
+}
+
+/* Second half of page table setup, run once init_mm has its new page
+   table: reload the GDT, establish the HYPERVISOR_shared_info
+   pointer, pin the pgd with xen_pgd_pin, and point this CPU's PDA at
+   its vcpu_info slot in the shared info page. */
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+       /* init_mm has a new pagetable set up - make sure the GDT page
+          is still read-only in the new pagetable */
+       xen_load_gdt(&cpu_gdt_descr);
+
+       if (!xen_feature(XENFEAT_writable_page_tables)) {
+               /* Create a mapping for the shared info page.
+                  Should be set_fixmap(), but shared_info is a machine
+                  address with no corresponding pseudo-phys address. */
+               set_pte_mfn(fix_to_virt(FIX_PARAVIRT),
+                           PFN_DOWN(xen_start_info->shared_info),
+                           PAGE_KERNEL);
+               
+               HYPERVISOR_shared_info =
+                       (struct shared_info *)fix_to_virt(FIX_PARAVIRT);
+       } else
+               /* here shared_info is converted with __va() directly */
+               HYPERVISOR_shared_info =
+                       (struct shared_info *)__va(xen_start_info->shared_info);
+
+       xen_pgd_pin(base);
+
+       write_pda(xen.vcpu, 
&HYPERVISOR_shared_info->vcpu_info[smp_processor_id()]);
+}
+
+static const struct paravirt_ops xen_paravirt_ops __initdata = { /* Xen hook table; copied into the global paravirt_ops by xen_start_kernel() */
+       .paravirt_enabled = 1,
+       .shared_kernel_pmd = 0,
+       .pgd_alignment = PAGE_SIZE,
+
+       .name = "Xen",
+       .banner = xen_banner,
+
+       .patch = xen_patch,
+
+       .memory_setup = xen_memory_setup,
+       .arch_setup = xen_arch_setup,
+       .init_IRQ = xen_init_IRQ,
+       .time_init = xen_time_init,
+       .init_pda = xen_init_pda,
+
+       .cpuid = xen_cpuid,
+
+       .set_debugreg = xen_set_debugreg,
+       .get_debugreg = xen_get_debugreg,
+
+       .clts = native_clts,
+
+       .read_cr0 = native_read_cr0,
+       .write_cr0 = native_write_cr0,
+
+       .read_cr2 = xen_read_cr2,
+       .write_cr2 = native_write_cr2,
+
+       .read_cr3 = xen_read_cr3,
+       .write_cr3 = xen_write_cr3,
+
+       .read_cr4 = native_read_cr4,
+       .read_cr4_safe = native_read_cr4_safe,
+       .write_cr4 = xen_write_cr4,
+
+       .save_fl = xen_save_fl,
+       .restore_fl = xen_restore_fl,
+       .irq_disable = xen_irq_disable,
+       .irq_enable = xen_irq_enable,
+       .safe_halt = xen_safe_halt,
+       .halt = xen_halt,
+       .wbinvd = native_wbinvd,
+
+       .read_msr = native_read_msr,
+       .write_msr = native_write_msr,
+       .read_tsc = native_read_tsc,
+       .read_pmc = native_read_pmc,
+
+       .iret = (void (fastcall *)(void))&hypercall_page[__HYPERVISOR_iret], /* iret goes through the hypercall page slot for __HYPERVISOR_iret */
+       .irq_enable_sysexit = (void (fastcall *)(void))xen_sti_sysexit,
+
+       .load_tr_desc = xen_load_tr_desc,
+       .set_ldt = xen_set_ldt,
+       .load_gdt = xen_load_gdt,
+       .load_idt = xen_load_idt,
+       .load_tls = xen_load_tls,
+
+       .store_gdt = native_store_gdt,
+       .store_idt = native_store_idt,
+       .store_tr = xen_store_tr,
+
+       .write_ldt_entry = xen_write_ldt_entry,
+       .write_gdt_entry = xen_write_gdt_entry,
+       .write_idt_entry = xen_write_idt_entry,
+       .load_esp0 = xen_load_esp0,
+
+       .set_iopl_mask = xen_set_iopl_mask,
+       .io_delay = xen_io_delay,
+       .const_udelay = __const_udelay,
+       .set_wallclock = xen_set_wallclock,
+       .get_wallclock = xen_get_wallclock,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       .apic_write = xen_apic_write,
+       .apic_write_atomic = xen_apic_write_atomic,
+       .apic_read = xen_apic_read,
+       .setup_boot_clock = (void *)native_nop,
+       .setup_secondary_clock = (void *)native_nop,
+#endif
+
+       .flush_tlb_user = xen_flush_tlb,
+       .flush_tlb_kernel = xen_flush_tlb_global,
+       .flush_tlb_single = xen_flush_tlb_single,
+
+       .pte_update = (void *)native_nop, /* NOTE(review): mmu.c also defines an empty xen_pte_update(); it is not used here */
+       .pte_update_defer = (void *)native_nop,
+
+       .pagetable_setup_start = xen_pagetable_setup_start,
+       .pagetable_setup_done = xen_pagetable_setup_done,
+       .activate_mm = xen_activate_mm,
+       .dup_mmap = xen_dup_mmap,
+       .exit_mmap = xen_exit_mmap,
+
+       .set_pte = xen_set_pte,
+       .set_pte_at = xen_set_pte_at,
+       .set_pmd = xen_set_pmd,
+
+       .alloc_pt = xen_alloc_pt,
+       .alloc_pd = xen_alloc_pd,
+       .alloc_pd_clone = xen_alloc_pd_clone,
+       .release_pd = xen_release_pd,
+       .release_pt = xen_release_pt,
+
+       .pte_val = xen_pte_val,
+       .pmd_val = xen_pmd_val,
+       .pgd_val = xen_pgd_val,
+
+       .make_pte = xen_make_pte,
+       .make_pmd = xen_make_pmd,
+       .make_pgd = xen_make_pgd,
+
+       .ptep_get_and_clear = xen_ptep_get_and_clear,
+
+#ifdef CONFIG_X86_PAE
+       .set_pte_atomic = xen_set_pte, /* NOTE(review): xen_set_pte_atomic() exists in mmu.c but xen_set_pte is installed - confirm intent */
+       .set_pte_present = xen_set_pte_at,
+       .set_pud = xen_set_pud,
+       .pte_clear = xen_pte_clear,
+       .pmd_clear = xen_pmd_clear,
+#endif /* PAE */
+
+       .set_lazy_mode = xen_set_lazy_mode,
+       .startup_ipi_hook = (void *)native_nop,
+};
+
+/*
+ * First C function to be called on Xen boot.
+ *
+ * Returns immediately when xen_start_info is not set.  Otherwise it
+ * installs xen_paravirt_ops, adopts the pagetables and GDT that Xen
+ * provided, sets up the boot PDA, fills in the boot parameters and
+ * finally calls start_kernel().
+ */
+static asmlinkage void __init xen_start_kernel(void)
+{
+       u32 low, high;
+       pgd_t *pgd;
+
+       if (!xen_start_info)
+               return;
+
+       BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0);
+
+       /* Install Xen paravirt ops */
+       paravirt_ops = xen_paravirt_ops;
+
+       xen_setup_features();
+
+       /* Get mfn list */
+       if (!xen_feature(XENFEAT_auto_translated_physmap))
+               phys_to_machine_mapping =
+                       (unsigned long *)xen_start_info->mfn_list;
+
+       pgd = (pgd_t *)xen_start_info->pt_base;
+
+       init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+
+       /* set up the boot-time gdt and segments */
+       init_mm.pgd = pgd; /* use the Xen pagetables to start */
+
+       xen_load_gdt(&cpu_gdt_descr);
+
+       /* set up PDA descriptor */
+       pack_descriptor(&low, &high, (unsigned)&boot_pda, sizeof(boot_pda)-1,
+                       0x80 | DESCTYPE_S | 0x02, 0);
+
+       /* Use hypercall directly, because xen_write_gdt_entry can't
+        * be used until batched multicalls work. */
+       if (HYPERVISOR_update_descriptor(virt_to_machine(cpu_gdt_table +
+                                                        GDT_ENTRY_PDA).maddr,
+                                        (u64)high << 32 | low))
+               BUG();
+
+       /* set up %gs and init Xen parts of the PDA */
+       asm volatile("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory");
+       xen_init_pda(&boot_pda, 0);
+       boot_pda.xen.cr3 = __pa(pgd);
+
+       /* RPL 0 only when Xen reports the supervisor-mode-kernel feature */
+       paravirt_ops.kernel_rpl =
+               xen_feature(XENFEAT_supervisor_mode_kernel) ? 0 : 1;
+
+       /* set the limit of our address space */
+       reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE);
+
+       /* set up basic CPUID stuff */
+       cpu_detect(&new_cpu_data);
+       new_cpu_data.hard_math = 1;
+       identify_cpu(&new_cpu_data);
+
+       /* Poke various useful things into boot_params */
+       LOADER_TYPE = (9 << 4) | 0;
+       INITRD_START = xen_start_info->mod_start ?
+               __pa(xen_start_info->mod_start) : 0;
+       INITRD_SIZE = xen_start_info->mod_len;
+
+       /* Start the world */
+       start_kernel();
+}
+
+paravirt_probe(xen_start_kernel); /* hooks the Xen entry point into the common paravirt entry point; NOTE(review): macro defined outside this chunk */
===================================================================
--- /dev/null
+++ b/arch/i386/xen/events.c
@@ -0,0 +1,473 @@
+#include <linux/linkage.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#include <asm/ptrace.h>
+#include <asm/irq.h>
+#include <asm/sync_bitops.h>
+#include <asm/hypercall.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/event_channel.h>
+
+#include "xen-ops.h"
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static DEFINE_SPINLOCK(irq_mapping_update_lock);
+
+/* Per-CPU VIRQ -> IRQ mapping; -1 means the VIRQ has no IRQ bound. */
+DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
+
+/* Packed IRQ information: binding type, sub-type index, and event channel
+   (layout is defined by mk_irq_info() and the *_from_irq() accessors). */
+static u32 irq_info[NR_IRQS];
+
+/* Binding types. */
+enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN };
+
+/* Convenient shorthand for packed representation of an unbound IRQ. */
+#define IRQ_UNBOUND    mk_irq_info(IRQT_UNBOUND, 0, 0)
+
+static int evtchn_to_irq[NR_EVENT_CHANNELS] = { /* event channel -> IRQ; -1 when unbound */
+       [0 ... NR_EVENT_CHANNELS-1] = -1
+};
+static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; /* per-CPU bitmap of channels routed to that CPU */
+static u8 cpu_evtchn[NR_EVENT_CHANNELS]; /* CPU each channel is currently routed to */
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Xen will never allocate port zero for any purpose. */
+#define VALID_EVTCHN(chn)      ((chn) != 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ *
+ * The hypercall used is HYPERVISOR_xen_version(0, NULL); its result is
+ * deliberately discarded, only the round trip into Xen matters.
+ */
+void force_evtchn_callback(void)
+{
+       (void)HYPERVISOR_xen_version(0, NULL);
+}
+EXPORT_SYMBOL_GPL(force_evtchn_callback);
+
+static struct irq_chip xen_dynamic_chip; /* forward declaration; the initialised definition follows the chip callbacks */
+
+/*
+ * Build the packed irq_info word: @type is shifted to bit 24, @index to
+ * bit 16, and @evtchn occupies the low bits.  The arguments are not
+ * range-checked.
+ */
+static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn)
+{
+       u32 packed = evtchn;
+
+       packed |= index << 16;
+       packed |= type << 24;
+
+       return packed;
+}
+
+/*
+ * Accessors for packed IRQ information.
+ */
+
+/* Event channel of @irq: the low 16 bits of its irq_info word. */
+static inline unsigned int evtchn_from_irq(int irq)
+{
+       u32 info = irq_info[irq];
+
+       return info & 0xffff;
+}
+
+/* Sub-type index of @irq: bits 16-23 of its irq_info word. */
+static inline unsigned int index_from_irq(int irq)
+{
+       u32 info = irq_info[irq];
+
+       return (info >> 16) & 0xff;
+}
+
+/* Binding type (IRQT_*) of @irq: bits 24-31 of its irq_info word. */
+static inline unsigned int type_from_irq(int irq)
+{
+       u32 info = irq_info[irq];
+
+       return (info >> 24) & 0xff;
+}
+
+/*
+ * Return the event bits of bitmap word @idx that are pending in @sh,
+ * routed to @cpu according to cpu_evtchn_mask, and not masked in @sh.
+ */
+static inline unsigned long active_evtchns(unsigned int cpu,
+                                          struct shared_info *sh,
+                                          unsigned int idx)
+{
+       unsigned long pending = sh->evtchn_pending[idx];
+       unsigned long routed = cpu_evtchn_mask[cpu][idx];
+       unsigned long unmasked = ~sh->evtchn_mask[idx];
+
+       return pending & routed & unmasked;
+}
+
+/*
+ * Record that event channel @chn is now delivered to @cpu: update the
+ * IRQ's affinity info, move the channel's bit from the previous CPU's
+ * mask to @cpu's mask, and remember the new owner in cpu_evtchn[].
+ * BUGs if @chn has no IRQ bound.
+ */
+static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+       unsigned int prev_cpu;
+       int irq = evtchn_to_irq[chn];
+
+       BUG_ON(irq == -1);
+       set_native_irq_info(irq, cpumask_of_cpu(cpu));
+
+       prev_cpu = cpu_evtchn[chn];
+       __clear_bit(chn, (unsigned long *)cpu_evtchn_mask[prev_cpu]);
+       __set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
+
+       cpu_evtchn[chn] = cpu;
+}
+
+/*
+ * Reset the routing tables so that every IRQ and every event channel
+ * is attributed to CPU 0.
+ */
+static void init_evtchn_cpu_bindings(void)
+{
+       int irq = 0;
+
+       /* By default all event channels notify CPU#0. */
+       while (irq < NR_IRQS) {
+               set_native_irq_info(irq, cpumask_of_cpu(0));
+               irq++;
+       }
+
+       /* Owner of every channel is CPU 0; all bits set in CPU 0's mask. */
+       memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
+       memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+}
+
+/* CPU that event channel @evtchn is currently routed to. */
+static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
+{
+       unsigned int cpu = cpu_evtchn[evtchn];
+
+       return cpu;
+}
+
+/* Clear the pending bit of event channel @port in the shared info page. */
+static inline void clear_evtchn(int port)
+{
+       sync_clear_bit(port, &HYPERVISOR_shared_info->evtchn_pending[0]);
+}
+
+/* Set the pending bit of event channel @port in the shared info page. */
+static inline void set_evtchn(int port)
+{
+       sync_set_bit(port, &HYPERVISOR_shared_info->evtchn_pending[0]);
+}
+
+
+/**
+ * notify_remote_via_irq - send event to remote end of event channel via irq
+ * @irq: irq of event channel to send event to
+ *
+ * Unlike notify_remote_via_evtchn(), this is safe to use across
+ * save/restore. Notifications on a broken connection are silently
+ * dropped: nothing is sent when @irq has no valid event channel.
+ */
+void notify_remote_via_irq(int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       if (!VALID_EVTCHN(evtchn))
+               return;
+
+       notify_remote_via_evtchn(evtchn);
+}
+EXPORT_SYMBOL_GPL(notify_remote_via_irq);
+
+/* Set the mask bit of event channel @port in the shared info page. */
+void mask_evtchn(int port)
+{
+       sync_set_bit(port, &HYPERVISOR_shared_info->evtchn_mask[0]);
+}
+EXPORT_SYMBOL_GPL(mask_evtchn);
+
+/*
+ * Unmask event channel @port.  Must be called with interrupts disabled.
+ *
+ * A port routed to another CPU is unmasked by hypercall.  A local port
+ * has its mask bit cleared directly; if an event is already pending on
+ * it, the upcall is re-armed by hand.
+ */
+void unmask_evtchn(int port)
+{
+       struct shared_info *shinfo = HYPERVISOR_shared_info;
+       unsigned int this_cpu = smp_processor_id();
+       struct vcpu_info *vcpu = read_pda(xen.vcpu);
+
+       BUG_ON(!irqs_disabled());
+
+       /* Slow path (hypercall) if this is a non-local port. */
+       if (unlikely(this_cpu != cpu_from_evtchn(port))) {
+               struct evtchn_unmask unmask = { .port = port };
+               (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
+               return;
+       }
+
+       sync_clear_bit(port, &shinfo->evtchn_mask[0]);
+
+       /*
+        * The following is basically the equivalent of 'hw_resend_irq'. Just
+        * like a real IO-APIC we 'lose the interrupt edge' if the channel is
+        * masked.
+        */
+       if (!sync_test_bit(port, &shinfo->evtchn_pending[0]))
+               return;
+
+       if (!sync_test_and_set_bit(port / BITS_PER_LONG,
+                                  &vcpu->evtchn_pending_sel))
+               vcpu->evtchn_upcall_pending = 1;
+}
+EXPORT_SYMBOL_GPL(unmask_evtchn);
+
+/*
+ * Return the lowest IRQ number in [0, NR_IRQS) whose bind count is
+ * zero.  Panics when every IRQ is in use.
+ */
+static int find_unbound_irq(void)
+{
+       int irq = 0;
+
+       while (irq < NR_IRQS && irq_bindcount[irq] != 0)
+               irq++;
+
+       if (irq == NR_IRQS)
+               panic("No available IRQ to bind to: increase NR_IRQS!\n");
+
+       return irq;
+}
+
+/*
+ * Return the IRQ bound to event channel @evtchn, allocating and
+ * initialising a fresh one if the channel has none yet.  Each call
+ * takes one reference on the IRQ (irq_bindcount).  Serialised by
+ * irq_mapping_update_lock.
+ */
+static int bind_evtchn_to_irq(unsigned int evtchn)
+{
+       int irq;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       irq = evtchn_to_irq[evtchn];
+
+       if (irq == -1) {
+               irq = find_unbound_irq();
+
+               dynamic_irq_init(irq);
+               set_irq_chip_and_handler(irq, &xen_dynamic_chip,
+                                        handle_level_irq);
+
+               evtchn_to_irq[evtchn] = irq;
+               irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
+       }
+
+       irq_bindcount[irq]++;
+
+       spin_unlock(&irq_mapping_update_lock);
+
+       return irq;
+}
+
+/*
+ * Return the IRQ bound to @virq on @cpu.  If there is none, ask Xen to
+ * bind the VIRQ to an event channel (BUG on failure), allocate an IRQ
+ * for that channel and route the channel to @cpu.  Each call takes one
+ * reference on the IRQ.  Serialised by irq_mapping_update_lock.
+ */
+static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
+{
+       struct evtchn_bind_virq bind_virq;
+       int evtchn, irq;
+
+       spin_lock(&irq_mapping_update_lock);
+
+       irq = per_cpu(virq_to_irq, cpu)[virq];
+
+       if (irq == -1) {
+               bind_virq.virq = virq;
+               bind_virq.vcpu = cpu;
+               if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+                                               &bind_virq) != 0)
+                       BUG();
+               evtchn = bind_virq.port;
+
+               irq = find_unbound_irq();
+
+               dynamic_irq_init(irq);
+               set_irq_chip_and_handler(irq, &xen_dynamic_chip,
+                                        handle_level_irq);
+
+               evtchn_to_irq[evtchn] = irq;
+               irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
+
+               per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+               bind_evtchn_to_cpu(evtchn, cpu);
+       }
+
+       irq_bindcount[irq]++;
+
+       spin_unlock(&irq_mapping_update_lock);
+
+       return irq;
+}
+
+/*
+ * Drop one reference on @irq.  When the last reference goes away the
+ * event channel is closed (BUG on failure), the VIRQ mapping is cleared
+ * for VIRQ bindings, and the IRQ is returned to the unbound state.
+ * Does nothing if @irq has no valid event channel.
+ */
+static void unbind_from_irq(unsigned int irq)
+{
+       struct evtchn_close close;
+       int evtchn = evtchn_from_irq(irq);
+
+       spin_lock(&irq_mapping_update_lock);
+
+       /* Only the final unbind tears anything down. */
+       if (!VALID_EVTCHN(evtchn) || --irq_bindcount[irq] != 0)
+               goto out;
+
+       close.port = evtchn;
+       if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
+               BUG();
+
+       if (type_from_irq(irq) == IRQT_VIRQ)
+               per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
+                       [index_from_irq(irq)] = -1;
+
+       /* Closed ports are implicitly re-bound to VCPU0. */
+       bind_evtchn_to_cpu(evtchn, 0);
+
+       evtchn_to_irq[evtchn] = -1;
+       irq_info[irq] = IRQ_UNBOUND;
+
+       dynamic_irq_init(irq);
+
+out:
+       spin_unlock(&irq_mapping_update_lock);
+}
+
+/*
+ * Bind event channel @evtchn to an IRQ and install @handler on it via
+ * request_irq().  Returns the IRQ number on success; if request_irq()
+ * fails the binding is dropped again and its error code is returned.
+ */
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+                             irqreturn_t (*handler)(int, void *),
+                             unsigned long irqflags, const char *devname,
+                             void *dev_id)
+{
+       unsigned int irq;
+       int retval;
+
+       irq = bind_evtchn_to_irq(evtchn);
+       retval = request_irq(irq, handler, irqflags, devname, dev_id);
+       if (retval != 0) {
+               unbind_from_irq(irq);
+               return retval;
+       }
+
+       return irq;
+}
+EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
+
+/*
+ * Bind @virq on @cpu to an IRQ and install @handler on it via
+ * request_irq().  Returns the IRQ number on success; if request_irq()
+ * fails the binding is dropped again and its error code is returned.
+ */
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+                           irqreturn_t (*handler)(int, void *),
+                           unsigned long irqflags, const char *devname,
+                           void *dev_id)
+{
+       unsigned int irq;
+       int retval;
+
+       irq = bind_virq_to_irq(virq, cpu);
+       retval = request_irq(irq, handler, irqflags, devname, dev_id);
+       if (retval != 0) {
+               unbind_from_irq(irq);
+               return retval;
+       }
+
+       return irq;
+}
+EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
+
+void unbind_from_irqhandler(unsigned int irq, void *dev_id) /* undo bind_*_to_irqhandler() for @irq / @dev_id */
+{
+       free_irq(irq, dev_id); /* remove the handler first ... */
+       unbind_from_irq(irq); /* ... then drop the reference on the binding */
+}
+EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
+
+/*
+  Search this CPU's pending-event bitmasks.  For each event found, map
+  the event number to an irq, and feed it into do_IRQ() for
+  handling.
+
+  Xen uses a two-level bitmap to speed searching.  The first level
+  (vcpu_info->evtchn_pending_sel) is a bitset of words which contain
+  pending event bits.  The second level is a bitset of the pending
+  events themselves, filtered through active_evtchns().
+
+  The inner loop re-evaluates active_evtchns() on every pass, so it
+  only terminates once no active bit is left in the word.
+  NOTE(review): a pending, unmasked port with no irq bound is never
+  cleared here - presumably unbound ports are always masked; confirm.
+*/
+asmlinkage fastcall void xen_evtchn_do_upcall(struct pt_regs *regs)
+{
+       int cpu = smp_processor_id();
+       struct shared_info *s = HYPERVISOR_shared_info;
+       struct vcpu_info *vcpu_info = read_pda(xen.vcpu);
+       unsigned long pending_words;
+
+       vcpu_info->evtchn_upcall_pending = 0;
+
+       /* NB. No need for a barrier here -- XCHG is a barrier on x86.
+          The xchg atomically fetches and clears the first-level selector. */
+       pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+       while (pending_words != 0) {
+               unsigned long pending_bits;
+               int word_idx = __ffs(pending_words);
+               pending_words &= ~(1UL << word_idx);
+
+               while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+                       int bit_idx = __ffs(pending_bits);
+                       int port = (word_idx * BITS_PER_LONG) + bit_idx;
+                       int irq = evtchn_to_irq[port];
+
+                       if (irq != -1) {
+                               regs->orig_eax = ~irq; /* irq number is handed to do_IRQ() complemented */
+                               do_IRQ(regs);
+                       }
+               }
+       }
+}
+
+/*
+ * Rebind the event channel behind @irq so that it gets delivered to
+ * @tcpu.  Does nothing if @irq has no valid event channel.
+ */
+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+       struct evtchn_bind_vcpu bind_vcpu;
+       int evtchn = evtchn_from_irq(irq);
+
+       if (!VALID_EVTCHN(evtchn))
+               return;
+
+       /* Send future instances of this interrupt to other vcpu. */
+       bind_vcpu.port = evtchn;
+       bind_vcpu.vcpu = tcpu;
+
+       /*
+        * If this fails, it usually just indicates that we're dealing with a
+        * virq or IPI channel, which don't actually need to be rebound. Ignore
+        * it, but don't do the xenlinux-level rebind in that case.
+        */
+       if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) < 0)
+               return;
+
+       bind_evtchn_to_cpu(evtchn, tcpu);
+}
+
+
+/* irq_chip set_affinity hook: route @irq to the first CPU in @dest. */
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
+{
+       rebind_irq_to_cpu(irq, first_cpu(dest));
+}
+
+/* irq_chip unmask hook: unmask the event channel behind @irq, if any. */
+static void enable_dynirq(unsigned int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       if (!VALID_EVTCHN(evtchn))
+               return;
+
+       unmask_evtchn(evtchn);
+}
+
+/* irq_chip mask hook: mask the event channel behind @irq, if any. */
+static void disable_dynirq(unsigned int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       if (!VALID_EVTCHN(evtchn))
+               return;
+
+       mask_evtchn(evtchn);
+}
+
+/*
+ * irq_chip ack hook: call move_native_irq() for @irq, then clear the
+ * pending bit of its event channel, if it has one.
+ */
+static void ack_dynirq(unsigned int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       move_native_irq(irq);
+
+       if (!VALID_EVTCHN(evtchn))
+               return;
+
+       clear_evtchn(evtchn);
+}
+
+/*
+ * irq_chip retrigger hook: mark the event channel behind @irq pending
+ * again.  Returns 1 if the channel was valid and has been set, else 0.
+ */
+static int retrigger_dynirq(unsigned int irq)
+{
+       int evtchn = evtchn_from_irq(irq);
+
+       if (!VALID_EVTCHN(evtchn))
+               return 0;
+
+       set_evtchn(evtchn);
+       return 1;
+}
+
+/* irq_chip installed on every IRQ allocated by the bind_*_to_irq() helpers. */
+static struct irq_chip xen_dynamic_chip __read_mostly = {
+       .name           = "xen-virq",
+       .ack            = ack_dynirq,
+       .mask           = disable_dynirq,
+       .unmask         = enable_dynirq,
+       .retrigger      = retrigger_dynirq,
+       .set_affinity   = set_affinity_irq,
+};
+
+/*
+ * Boot-time IRQ initialisation under Xen: reset the channel routing
+ * tables, mask every event channel, clear all IRQ bind counts and set
+ * up the irq context for the current CPU.
+ */
+void __init xen_init_IRQ(void)
+{
+       int port;
+
+       init_evtchn_cpu_bindings();
+
+       /* No event channels are 'live' right now. */
+       for (port = 0; port < NR_EVENT_CHANNELS; port++)
+               mask_evtchn(port);
+
+       /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+       memset(irq_bindcount, 0, sizeof(irq_bindcount));
+
+       irq_ctx_init(smp_processor_id());
+}
===================================================================
--- /dev/null
+++ b/arch/i386/xen/features.c
@@ -0,0 +1,29 @@
+/******************************************************************************
+ * features.c
+ *
+ * Xen feature flags.
+ *
+ * Copyright (c) 2006, Ian Campbell, XenSource Inc.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/module.h>
+#include <asm/hypervisor.h>
+#include <xen/features.h>
+
+u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; /* one byte per feature flag: entry i*32+j is 1 iff bit j of submap i is set */
+EXPORT_SYMBOL_GPL(xen_features);
+
+/*
+ * Query Xen for its feature submaps and expand each 32-bit submap into
+ * xen_features[], one byte (0 or 1) per feature bit.  Stops at the
+ * first submap the hypervisor refuses; entries for that and later
+ * submaps are left untouched.
+ */
+void xen_setup_features(void)
+{
+       struct xen_feature_info fi;
+       int i, j;
+
+       for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
+               fi.submap_idx = i;
+               if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
+                       break;
+               /* 1U: shifting a signed 1 into bit 31 is undefined */
+               for (j = 0; j < 32; j++)
+                       xen_features[i*32+j] = !!(fi.submap & (1U << j));
+       }
+}
===================================================================
--- /dev/null
+++ b/arch/i386/xen/mmu.c
@@ -0,0 +1,419 @@
+//#include <linux/bug.h>
+#include <asm/bug.h>
+
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+
+#include <asm/hypercall.h>
+#include <asm/paravirt.h>
+
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+
+/*
+ * Translate the kernel virtual address @address to a machine address
+ * by looking up its pte: machine frame of the pte plus the offset of
+ * @address within its page.  BUGs if no pte maps @address.
+ */
+xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+{
+       pte_t *pte = lookup_address(address);
+       /* PAGE_MASK selects the frame bits; its complement is the offset */
+       unsigned offset = address & ~PAGE_MASK;
+
+       BUG_ON(pte == NULL);
+
+       return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+}
+
+/*
+ * Write-protect the kernel mapping of @vaddr.  With writable page
+ * tables the pte is stored directly, otherwise it is updated through
+ * HYPERVISOR_update_va_mapping().  BUGs if @vaddr has no pte or the
+ * hypercall fails.
+ */
+void make_lowmem_page_readonly(void *vaddr)
+{
+       unsigned long address = (unsigned long)vaddr;
+       pte_t *ptep = lookup_address(address);
+       pte_t ro;
+
+       BUG_ON(ptep == NULL);
+
+       ro = pte_wrprotect(*ptep);
+
+       if (xen_feature(XENFEAT_writable_page_tables)) {
+               *ptep = ro;
+               return;
+       }
+
+       if (HYPERVISOR_update_va_mapping(address, ro, 0))
+               BUG();
+}
+
+/*
+ * Make the kernel mapping of @vaddr writable again.  With writable
+ * page tables the pte is stored directly, otherwise it is updated
+ * through HYPERVISOR_update_va_mapping().  BUGs if @vaddr has no pte or
+ * the hypercall fails.
+ */
+void make_lowmem_page_readwrite(void *vaddr)
+{
+       unsigned long address = (unsigned long)vaddr;
+       pte_t *ptep = lookup_address(address);
+       pte_t rw;
+
+       BUG_ON(ptep == NULL);
+
+       rw = pte_mkwrite(*ptep);
+
+       if (xen_feature(XENFEAT_writable_page_tables)) {
+               *ptep = rw;
+               return;
+       }
+
+       if (HYPERVISOR_update_va_mapping(address, rw, 0))
+               BUG();
+}
+
+
+/*
+ * Store @pte at @ptep by issuing a single-entry mmu_update hypercall.
+ * BUGs if the hypercall fails.  The direct-write variant is compiled
+ * out.
+ */
+fastcall void xen_set_pte(pte_t *ptep, pte_t pte)
+{
+#if 1
+       struct mmu_update req;
+
+       req.ptr = virt_to_machine(ptep).maddr;
+       req.val = pte_val_ma(pte);
+
+       if (HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF) < 0)
+               BUG();
+#else
+       ptep->pte_high = pte.pte_high;
+       smp_wmb();
+       ptep->pte_low = pte.pte_low;
+#endif
+}
+
+/*
+ * Store @val at @ptr by issuing a single-entry mmu_update hypercall.
+ * BUGs if the hypercall fails.
+ */
+fastcall void xen_set_pmd(pmd_t *ptr, pmd_t val)
+{
+       struct mmu_update req;
+
+       req.ptr = virt_to_machine(ptr).maddr;
+       req.val = pmd_val_ma(val);
+
+       if (HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF) < 0)
+               BUG();
+}
+
+#ifdef CONFIG_X86_PAE
+/*
+ * Store @val at @ptr by issuing a single-entry mmu_update hypercall.
+ * BUGs if the hypercall fails.  @ptr is a pud_t pointer, matching the
+ * prototype in mmu.h.
+ */
+fastcall void xen_set_pud(pud_t *ptr, pud_t val)
+{
+       struct mmu_update u;
+
+       u.ptr = virt_to_machine(ptr).maddr;
+       u.val = pud_val_ma(val);
+       if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0)
+               BUG();
+}
+#endif
+
+/*
+ * Map the virtual page at @vaddr to machine frame @mfn with protection
+ * @flags in swapper_pg_dir.  Every intermediate pagetable level must
+ * already be present; a missing level is a BUG.  The TLB entry for
+ * @vaddr is flushed afterwards.
+ */
+void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
+{
+       pgd_t *pgdp = swapper_pg_dir + pgd_index(vaddr);
+       pud_t *pudp;
+       pmd_t *pmdp;
+       pte_t *ptep;
+
+       if (pgd_none(*pgdp)) {
+               BUG();
+               return;
+       }
+
+       pudp = pud_offset(pgdp, vaddr);
+       if (pud_none(*pudp)) {
+               BUG();
+               return;
+       }
+
+       pmdp = pmd_offset(pudp, vaddr);
+       if (pmd_none(*pmdp)) {
+               BUG();
+               return;
+       }
+
+       ptep = pte_offset_kernel(pmdp, vaddr);
+       /* <mfn,flags> stored as-is, to permit clearing entries */
+       xen_set_pte(ptep, mfn_pte(mfn, flags));
+
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+/*
+ * Set the pte for @addr in @mm.  For the current mm or init_mm the
+ * update is first attempted through HYPERVISOR_update_va_mapping(); if
+ * that fails, or @mm is some other address space, fall back to
+ * xen_set_pte() on @ptep.
+ */
+void fastcall xen_set_pte_at(struct mm_struct *mm, u32 addr,
+                            pte_t *ptep, pte_t pteval)
+{
+       if (mm == current->mm || mm == &init_mm) {
+               if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
+                       return;
+       }
+
+       xen_set_pte(ptep, pteval);
+}
+
+/* pte_update hook: intentionally empty. */
+void fastcall xen_pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+       /* nothing to do */
+}
+
+/* pte_update_defer hook: intentionally empty. */
+void fastcall xen_pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+       /* nothing to do */
+}
+
+#ifdef CONFIG_X86_PAE
+/* Store the machine value of @pte at @ptep with one 64-bit write. */
+void fastcall xen_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+       u64 *slot = (u64 *)ptep;
+
+       set_64bit(slot, pte_val_ma(pte));
+}
+
+void fastcall xen_pte_clear(struct mm_struct *mm, u32 addr,pte_t *ptep) /* zero the PAE pte at @ptep; @mm and @addr are unused */
+{
+#if 1
+       ptep->pte_low = 0; /* low word first ... */
+       smp_wmb(); /* ... with a write barrier before the high word is cleared */
+       ptep->pte_high = 0;     
+#else
+       set_64bit((u64 *)ptep, 0);
+#endif
+}
+
+/* Clear the pmd entry at @pmdp by storing an all-zero pmd through xen_set_pmd(). */
+void fastcall xen_pmd_clear(pmd_t *pmdp)
+{
+       pmd_t empty = __pmd(0);
+
+       xen_set_pmd(pmdp, empty);
+}
+
+/*
+ * PAE: convert the machine-address pte @pte to its pseudo-physical
+ * value.  A pte whose low word is zero yields 0; otherwise the 64-bit
+ * value is passed through machine_to_phys() and bit 0 is set.
+ */
+fastcall unsigned long long xen_pte_val(pte_t pte)
+{
+       unsigned long long mval;
+
+       if (!pte.pte_low)
+               return 0;
+
+       mval = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
+
+       return machine_to_phys(XMADDR(mval)).paddr | 1;
+}
+
+/*
+ * PAE: convert the machine-address pmd @pmd to its pseudo-physical
+ * value.  Zero stays zero; any other value goes through
+ * machine_to_phys() and has bit 0 set.
+ */
+fastcall unsigned long long xen_pmd_val(pmd_t pmd)
+{
+       unsigned long long mval = pmd.pmd;
+
+       if (mval == 0)
+               return 0;
+
+       return machine_to_phys(XMADDR(mval)).paddr | 1;
+}
+
+/*
+ * PAE: convert the machine-address pgd @pgd to its pseudo-physical
+ * value.  Zero stays zero; any other value goes through
+ * machine_to_phys() and has bit 0 set.
+ */
+fastcall unsigned long long xen_pgd_val(pgd_t pgd)
+{
+       unsigned long long mval = pgd.pgd;
+
+       if (mval == 0)
+               return 0;
+
+       return machine_to_phys(XMADDR(mval)).paddr | 1;
+}
+
+/*
+ * PAE: build a pte from the pseudo-physical value @pte.  If bit 0 is
+ * set the value is translated with phys_to_machine(); otherwise it is
+ * stored unchanged.
+ */
+fastcall pte_t xen_make_pte(unsigned long long pte)
+{
+       unsigned long long val = pte;
+
+       if (val & 1)
+               val = phys_to_machine(XPADDR(val)).maddr;
+
+       return (pte_t){ val, val >> 32 };
+}
+
+/*
+ * PAE: build a pmd from the pseudo-physical value @pmd.  If bit 0 is
+ * set the value is translated with phys_to_machine(); otherwise it is
+ * stored unchanged.
+ */
+fastcall pmd_t xen_make_pmd(unsigned long long pmd)
+{
+       unsigned long long val = pmd;
+
+       if (val & 1)
+               val = phys_to_machine(XPADDR(val)).maddr;
+
+       return (pmd_t){ val };
+}
+
+/*
+ * PAE: build a pgd from the pseudo-physical value @pgd.  If
+ * _PAGE_PRESENT is set the value is translated with phys_to_machine();
+ * otherwise it is stored unchanged.
+ */
+fastcall pgd_t xen_make_pgd(unsigned long long pgd)
+{
+       unsigned long long val = pgd;
+
+       if (val & _PAGE_PRESENT)
+               val = phys_to_machine(XPADDR(val)).maddr;
+
+       return (pgd_t){ val };
+}
+
+fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep) /* PAE: return the old pte at @ptep and leave it zeroed */
+{
+       pte_t res;
+
+       /* xchg acts as a barrier before the setting of the high bits.
+          The low word is fetched and zeroed atomically; the high word
+          is then read and cleared with plain accesses. */
+       res.pte_low = xchg(&ptep->pte_low, 0);
+       res.pte_high = ptep->pte_high;
+       ptep->pte_high = 0;
+
+       return res;
+}
+#else  /* !PAE */
+/*
+ * Non-PAE: convert the machine-address pte @pte to its pseudo-physical
+ * value.  Values without _PAGE_PRESENT are returned unchanged.
+ */
+fastcall unsigned long xen_pte_val(pte_t pte)
+{
+       unsigned long mval = pte.pte_low;
+
+       if (!(mval & _PAGE_PRESENT))
+               return mval;
+
+       return machine_to_phys(XMADDR(mval)).paddr;
+}
+
+/* Non-PAE: always BUGs; the 0 return only satisfies the signature. */
+fastcall unsigned long xen_pmd_val(pmd_t pmd)
+{
+       BUG();
+
+       return 0;
+}
+
+/*
+ * Non-PAE: convert the machine-address pgd @pgd to its pseudo-physical
+ * value.  Zero stays zero; any other value goes through
+ * machine_to_phys() and has bit 0 set.
+ */
+fastcall unsigned long xen_pgd_val(pgd_t pgd)
+{
+       unsigned long mval = pgd.pgd;
+
+       if (mval == 0)
+               return 0;
+
+       return machine_to_phys(XMADDR(mval)).paddr | 1;
+}
+
+/*
+ * Non-PAE: build a pte from the pseudo-physical value @pte.  If
+ * _PAGE_PRESENT is set the value is translated with phys_to_machine();
+ * otherwise it is stored unchanged.
+ */
+fastcall pte_t xen_make_pte(unsigned long pte)
+{
+       unsigned long val = pte;
+
+       if (val & _PAGE_PRESENT)
+               val = phys_to_machine(XPADDR(val)).maddr;
+
+       return (pte_t){ val };
+}
+
+/* Non-PAE: always BUGs; the zero pmd return only satisfies the signature. */
+fastcall pmd_t xen_make_pmd(unsigned long pmd)
+{
+       BUG();
+
+       return __pmd(0);
+}
+
+/*
+ * Non-PAE: build a pgd from the pseudo-physical value @pgd.  If
+ * _PAGE_PRESENT is set the value is translated with phys_to_machine();
+ * otherwise it is stored unchanged.
+ */
+fastcall pgd_t xen_make_pgd(unsigned long pgd)
+{
+       unsigned long val = pgd;
+
+       if (val & _PAGE_PRESENT)
+               val = phys_to_machine(XPADDR(val)).maddr;
+
+       return (pgd_t){ val };
+}
+
+/* Non-PAE: atomically exchange the pte at @ptep with 0 and return the old machine value. */
+fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep)
+{
+       pte_t old = __pte_ma(xchg(&(ptep)->pte_low, 0));
+
+       return old;
+}
+#endif /* CONFIG_X86_PAE */
+
+
+
+/*
+ * Remap the lowmem page holding pagetable @pt with protection @flags
+ * through HYPERVISOR_update_va_mapping().  BUGs if the hypercall fails.
+ */
+static void pgd_walk_set_prot(void *pt, pgprot_t flags)
+{
+       unsigned long vaddr = (unsigned long)pt;
+       unsigned long pfn = PFN_DOWN(__pa(pt));
+
+       if (HYPERVISOR_update_va_mapping(vaddr, pfn_pte(pfn, flags), 0) < 0)
+               BUG();
+}
+
+/*
+ * Apply protection @flags to the kernel mapping of every pagetable
+ * page reachable from the user part of @pgd_base (pud and pmd pages
+ * when those levels are not folded, and non-highmem pte pages), then
+ * to the pgd page itself with a TLB flush.  Does nothing when the
+ * physmap is auto-translated.
+ */
+static void pgd_walk(pgd_t *pgd_base, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       int pgd_idx, pud_idx, pmd_idx;
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return;
+
+       pgd = pgd_base;
+       for (pgd_idx = 0; pgd_idx < USER_PTRS_PER_PGD; pgd_idx++, pgd++) {
+               if (pgd_none(*pgd))
+                       continue;
+
+               pud = pud_offset(pgd, 0);
+               if (PTRS_PER_PUD > 1) /* not folded */
+                       pgd_walk_set_prot(pud, flags);
+
+               for (pud_idx = 0; pud_idx < PTRS_PER_PUD; pud_idx++, pud++) {
+                       if (pud_none(*pud))
+                               continue;
+
+                       pmd = pmd_offset(pud, 0);
+                       if (PTRS_PER_PMD > 1) /* not folded */
+                               pgd_walk_set_prot(pmd, flags);
+
+                       for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD;
+                            pmd_idx++, pmd++) {
+                               if (pmd_none(*pmd))
+                                       continue;
+
+                               /* This can get called before mem_map
+                                  is set up, so we assume nothing is
+                                  highmem at that point. */
+                               if (mem_map != NULL &&
+                                   PageHighMem(pmd_page(*pmd)))
+                                       continue;
+
+                               pgd_walk_set_prot(pte_offset_kernel(pmd, 0),
+                                                 flags);
+                       }
+               }
+       }
+
+       if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base,
+                                        pfn_pte(PFN_DOWN(__pa(pgd_base)),
+                                                flags),
+                                        UVMF_TLB_FLUSH) < 0)
+               BUG();
+}
+
+
+/*
+ * Pin the pagetable rooted at @pgd.  All of its pagetable pages are
+ * first remapped read-only with pgd_walk(), because the pagetable must
+ * map itself RO before it can be pinned; how other pagetables map it
+ * does not matter.  Then Xen is asked to pin it as an L3 table under
+ * PAE and as an L2 table otherwise.  BUGs if the hypercall fails.
+ */
+void xen_pgd_pin(pgd_t *pgd)
+{
+       struct mmuext_op pin;
+
+       pgd_walk(pgd, PAGE_KERNEL_RO);
+
+#ifdef CONFIG_X86_PAE
+       pin.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+       pin.cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+       pin.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+
+       if (HYPERVISOR_mmuext_op(&pin, 1, NULL, DOMID_SELF) < 0)
+               BUG();
+}
+
+/*
+ * Unpin the pagetable rooted at @pgd (BUG on hypercall failure) and
+ * then remap its pagetable pages as normal read-write kernel memory.
+ */
+void xen_pgd_unpin(pgd_t *pgd)
+{
+       struct mmuext_op unpin;
+
+       unpin.cmd = MMUEXT_UNPIN_TABLE;
+       unpin.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd)));
+       if (HYPERVISOR_mmuext_op(&unpin, 1, NULL, DOMID_SELF) < 0)
+               BUG();
+
+       pgd_walk(pgd, PAGE_KERNEL);
+}
+
+
+/* activate_mm hook: pin the pagetable of @next; @prev is not used. */
+fastcall void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
+{
+       pgd_t *pgd = next->pgd;
+
+       xen_pgd_pin(pgd);
+}
+
+/* dup_mmap hook: pin the pagetable of the new @mm; @oldmm is not used. */
+fastcall void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+{
+       pgd_t *pgd = mm->pgd;
+
+       xen_pgd_pin(pgd);
+}
+
+fastcall void xen_exit_mmap(struct mm_struct *mm) /* exit_mmap hook: stop using @mm's pagetable, then unpin it */
+{
+       struct task_struct *tsk = current;
+
+       task_lock(tsk);
+
+       /*
+        * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+        * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+        *
+        * If the current task is still running on @mm, switch it to init_mm:
+        * take a reference on init_mm, switch, and drop the reference that
+        * active_mm held on @mm.  That drop must not be the last reference.
+        */
+       if (tsk->active_mm == mm) {
+               tsk->active_mm = &init_mm;
+               atomic_inc(&init_mm.mm_count);
+
+               switch_mm(mm, &init_mm, tsk);
+
+               atomic_dec(&mm->mm_count);
+               BUG_ON(atomic_read(&mm->mm_count) == 0);
+       }
+
+       task_unlock(tsk);
+
+       xen_pgd_unpin(mm->pgd); /* done outside the task lock */
+}
===================================================================
--- /dev/null
+++ b/arch/i386/xen/mmu.h
@@ -0,0 +1,54 @@
+#ifndef _XEN_MMU_H
+#define _XEN_MMU_H
+
+#include <linux/linkage.h>
+#include <asm/page.h>
+
+void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
+
+/* Pagetable entry updates. */
+void fastcall xen_set_pte(pte_t *ptep, pte_t pteval);
+void fastcall xen_set_pte_at(struct mm_struct *mm, u32 addr,
+                            pte_t *ptep, pte_t pteval);
+void fastcall xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
+void fastcall xen_pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep);
+void fastcall xen_pte_update_defer(struct mm_struct *mm, u32 addr,
+                                  pte_t *ptep);
+
+/* mm lifetime hooks: activate/dup pin the mm's pagetable, exit unpins it. */
+fastcall void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
+fastcall void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
+fastcall void xen_exit_mmap(struct mm_struct *mm);
+
+fastcall pte_t xen_ptep_get_and_clear(pte_t *ptep);
+
+/* Pin/unpin a whole pagetable with the hypervisor. */
+void xen_pgd_pin(pgd_t *pgd);
+void xen_pgd_unpin(pgd_t *pgd);
+
+/* Entry <-> raw value conversions; the raw type is wider under PAE. */
+#ifdef CONFIG_X86_PAE
+fastcall unsigned long long xen_pte_val(pte_t);
+fastcall unsigned long long xen_pmd_val(pmd_t);
+fastcall unsigned long long xen_pgd_val(pgd_t);
+
+fastcall pte_t xen_make_pte(unsigned long long);
+fastcall pmd_t xen_make_pmd(unsigned long long);
+fastcall pgd_t xen_make_pgd(unsigned long long);
+
+/* Operations declared only for PAE pagetables. */
+fastcall void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+fastcall void xen_set_pud(pud_t *ptr, pud_t val);
+fastcall void xen_pte_clear(struct mm_struct *mm, u32 addr, pte_t *ptep);
+fastcall void xen_pmd_clear(pmd_t *pmdp);
+#else
+fastcall unsigned long xen_pte_val(pte_t);
+fastcall unsigned long xen_pmd_val(pmd_t);
+fastcall unsigned long xen_pgd_val(pgd_t);
+
+fastcall pte_t xen_make_pte(unsigned long);
+fastcall pmd_t xen_make_pmd(unsigned long);
+fastcall pgd_t xen_make_pgd(unsigned long);
+#endif
+
+#endif /* _XEN_MMU_H */
===================================================================
--- /dev/null
+++ b/arch/i386/xen/multicalls.c
@@ -0,0 +1,62 @@
+#include <linux/percpu.h>
+
+#include <asm/hypercall.h>
+
+#include "multicalls.h"
+
+/* Maximum number of multicall entries queued before a flush is forced. */
+#define MC_BATCH       8
+/* Size of the shared argument area in u64 slots: 32 bytes per entry. */
+#define MC_ARGS                (MC_BATCH * 32 / sizeof(u64))
+
+/*
+ * Per-CPU batch of pending multicalls.  entries[0..mcidx) are queued
+ * calls; args[0..argidx) is scratch space handed out alongside them.
+ */
+struct mc_buffer {
+       struct multicall_entry entries[MC_BATCH];
+       u64 args[MC_ARGS];
+       unsigned mcidx, argidx; /* next free slot in entries[] / args[] */
+};
+
+static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
+
+/*
+ * Issue every multicall entry queued on this CPU in one hypercall and
+ * reset the buffer.  Returns the number of entries whose individual
+ * result was negative; BUGs if the multicall hypercall itself fails.
+ */
+int xen_mc_flush(void)
+{
+       struct mc_buffer *buf = &get_cpu_var(mc_buffer);
+       int failed = 0;
+       int n;
+
+       if (!buf->mcidx) {
+               /* Nothing queued, so no argument space may be in use. */
+               BUG_ON(buf->argidx != 0);
+               goto out;
+       }
+
+       if (HYPERVISOR_multicall(buf->entries, buf->mcidx) != 0)
+               BUG();
+
+       for (n = 0; n < buf->mcidx; n++) {
+               if (buf->entries[n].result < 0)
+                       failed++;
+       }
+
+       buf->mcidx = 0;
+       buf->argidx = 0;
+
+out:
+       put_cpu_var(mc_buffer);
+
+       return failed;
+}
+
+/*
+ * Reserve one multicall entry plus @args bytes of argument space in
+ * this CPU's batch and return pointers to both.  If the batch is full,
+ * or there is not enough argument space left, the pending batch is
+ * flushed first (and any failed entry in it is a BUG).
+ *
+ * NOTE(review): the returned pointers refer to the per-CPU buffer but
+ * are handed back after put_cpu_var(); callers presumably must prevent
+ * migration themselves until the batch is flushed -- confirm.
+ */
+struct multicall_space xen_mc_entry(size_t args)
+{
+       struct mc_buffer *b = &get_cpu_var(mc_buffer);
+       struct multicall_space ret;
+       /* Argument space is handed out in whole u64 slots; round up. */
+       unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64);
+
+       /* A single request can never exceed the whole argument area. */
+       BUG_ON(argspace > MC_ARGS);
+
+       if (b->mcidx == MC_BATCH ||
+           (b->argidx + argspace) > MC_ARGS)
+               if (xen_mc_flush())
+                       BUG();
+
+       ret.mc = &b->entries[b->mcidx];
+       b->mcidx++;
+       ret.args = &b->args[b->argidx];
+       b->argidx += argspace;
+
+       put_cpu_var(mc_buffer);
+
+       return ret;
+}
===================================================================
--- /dev/null
+++ b/arch/i386/xen/multicalls.h
@@ -0,0 +1,13 @@
+#ifndef _XEN_MULTICALLS_H
+#define _XEN_MULTICALLS_H
+
+/* Space reserved in the multicall batch by xen_mc_entry(). */
+struct multicall_space
+{
+       struct multicall_entry *mc;     /* the entry to fill in */
+       void *args;                     /* scratch argument space for it */
+};
+
+struct multicall_space xen_mc_entry(size_t args);
+int xen_mc_flush(void);
+
+#endif /* _XEN_MULTICALLS_H */
===================================================================
--- /dev/null
+++ b/arch/i386/xen/setup.c
@@ -0,0 +1,95 @@
+/*
+ *     Machine specific setup for xen
+ */
+
+#include <linux/module.h>
+#include <linux/mm.h>
+
+#include <asm/e820.h>
+#include <asm/setup.h>
+#include <asm/hypervisor.h>
+#include <asm/hypercall.h>
+#include <asm/pda.h>
+
+#include <xen/interface/physdev.h>
+#include <xen/features.h>
+
+/* These are code, but not functions.  Defined in entry.S */
+extern const char xen_hypervisor_callback[];
+extern const char xen_failsafe_callback[];
+
+static __initdata struct shared_info init_shared;
+
+/*
+ * Point at some empty memory to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+struct shared_info *HYPERVISOR_shared_info = &init_shared;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
+unsigned long *phys_to_machine_mapping;
+unsigned long *pfn_to_mfn_frame_list_list, *pfn_to_mfn_frame_list[16];
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/**
+ * xen_memory_setup - Xen hook for machine specific memory setup.
+ *
+ * Discards any existing e820 map and installs a single RAM region
+ * starting at 0 and covering xen_start_info->nr_pages pages.
+ * Returns the string "Xen".
+ **/
+
+char * __init xen_memory_setup(void)
+{
+       unsigned long nr_pages = xen_start_info->nr_pages;
+
+       e820.nr_map = 0;
+       add_memory_region(0, PFN_PHYS(nr_pages), E820_RAM);
+
+       return "Xen";
+}
+
+/*
+ * Idle routine installed as pm_idle.  Halts the CPU unless a reschedule
+ * is already pending; TS_POLLING is cleared for the duration of the halt.
+ */
+void xen_idle(void)
+{
+       local_irq_disable();
+
+       if (need_resched()) {
+               local_irq_enable();
+               return;
+       }
+
+       current_thread_info()->status &= ~TS_POLLING;
+       smp_mb__after_clear_bit();
+       safe_halt();
+       current_thread_info()->status |= TS_POLLING;
+}
+
+/*
+ * Xen-specific architecture setup: enable the VM assists this kernel
+ * relies on, register the event and failsafe callbacks, request IOPL 1,
+ * disable ACPI in unprivileged domains, copy the command line supplied
+ * by Xen and install the Xen idle routine.
+ */
+void __init xen_arch_setup(void)
+{
+       struct physdevop_set_iopl set_iopl;
+       int rc;
+
+       HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+       HYPERVISOR_vm_assist(VMASST_CMD_enable,
+                            VMASST_TYPE_writable_pagetables);
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap))
+               HYPERVISOR_vm_assist(VMASST_CMD_enable,
+                                    VMASST_TYPE_pae_extended_cr3);
+
+       HYPERVISOR_set_callbacks(__KERNEL_CS,
+                                (unsigned long)xen_hypervisor_callback,
+                                __KERNEL_CS,
+                                (unsigned long)xen_failsafe_callback);
+
+       /* Failure to raise IOPL is reported but is not fatal. */
+       set_iopl.iopl = 1;
+       rc = HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl);
+       if (rc != 0)
+               printk(KERN_INFO "physdev_op failed %d\n", rc);
+
+#ifdef CONFIG_ACPI
+       if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
+               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+               disable_acpi();
+       }
+#endif
+
+       /*
+        * Copy no more than the smaller of the two buffers.
+        * NOTE(review): nothing here guarantees NUL termination if the
+        * Xen command line fills its buffer -- confirm.
+        */
+       memcpy(saved_command_line, xen_start_info->cmd_line,
+              MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
+              COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
+
+       pm_idle = xen_idle;
+
+       vdso_enabled = 1;       /* enable by default */
+}
===================================================================
--- /dev/null
+++ b/arch/i386/xen/time.c
@@ -0,0 +1,452 @@
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/clocksource.h>
+
+#include <asm/hypercall.h>
+#include <asm/arch_hooks.h>
+
+#include <xen/events.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+
+#include "xen-ops.h"
+
+#define XEN_SHIFT 22
+
+/*
+ * Permitted clock jitter, in nsecs, beyond which a warning will be printed.
+ * Can be overridden with "permitted_clock_jitter=" on the command line.
+ */
+static unsigned long permitted_clock_jitter = 10000000UL; /* 10ms */
+
+static int __init __permitted_clock_jitter(char *str)
+{
+       unsigned long nsecs = simple_strtoul(str, NULL, 0);
+
+       permitted_clock_jitter = nsecs;
+       return 1;
+}
+__setup("permitted_clock_jitter=", __permitted_clock_jitter);
+
+
+/*
+ * Local copy of the time values Xen publishes per vcpu.  These are
+ * periodically updated in shared_info, and then copied here.
+ */
+struct shadow_time_info {
+       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+       u32 tsc_to_nsec_mul;   /* 32-bit fraction used by scale_delta() */
+       int tsc_shift;         /* pre-shift applied to TSC deltas; may be < 0 */
+       u32 version;           /* source version the copy was taken at */
+};
+
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+
+/* Keep track of last time we did processing/updating of jiffies and xtime. */
+static u64 processed_system_time;   /* System time (ns) at last processing. */
+static DEFINE_PER_CPU(u64, processed_system_time);
+
+/* How much CPU time was spent blocked and how much was 'stolen'? */
+static DEFINE_PER_CPU(u64, processed_stolen_time);
+static DEFINE_PER_CPU(u64, processed_blocked_time);
+
+/* Current runstate of each CPU (updated automatically by the hypervisor). */
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+
+/* Must be signed, as it's compared with s64 quantities which can be -ve. */
+#define NS_PER_TICK (1000000000LL/HZ)
+
+/*
+ * Reads a consistent set of time-base values from Xen, into this CPU's
+ * shadow data area.
+ *
+ * The source is re-read until its version is even and did not change
+ * across the copy, so a snapshot is never taken from a half-updated
+ * record.
+ */
+static void get_time_values_from_xen(void)
+{
+       struct vcpu_time_info   *src;
+       struct shadow_time_info *dst;
+
+       /*
+        * Disable preemption (via get_cpu_var) before looking up the vcpu
+        * pointer, so that src and dst are guaranteed to belong to the
+        * same CPU.
+        */
+       dst = &get_cpu_var(shadow_time);
+       src = &read_pda(xen.vcpu)->time;
+
+       do {
+               dst->version = src->version;
+               rmb();          /* read the version before the payload */
+               dst->tsc_timestamp     = src->tsc_timestamp;
+               dst->system_timestamp  = src->system_time;
+               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+               dst->tsc_shift         = src->tsc_shift;
+               rmb();          /* read the payload before re-checking */
+       } while ((src->version & 1) | (dst->version ^ src->version));
+
+       put_cpu_var(shadow_time);
+}
+
+/*
+ * Returns non-zero if this CPU's shadow copy was taken at the version
+ * Xen currently publishes for the vcpu.
+ */
+static inline int time_values_up_to_date(void)
+{
+       struct vcpu_time_info *xen_time = &read_pda(xen.vcpu)->time;
+       unsigned shadow_version;
+
+       shadow_version = get_cpu_var(shadow_time).version;
+       put_cpu_var(shadow_time);
+
+       rmb();
+       return shadow_version == xen_time->version;
+}
+
+/*
+ * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
+ * yielding a 64-bit result.
+ *
+ * The delta is first shifted by @shift (right if negative, left
+ * otherwise), then multiplied by @mul_frac; the result is the 96-bit
+ * product shifted right by 32 bits, truncated to 64 bits.
+ */
+static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
+{
+       u64 product;
+#ifdef __i386__
+       u32 tmp1, tmp2;
+#endif
+
+       if (shift < 0)
+               delta >>= -shift;
+       else
+               delta <<= shift;
+
+#ifdef __i386__
+       /*
+        * Operands: %4 starts as the high word of delta, %5 as mul_frac,
+        * eax as the low word of delta.
+        *   mul %5        : edx:eax = low(delta) * mul_frac
+        *   mov %4,%eax   : eax = high(delta)
+        *   mov %edx,%4   : keep the upper half of the low product
+        *   mul %5        : edx:eax = high(delta) * mul_frac
+        *   xor %5,%5     : zero, used only to propagate the carry
+        *   add/adc       : edx:eax += upper half of the low product
+        */
+       __asm__ (
+               "mul  %5       ; "
+               "mov  %4,%%eax ; "
+               "mov  %%edx,%4 ; "
+               "mul  %5       ; "
+               "xor  %5,%5    ; "
+               "add  %4,%%eax ; "
+               "adc  %5,%%edx ; "
+               : "=A" (product), "=r" (tmp1), "=r" (tmp2)
+               : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
+#elif __x86_64__
+       /* rdx:rax = delta * mul_frac, then take bits 32..95 of it. */
+       __asm__ (
+               "mul %%rdx ; shrd $32,%%rdx,%%rax"
+               : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+#else
+#error implement me!
+#endif
+
+       return product;
+}
+
+/*
+ * Nanoseconds elapsed since the shadow's tsc_timestamp, computed from
+ * the current TSC value and the shadow's scaling parameters.
+ */
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+       u64 tsc_now;
+
+       rdtscll(tsc_now);
+
+       return scale_delta(tsc_now - shadow->tsc_timestamp,
+                          shadow->tsc_to_nsec_mul, shadow->tsc_shift);
+}
+
+
+/*
+ * Body of the timer interrupt; xen_timer_interrupt() calls this with
+ * xtime_lock held for writing.
+ *
+ * Takes a consistent snapshot of Xen system time and of this vcpu's
+ * runstate counters, advances jiffies by the number of whole ticks that
+ * have elapsed system-wide, and accounts this CPU's stolen and blocked
+ * time before doing the normal per-process tick work.
+ */
+static void xen_timer_interrupt_hook(void)
+{
+       s64 delta, delta_cpu, stolen, blocked;
+       u64 sched_time;
+       int i, cpu = smp_processor_id();
+       unsigned long ticks;
+       struct shadow_time_info *shadow = &__get_cpu_var(shadow_time);
+       struct vcpu_runstate_info *runstate = &__get_cpu_var(runstate);
+
+       do {
+               get_time_values_from_xen();
+
+               /* Obtain a consistent snapshot of elapsed wallclock cycles. */
+               delta = delta_cpu =
+                       shadow->system_timestamp + get_nsec_offset(shadow);
+               if (0)
+                       printk("tsc_timestamp=%llu system_timestamp=%llu "
+                              "tsc_to_nsec=%u tsc_shift=%d, version=%u, "
+                              "delta=%lld processed_system_time=%lld\n",
+                              shadow->tsc_timestamp, shadow->system_timestamp,
+                              shadow->tsc_to_nsec_mul, shadow->tsc_shift,
+                              shadow->version, delta, processed_system_time);
+
+               /* delta: system-wide backlog; delta_cpu: this CPU's backlog. */
+               delta     -= processed_system_time;
+               delta_cpu -= __get_cpu_var(processed_system_time);
+
+               /*
+                * Obtain a consistent snapshot of stolen/blocked cycles. We
+                * can use state_entry_time to detect if we get preempted here.
+                */
+               do {
+                       sched_time = runstate->state_entry_time;
+                       barrier();
+                       stolen = runstate->time[RUNSTATE_runnable] +
+                               runstate->time[RUNSTATE_offline] -
+                               __get_cpu_var(processed_stolen_time);
+                       blocked = runstate->time[RUNSTATE_blocked] -
+                               __get_cpu_var(processed_blocked_time);
+                       barrier();
+               } while (sched_time != runstate->state_entry_time);
+       } while (!time_values_up_to_date());
+
+       if ((unlikely(delta < -(s64)permitted_clock_jitter) ||
+            unlikely(delta_cpu < -(s64)permitted_clock_jitter))
+           && printk_ratelimit()) {
+               printk("Timer ISR/%d: Time went backwards: "
+                      "delta=%lld delta_cpu=%lld shadow=%lld "
+                      "off=%lld processed=%lld cpu_processed=%lld\n",
+                      cpu, delta, delta_cpu, shadow->system_timestamp,
+                      (s64)get_nsec_offset(shadow),
+                      processed_system_time,
+                      __get_cpu_var(processed_system_time));
+               /* Online CPU ids need not be contiguous; walk the mask. */
+               for_each_online_cpu(i)
+                       printk(" %d: %lld\n", i,
+                              per_cpu(processed_system_time, i));
+       }
+
+       /* System-wide jiffy work. */
+       ticks = 0;
+       while(delta > NS_PER_TICK) {
+               delta -= NS_PER_TICK;
+               processed_system_time += NS_PER_TICK;
+               ticks++;
+       }
+       do_timer(ticks);
+
+       /*
+        * Account stolen ticks.
+        * HACK: Passing NULL to account_steal_time()
+        * ensures that the ticks are accounted as stolen.
+        */
+       if ((stolen > 0) && (delta_cpu > 0)) {
+               delta_cpu -= stolen;
+               if (unlikely(delta_cpu < 0))
+                       stolen += delta_cpu; /* clamp local-time progress */
+               /* Convert to whole ticks; the remainder is left for later. */
+               do_div(stolen, NS_PER_TICK);
+               __get_cpu_var(processed_stolen_time) += stolen * NS_PER_TICK;
+               __get_cpu_var(processed_system_time) += stolen * NS_PER_TICK;
+               account_steal_time(NULL, (cputime_t)stolen);
+       }
+
+       /*
+        * Account blocked ticks.
+        * HACK: Passing idle_task to account_steal_time()
+        * ensures that the ticks are accounted as idle/wait.
+        */
+       if ((blocked > 0) && (delta_cpu > 0)) {
+               delta_cpu -= blocked;
+               if (unlikely(delta_cpu < 0))
+                       blocked += delta_cpu; /* clamp local-time progress */
+               do_div(blocked, NS_PER_TICK);
+               __get_cpu_var(processed_blocked_time) += blocked * NS_PER_TICK;
+               __get_cpu_var(processed_system_time)  += blocked * NS_PER_TICK;
+               account_steal_time(idle_task(cpu), (cputime_t)blocked);
+       }
+
+       update_process_times(user_mode_vm(get_irq_regs()));
+}
+
+/*
+ * Clocksource read hook: refresh this CPU's shadow time values and
+ * return the shadow system timestamp plus the TSC-derived offset
+ * since it was taken, in nanoseconds.
+ */
+static cycle_t xen_clocksource_read(void)
+{
+       struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
+       cycle_t now_ns;
+
+       get_time_values_from_xen();
+       now_ns = shadow->system_timestamp + get_nsec_offset(shadow);
+       put_cpu_var(shadow_time);
+
+       return now_ns;
+}
+
+/*
+ * Compute the current wallclock time: the wallclock at system boot as
+ * published in shared_info, plus the time elapsed since boot.  The
+ * normalized result is stored in @ts.
+ */
+static void xen_read_wallclock(struct timespec *ts)
+{
+       const struct shared_info *shared = HYPERVISOR_shared_info;
+       struct timespec wc;
+       u32 seen;
+       u64 ns;
+
+       /* get wallclock at system boot; retry until the read is stable */
+       do {
+               seen = shared->wc_version;
+               rmb();
+               wc.tv_sec  = shared->wc_sec;
+               wc.tv_nsec = shared->wc_nsec;
+               rmb();
+       } while ((shared->wc_version & 1) | (seen ^ shared->wc_version));
+
+       ns = xen_clocksource_read();    /* time since system boot */
+       ns += wc.tv_sec * (u64)NSEC_PER_SEC + wc.tv_nsec;
+
+       /* do_div() leaves the quotient in ns and returns the remainder. */
+       wc.tv_nsec = do_div(ns, NSEC_PER_SEC);
+       wc.tv_sec = ns;
+
+       set_normalized_timespec(ts, wc.tv_sec, wc.tv_nsec);
+}
+
+/* Return the seconds part of the current Xen wallclock time. */
+unsigned long xen_get_wallclock(void)
+{
+       struct timespec now;
+
+       xen_read_wallclock(&now);
+       return now.tv_sec;
+}
+
+/* Setting the wallclock is not implemented: always returns -1. */
+int xen_set_wallclock(unsigned long now)
+{
+       /* do nothing for domU */
+       return -1;
+}
+
+/*
+ * Derive cpu_khz from the TSC scaling parameters Xen publishes for
+ * vcpu 0: (10^6 << 32) / tsc_to_system_mul, then shifted in the
+ * opposite direction to tsc_shift.
+ */
+static void init_cpu_khz(void)
+{
+       u64 __cpu_khz = 1000000ULL << 32;
+       struct vcpu_time_info *info;
+       info = &HYPERVISOR_shared_info->vcpu_info[0].time;
+       /* do_div() divides in place; __cpu_khz now holds the quotient. */
+       do_div(__cpu_khz, info->tsc_to_system_mul);
+       if (info->tsc_shift < 0)
+               cpu_khz = __cpu_khz << -info->tsc_shift;
+       else
+               cpu_khz = __cpu_khz >> info->tsc_shift;
+}
+
+/*
+ * Clocksource backed by Xen system time.  The read hook already returns
+ * nanoseconds, so mult and shift are set to cancel each other out.
+ */
+static struct clocksource xen_clocksource = {
+       .name = "xen",
+       .rating = 400,
+       .read = xen_clocksource_read,
+       .mask = ~0,
+       .mult = 1<<XEN_SHIFT,           /* time directly in nanoseconds */
+       .shift = XEN_SHIFT,
+       .is_continuous = 1
+};
+
+/*
+ * Register @cpu's runstate area with Xen and initialise the processed
+ * blocked/stolen counters from whatever the area holds afterwards.
+ *
+ * NOTE(review): the return value of the registration hypercall is not
+ * checked.
+ */
+static void init_missing_ticks_accounting(int cpu)
+{
+       struct vcpu_register_runstate_memory_area area;
+       struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu);
+
+       memset(runstate, 0, sizeof(*runstate));
+
+       area.addr.v = runstate;
+       HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area);
+
+       per_cpu(processed_blocked_time, cpu) =
+               runstate->time[RUNSTATE_blocked];
+       /* "Stolen" time is time spent runnable plus time spent offline. */
+       per_cpu(processed_stolen_time, cpu) =
+               runstate->time[RUNSTATE_runnable] +
+               runstate->time[RUNSTATE_offline];
+}
+
+/*
+ * Timer IRQ handler: runs the timer hook under the xtime_lock write
+ * seqlock and always reports the interrupt as handled.
+ */
+static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
+{
+       /*
+        * Here we are in the timer irq handler. We just have irqs locally
+        * disabled but we don't know if the timer_bh is running on the other
+        * CPU. We need to avoid an SMP race with it. NOTE: we don't need
+        * the irq version of write_lock because, as just said, we have irqs
+        * locally disabled. -arca
+        */
+       write_seqlock(&xtime_lock);
+
+       xen_timer_interrupt_hook();
+
+       write_sequnlock(&xtime_lock);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Bind VIRQ_TIMER on CPU 0 to xen_timer_interrupt().  The result of the
+ * bind is not checked.
+ */
+static void setup_cpu0_timer_irq(void)
+{
+       printk(KERN_DEBUG "installing Xen timer for CPU 0\n");
+
+       bind_virq_to_irqhandler(VIRQ_TIMER, 0, xen_timer_interrupt,
+                               SA_INTERRUPT, "timer0", NULL);
+}
+
+/* Installed as late_time_init by xen_time_init(); sets up CPU 0's timer. */
+static __init void xen_late_time_init(void)
+{
+       setup_cpu0_timer_irq();
+}
+
+extern void (*late_time_init)(void);
+
+/*
+ * Boot-time initialisation of Xen timekeeping: seeds the processed
+ * system time from Xen, computes cpu_khz, sets up runstate accounting
+ * for CPU 0, registers the Xen clocksource and initialises xtime and
+ * wall_to_monotonic from the Xen wallclock.  The timer interrupt
+ * itself is bound later, through late_time_init.
+ */
+__init void xen_time_init(void)
+{
+       late_time_init = xen_late_time_init;
+
+       get_time_values_from_xen();
+
+       /* Start both the global and CPU 0 tick accounting from "now". */
+       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+       per_cpu(processed_system_time, 0) = processed_system_time;
+
+       init_cpu_khz();
+       printk(KERN_INFO "Xen reported: %u.%03u MHz processor.\n",
+              cpu_khz / 1000, cpu_khz % 1000);
+
+       init_missing_ticks_accounting(0);
+
+       clocksource_register(&xen_clocksource);
+
+       /* Set initial system time with full resolution */
+       xen_read_wallclock(&xtime);
+       set_normalized_timespec(&wall_to_monotonic,
+                               -xtime.tv_sec, -xtime.tv_nsec);
+
+       tsc_disable = 0;
+}
+
+/*
+ * Convert jiffies to system time.
+ *
+ * Translates the absolute jiffies value @j into a Xen system time in
+ * nanoseconds, relative to processed_system_time.  Returns 0 when @j is
+ * so far in the future that it is treated as "no timer pending".  The
+ * computation is repeated if xtime_lock was written meanwhile.
+ */
+static u64 jiffies_to_st(unsigned long j)
+{
+       unsigned long seq;
+       long delta;
+       u64 st;
+
+       do {
+               seq = read_seqbegin(&xtime_lock);
+               delta = j - jiffies;
+               if (delta < 1) {
+                       /* Triggers in some wrap-around cases, but that's okay:
+                        * we just end up with a shorter timeout. */
+                       st = processed_system_time + NS_PER_TICK;
+               } else if (((unsigned long)delta >> (BITS_PER_LONG-3)) != 0) {
+                       /* Very long timeout means there is no pending timer.
+                        * We indicate this to Xen by passing zero timeout. */
+                       st = 0;
+               } else {
+                       st = processed_system_time + delta * (u64)NS_PER_TICK;
+               }
+       } while (read_seqretry(&xtime_lock, seq));
+
+       return st;
+}
+
+/*
+ * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
+ * These functions are based on implementations from arch/s390/kernel/time.c
+ *
+ * stop_hz_timer() marks this CPU in nohz_cpu_mask and programs Xen's
+ * one-shot timer for the next pending kernel timer.  If RCU, a softirq
+ * or an already-expired timer needs the CPU, it stays in tick mode and
+ * the timer is programmed for the next jiffy instead.
+ */
+void stop_hz_timer(void)
+{
+       unsigned int cpu = smp_processor_id();
+       unsigned long j;
+
+       cpu_set(cpu, nohz_cpu_mask);
+
+       /*
+        * See matching smp_mb in rcu_start_batch in rcupdate.c.  These mbs
+        * ensure that if __rcu_pending (nested in rcu_needs_cpu) fetches a
+        * value of rcp->cur that matches rdp->quiescbatch and allows us to
+        * stop the hz timer then the cpumasks created for subsequent values
+        * of cur in rcu_start_batch are guaranteed to pick up the updated
+        * nohz_cpu_mask and so will not depend on this cpu.
+        */
+
+       smp_mb();
+
+       /*
+        * Leave ourselves in tick mode if rcu or softirq or timer pending.
+        * j is assigned either by the last operand of the condition (when
+        * it is reached) or inside the body, so it is always initialised
+        * by the time it is used below.
+        */
+       if (rcu_needs_cpu(cpu) || local_softirq_pending() ||
+           (j = next_timer_interrupt(), time_before_eq(j, jiffies))) {
+               cpu_clear(cpu, nohz_cpu_mask);
+               j = jiffies + 1;
+       }
+
+       if (HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0)
+               BUG();
+}
+
+/* Leave tickless mode: remove this CPU from nohz_cpu_mask. */
+void start_hz_timer(void)
+{
+       cpu_clear(smp_processor_id(), nohz_cpu_mask);
+}
+
===================================================================
--- /dev/null
+++ b/arch/i386/xen/xen-head.S
@@ -0,0 +1,29 @@
+/* Xen-specific pieces of head.S, intended to be included in the right
+       place in head.S */
+
+#include <linux/elfnote.h>
+#include <asm/boot.h>
+#include <xen/interface/elfnote.h>
+
+ENTRY(startup_xen)
+       movl %esi,xen_start_info
+       jmp startup_paravirt
+       
+.pushsection ".bss.page_aligned"
+ENTRY(hypercall_page)
+       .skip 0x1000
+.popsection
+
+       /* ELF notes describing this kernel to the Xen domain builder. */
+       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz, "linux")
+       ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION,  .asciz, "2.6")
+       ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION,    .asciz, "xen-3.0")
+       ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE,      .long,  __PAGE_OFFSET)
+       ELFNOTE(Xen, XEN_ELFNOTE_ENTRY,          .long,  startup_xen)
+       ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long,  hypercall_page)
+       ELFNOTE(Xen, XEN_ELFNOTE_FEATURES,       .asciz, "!writable_page_tables|pae_pgdir_above_4gb")
+#ifdef CONFIG_X86_PAE
+       ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz, "yes")
+#else
+       ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE,       .asciz, "no")
+#endif
+       ELFNOTE(Xen, XEN_ELFNOTE_LOADER,         .asciz, "generic")
===================================================================
--- /dev/null
+++ b/arch/i386/xen/xen-ops.h
@@ -0,0 +1,20 @@
+#ifndef XEN_OPS_H
+#define XEN_OPS_H
+
+#include <linux/init.h>
+
+extern struct start_info *xen_start_info;
+extern struct shared_info *HYPERVISOR_shared_info;
+
+char * __init xen_memory_setup(void);
+void __init xen_arch_setup(void);
+void __init xen_init_IRQ(void);
+
+void __init xen_time_init(void);
+unsigned long xen_get_wallclock(void);
+int xen_set_wallclock(unsigned long time);
+
+void stop_hz_timer(void);
+void start_hz_timer(void);
+
+#endif /* XEN_OPS_H */
===================================================================
--- a/include/asm-i386/hypercall.h
+++ b/include/asm-i386/hypercall.h
@@ -39,9 +39,6 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/sched.h>
 #include <xen/interface/physdev.h>
-
-#define __STR(x) #x
-#define STR(x) __STR(x)
 
 extern struct { char _entry[32]; } hypercall_page[];
 
@@ -413,4 +410,22 @@ MULTI_mmuext_op(struct multicall_entry *
        mcl->args[2] = (unsigned long)success_count;
        mcl->args[3] = domid;
 }
+
+/* Fill in @mcl as a set_gdt hypercall with the given frame list and count. */
+static inline void
+MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
+{
+       mcl->op = __HYPERVISOR_set_gdt;
+       mcl->args[0] = (unsigned long)frames;
+       mcl->args[1] = entries;
+}
+
+/* Fill in @mcl as a stack_switch hypercall to the stack at @ss:@esp. */
+static inline void
+MULTI_stack_switch(struct multicall_entry *mcl,
+                  unsigned long ss, unsigned long esp)
+{
+       mcl->op = __HYPERVISOR_stack_switch;
+       mcl->args[0] = ss;
+       mcl->args[1] = esp;
+}
+
 #endif /* __HYPERCALL_H__ */
===================================================================
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -43,6 +43,7 @@ extern void fixup_irqs(cpumask_t map);
 extern void fixup_irqs(cpumask_t map);
 #endif
 
+fastcall unsigned int do_IRQ(struct pt_regs *regs);
 void init_IRQ(void);
 void __init native_init_IRQ(void);
 
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -31,6 +31,7 @@ struct Xgt_desc_struct;
 struct Xgt_desc_struct;
 struct tss_struct;
 struct mm_struct;
+struct i386_pda;
 struct paravirt_ops
 {
        int paravirt_enabled;
@@ -53,6 +54,7 @@ struct paravirt_ops
        void (*arch_setup)(void);
        char *(*memory_setup)(void);
        void (*init_IRQ)(void);
+       void (*init_pda)(struct i386_pda *, int cpu);
 
        void (*pagetable_setup_start)(pgd_t *pgd_base);
        void (*pagetable_setup_done)(pgd_t *pgd_base);
@@ -200,6 +202,30 @@ extern struct paravirt_ops paravirt_ops;
 
 void native_pagetable_setup_start(pgd_t *pgd);
 
+/* Non-paravirtualized implementations of various operations for
+   back-ends which don't need their own version. */
+fastcall void native_clts(void);
+
+fastcall unsigned long native_read_cr0(void);
+fastcall void native_write_cr0(unsigned long val);
+
+fastcall unsigned long native_read_cr2(void);
+fastcall void native_write_cr2(unsigned long val);
+
+fastcall unsigned long native_read_cr3(void);
+fastcall void native_write_cr3(unsigned long val);
+
+fastcall unsigned long native_read_cr4(void);
+fastcall unsigned long native_read_cr4_safe(void);
+fastcall void native_write_cr4(unsigned long val);
+
+fastcall void native_wbinvd(void);
+
+fastcall unsigned long long native_read_msr(unsigned int msr, int *err);
+fastcall int native_write_msr(unsigned int msr, unsigned long long val);
+fastcall unsigned long long native_read_tsc(void);
+fastcall unsigned long long native_read_pmc(void);
+
 #ifdef CONFIG_X86_PAE
 fastcall unsigned long long native_pte_val(pte_t);
 fastcall unsigned long long native_pmd_val(pmd_t);
@@ -405,6 +431,19 @@ static inline void paravirt_exit_mmap(st
 {
        paravirt_ops.exit_mmap(mm);
 }
+
+/* Invoke the backend's init_pda hook, if it provides one. */
+static inline void paravirt_init_pda(struct i386_pda *pda, int cpu)
+{
+       if (!paravirt_ops.init_pda)
+               return;
+
+       paravirt_ops.init_pda(pda, cpu);
+}
+
+fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high);
+fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high);
+fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high);
+fastcall void native_store_gdt(struct Xgt_desc_struct *dtr);
+fastcall void native_store_idt(struct Xgt_desc_struct *dtr);
+fastcall unsigned long native_store_tr(void);
 
 #define __flush_tlb() paravirt_ops.flush_tlb_user()
 #define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
@@ -699,5 +738,8 @@ static inline void paravirt_exit_mmap(st
 {
 }
 
+static inline void paravirt_init_pda(struct i386_pda *pda, int cpu)
+{
+}
 #endif /* CONFIG_PARAVIRT */
 #endif /* __ASM_PARAVIRT_H */
===================================================================
--- a/include/asm-i386/pda.h
+++ b/include/asm-i386/pda.h
@@ -16,6 +16,17 @@ struct i386_pda
        int cpu_number;
        struct task_struct *pcurrent;   /* current process */
        struct pt_regs *irq_regs;
+
+#ifdef CONFIG_PARAVIRT
+       union {
+#ifdef CONFIG_XEN
+               struct {
+                       struct vcpu_info *vcpu;
+                       unsigned long cr3;
+               } xen;
+#endif /* CONFIG_XEN */
+       };
+#endif /* CONFIG_PARAVIRT */
 };
 
 extern struct i386_pda *_cpu_pda[];
===================================================================
--- /dev/null
+++ b/include/xen/events.h
@@ -0,0 +1,28 @@
+#ifndef _XEN_EVENTS_H
+#define _XEN_EVENTS_H
+
+#include <linux/irq.h>
+
+/*
+ * Bind an event channel, or a VIRQ on a given CPU, to an interrupt
+ * handler.  The handler, irqflags, devname and dev_id arguments follow
+ * the usual interrupt handler registration conventions.
+ */
+int bind_evtchn_to_irqhandler(unsigned int evtchn,
+                             irqreturn_t (*handler)(int, void *),
+                             unsigned long irqflags, const char *devname,
+                             void *dev_id);
+int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
+                           irqreturn_t (*handler)(int, void *),
+                           unsigned long irqflags, const char *devname,
+                           void *dev_id);
+
+/*
+ * Common unbind function for all event sources. Takes IRQ to unbind from.
+ * Automatically closes the underlying event channel (even for bindings
+ * made with bind_evtchn_to_irqhandler()).
+ */
+void unbind_from_irqhandler(unsigned int irq, void *dev_id);
+
+/*
+ * Send an event on event channel @port.  The result of the hypercall
+ * is deliberately discarded.
+ */
+static inline void notify_remote_via_evtchn(int port)
+{
+       struct evtchn_send op = { .port = port };
+
+       (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &op);
+}
+
+extern void notify_remote_via_irq(int irq);
+#endif /* _XEN_EVENTS_H */
===================================================================
--- /dev/null
+++ b/include/xen/features.h
@@ -0,0 +1,26 @@
+/******************************************************************************
+ * features.h
+ *
+ * Query the features reported by Xen.
+ *
+ * Copyright (c) 2006, Ian Campbell
+ */
+
+#ifndef __XEN_FEATURES_H__
+#define __XEN_FEATURES_H__
+
+#include <xen/interface/features.h>
+
+void xen_setup_features(void);
+
+extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32];
+
+/*
+ * Return the recorded value of Xen feature @flag.
+ *
+ * @flag indexes xen_features[] directly and is not range-checked; it is
+ * presumably always one of the XENFEAT_* constants.
+ */
+static inline int xen_feature(int flag)
+{
+       return xen_features[flag];
+}
+
+#endif /* __XEN_FEATURES_H__ */
===================================================================
--- /dev/null
+++ b/include/xen/page.h
@@ -0,0 +1,175 @@
+#ifndef __XEN_PAGE_H
+#define __XEN_PAGE_H
+
+#include <linux/pfn.h>
+
+#include <asm/uaccess.h>
+
+#include <xen/features.h>
+
+/*
+ * Address types.  Each address is wrapped in its own single-member struct,
+ * so xmaddr_t and xpaddr_t are distinct types and cannot be assigned to
+ * one another without an explicit conversion.  With CONFIG_X86_PAE the
+ * wrapped value is an unsigned long long, otherwise an unsigned long.
+ */
+#ifdef CONFIG_X86_PAE
+/* Xen machine address */
+typedef struct xmaddr {
+       unsigned long long maddr;
+} xmaddr_t;
+
+/* Xen pseudo-physical address */
+typedef struct xpaddr {
+       unsigned long long paddr;
+} xpaddr_t;
+#else
+/* Xen machine address */
+typedef struct xmaddr {
+       unsigned long maddr;
+} xmaddr_t;
+
+/* Xen pseudo-physical address */
+typedef struct xpaddr {
+       unsigned long paddr;
+} xpaddr_t;
+#endif
+
+#define XMADDR(x)      ((xmaddr_t) { .maddr = (x) })
+#define XPADDR(x)      ((xpaddr_t) { .paddr = (x) })
+
+/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
+/* All-ones p2m entry: phys_to_machine_mapping_valid() treats it as invalid. */
+#define INVALID_P2M_ENTRY      (~0UL)
+/* Bit 31 of a p2m entry; FOREIGN_FRAME() sets it, pfn_to_mfn() masks it. */
+#define FOREIGN_FRAME_BIT      (1UL<<31)
+#define FOREIGN_FRAME(m)       ((m) | FOREIGN_FRAME_BIT)
+
+extern unsigned long *phys_to_machine_mapping;
+
+/*
+ * Translate a pseudo-physical frame number to a machine frame number.
+ * When XENFEAT_auto_translated_physmap is set the pfn is returned
+ * unchanged; otherwise the p2m table entry is returned with
+ * FOREIGN_FRAME_BIT cleared.
+ */
+static inline unsigned long pfn_to_mfn(unsigned long pfn)
+{
+       unsigned long entry;
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return pfn;
+
+       entry = phys_to_machine_mapping[(unsigned int)(pfn)];
+       return entry & ~FOREIGN_FRAME_BIT;
+}
+
+/*
+ * Return non-zero if @pfn has a usable p2m entry.  With
+ * XENFEAT_auto_translated_physmap every pfn is reported as valid;
+ * otherwise the entry must differ from INVALID_P2M_ENTRY.
+ */
+static inline int phys_to_machine_mapping_valid(unsigned long pfn)
+{
+       int valid = 1;
+
+       if (!xen_feature(XENFEAT_auto_translated_physmap))
+               valid = (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY);
+
+       return valid;
+}
+
+/*
+ * Translate a machine frame number to a pseudo-physical frame number.
+ * With XENFEAT_auto_translated_physmap the mfn is returned unchanged;
+ * otherwise the value is read from machine_to_phys_mapping[].
+ */
+static inline unsigned long mfn_to_pfn(unsigned long mfn)
+{
+       unsigned long result = 0;
+
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return mfn;
+
+#if 0
+       if (unlikely((mfn >> machine_to_phys_order) != 0))
+               return max_mapnr;
+#endif
+
+       /*
+        * The m2p lookup may fault (e.g., device space beyond end of RAM).
+        * The value returned in that case is meaningless, so the result of
+        * __get_user() is ignored; what matters is that the fault is
+        * handled instead of crashing.
+        */
+       __get_user(result, &machine_to_phys_mapping[mfn]);
+
+       return result;
+}
+
+/*
+ * Convert a pseudo-physical address to a machine address: the frame number
+ * is translated with pfn_to_mfn() and the offset within the page is kept.
+ */
+static inline xmaddr_t phys_to_machine(xpaddr_t phys)
+{
+       unsigned offset = phys.paddr & ~PAGE_MASK;
+       unsigned long mfn = pfn_to_mfn(PFN_DOWN(phys.paddr));
+
+       /*
+        * Widen the frame number before PFN_PHYS() shifts it: with PAE the
+        * machine address is an unsigned long long, and shifting at
+        * unsigned long width would drop the bits above the low word.
+        */
+       return XMADDR(PFN_PHYS((unsigned long long)mfn) | offset);
+}
+
+/*
+ * Convert a machine address to a pseudo-physical address: the frame number
+ * is translated with mfn_to_pfn() and the offset within the page is kept.
+ */
+static inline xpaddr_t machine_to_phys(xmaddr_t machine)
+{
+       unsigned offset = machine.maddr & ~PAGE_MASK;
+       unsigned long pfn = mfn_to_pfn(PFN_DOWN(machine.maddr));
+
+       /*
+        * Widen the frame number before PFN_PHYS() shifts it: with PAE the
+        * pseudo-physical address is an unsigned long long, and shifting at
+        * unsigned long width would drop the bits above the low word.
+        */
+       return XPADDR(PFN_PHYS((unsigned long long)pfn) | offset);
+}
+
+/*
+ * We detect special mappings in one of two ways:
+ *  1. If the MFN is an I/O page then Xen will set the m2p entry
+ *     to be outside our maximum possible pseudophys range.
+ *  2. If the MFN belongs to a different domain then we will certainly
+ *     not have MFN in our p2m table. Conversely, if the page is ours,
+ *     then we'll have p2m(m2p(MFN))==MFN.
+ * If we detect a special mapping then it doesn't have a 'struct page'.
+ * We force !pfn_valid() by returning an out-of-range pointer.
+ *
+ * NB. These checks require that, for any MFN that is not in our reservation,
+ * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
+ * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
+ * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
+ *
+ * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
+ *      use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
+ *      require. In all the cases we care about, the FOREIGN_FRAME bit is
+ *      masked (e.g., pfn_to_mfn()) so behaviour there is correct.
+ */
+static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
+{
+       extern unsigned long max_mapnr;
+       unsigned long pfn = mfn_to_pfn(mfn);
+
+       /* Already outside [0, max_mapnr): hand it back unchanged. */
+       if (pfn >= max_mapnr)
+               return pfn;
+
+       /* The p2m table is not consulted for an auto-translated physmap. */
+       if (xen_feature(XENFEAT_auto_translated_physmap))
+               return pfn;
+
+       /* p2m(m2p(mfn)) != mfn: not our page, so force !pfn_valid(). */
+       if (phys_to_machine_mapping[pfn] != mfn)
+               return max_mapnr;
+
+       return pfn;
+}
+
+/*
+ * Record @mfn as the machine frame backing @pfn in the p2m table.
+ * With XENFEAT_auto_translated_physmap no table write is made; the only
+ * accepted arguments are then mfn == pfn or mfn == INVALID_P2M_ENTRY,
+ * and anything else trips BUG_ON().
+ */
+static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+       if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+               phys_to_machine_mapping[pfn] = mfn;
+               return;
+       }
+
+       BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+}
+
+/* VIRT <-> MACHINE conversion */
+#define virt_to_machine(v)     (phys_to_machine(XPADDR(__pa(v))))
+#define virt_to_mfn(v)         (pfn_to_mfn(PFN_DOWN(__pa(v))))
+#define mfn_to_virt(m)         (__va(mfn_to_pfn(m) << PAGE_SHIFT))
+
+#ifdef CONFIG_X86_PAE
+#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\
+                       (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
+
+/*
+ * Build a PAE pte from a machine frame number and protection bits.
+ * The frame number and @pgprot are split across pte_high/pte_low and each
+ * half is masked with the matching half of __supported_pte_mask.
+ */
+static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+       pte_t pte;
+
+       pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) |
+               (pgprot_val(pgprot) >> 32);
+       pte.pte_high &= (__supported_pte_mask >> 32);
+       pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
+       pte.pte_low &= __supported_pte_mask;
+
+       return pte;
+}
+
+/* Combine the two halves of a PAE pte into one 64-bit value. */
+static inline unsigned long long pte_val_ma(pte_t x)
+{
+       unsigned long long val = x.pte_high;
+
+       val <<= 32;
+       val |= x.pte_low;
+       return val;
+}
+#define pmd_val_ma(v) ((v).pmd)
+#define pud_val_ma(v) ((v).pgd.pgd)
+#else  /* !X86_PAE */
+#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
+/* Non-PAE: frame number shifted by PAGE_SHIFT, ORed with the prot bits. */
+#define mfn_pte(pfn, prot)     \
+       __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
+#define pte_val_ma(x)  ((x).pte_low)
+#define pmd_val_ma(v)  ((v).pud.pgd.pgd)
+#endif /* CONFIG_X86_PAE */
+#define pgd_val_ma(x)  ((x).pgd)
+
+#define __pte_ma(x)    ((pte_t) { (x) } )
+
+xmaddr_t arbitrary_virt_to_machine(unsigned long address);
+void make_lowmem_page_readonly(void *vaddr);
+void make_lowmem_page_readwrite(void *vaddr);
+
+#endif /* __XEN_PAGE_H */

-- 


_______________________________________________
Xen-devel mailing list
Xen-devel@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-devel
WARNING - OLD ARCHIVES

xen-devel

[Xen-devel] [patch 16/21] Xen-paravirt: Add code into head.S to handle b