WARNING - OLD ARCHIVES

This is an archived copy of the Xen.org mailing list, which we have preserved to ensure that existing links to archives are not broken. The live archive, which contains the latest emails, can be found at http://lists.xen.org/
   
 
 
Xen 
 
Home Products Support Community News
 
   
 

xen-merge

[Xen-merge] [PATCH 23/23] Xen subarch (largely copied files still)

To: xen-merge@xxxxxxxxxxxxxxxxxxx
Subject: [Xen-merge] [PATCH 23/23] Xen subarch (largely copied files still)
From: Chris Wright <chrisw@xxxxxxxx>
Date: Mon, 08 Aug 2005 00:02:59 -0700
Delivery-date: Mon, 08 Aug 2005 07:06:52 +0000
Envelope-to: www-data@xxxxxxxxxxxxxxxxxxx
List-help: <mailto:xen-merge-request@lists.xensource.com?subject=help>
List-id: xen-merge <xen-merge.lists.xensource.com>
List-post: <mailto:xen-merge@lists.xensource.com>
List-subscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-merge>, <mailto:xen-merge-request@lists.xensource.com?subject=subscribe>
List-unsubscribe: <http://lists.xensource.com/cgi-bin/mailman/listinfo/xen-merge>, <mailto:xen-merge-request@lists.xensource.com?subject=unsubscribe>
References: <20050808070236.231405000@xxxxxxxxxxxxxxxxxxxxx>
Sender: xen-merge-bounces@xxxxxxxxxxxxxxxxxxx
===================================================================
--- linux-2.6.12-xen0-arch.orig/arch/i386/Kconfig
+++ linux-2.6.12-xen0-arch/arch/i386/Kconfig
@@ -106,6 +106,13 @@ config X86_VISWS
          A kernel compiled for the Visual Workstation will not run on PCs
          and vice versa. See <file:Documentation/sgi-visws.txt> for details.
 
+config X86_XEN
+       bool "Xen Paravirtualized Linux"
+       select XEN
+       help
+         This option will build a kernel that will run on a Xen
+         paravirtualized machine.
+
 config X86_GENERICARCH
        bool "Generic architecture (Summit, bigsmp, ES7000, default)"
        depends on SMP
@@ -124,6 +131,10 @@ config X86_ES7000
 
 endchoice
 
+if X86_XEN
+source arch/i386/mach-xen/Kconfig
+endif
+
 config ACPI_SRAT
        bool
        default y
@@ -589,12 +600,12 @@ config X86_VISWS_APIC
 
 config X86_TSC
        bool
-       depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ
+       depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ && !X86_XEN
        default y
 
 config X86_MCE
        bool "Machine Check Exception"
-       depends on !X86_VOYAGER
+       depends on !X86_VOYAGER && !X86_XEN
        ---help---
          Machine Check Exception support allows the processor to notify the
          kernel if it detects a problem (e.g. overheating, component failure).
@@ -701,6 +712,7 @@ config MICROCODE
 
 config X86_MSR
        tristate "/dev/cpu/*/msr - Model-specific register support"
+       depends on !X86_XEN
        help
          This device gives privileged processes access to the x86
          Model-Specific Registers (MSRs).  It is a character device with
Index: linux-2.6.12-xen0-arch/arch/i386/kernel/early_printk.c
===================================================================
--- linux-2.6.12-xen0-arch.orig/arch/i386/kernel/early_printk.c
+++ linux-2.6.12-xen0-arch/arch/i386/kernel/early_printk.c
@@ -1,2 +1,2 @@
 
-#include "../../x86_64/kernel/early_printk.c"
+#include "../../../x86_64/kernel/early_printk.c"
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/boot/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/boot/Makefile
@@ -0,0 +1,8 @@
+
+OBJCOPYFLAGS := -g --strip-unneeded
+
+vmlinuz: vmlinux-stripped FORCE
+       $(call if_changed,gzip)
+
+vmlinux-stripped: vmlinux FORCE
+       $(call if_changed,objcopy)
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig
@@ -0,0 +1,159 @@
+#
+# For a description of the syntax of this configuration file,
+# see Documentation/kbuild/kconfig-language.txt.
+#
+
+menu "XEN"
+
+config XEN_PRIVILEGED_GUEST
+       bool "Privileged Guest (domain 0)"
+       default n
+       select XEN_PHYSDEV_ACCESS
+       help
+         Support for privileged operation (domain 0)
+
+config XEN_PHYSDEV_ACCESS
+       bool "Physical device access"
+       default XEN_PRIVILEGED_GUEST
+       help
+         Assume access is available to physical hardware devices
+         (e.g., hard drives, network cards). This allows you to configure
+         such devices and also includes some low-level support that is
+         otherwise not compiled into the kernel.
+
+config XEN_BLKDEV_BACKEND
+       bool "Block-device backend driver"
+       depends on XEN_PHYSDEV_ACCESS
+       default y
+       help
+         The block-device backend driver allows the kernel to export its
+         block devices to other guests via a high-performance shared-memory
+         interface.
+
+config XEN_BLKDEV_TAP_BE
+        bool "Block Tap support for backend driver (DANGEROUS)"
+        depends on XEN_BLKDEV_BACKEND
+        default n
+        help
+          If you intend to use the block tap driver, the backend domain will
+          not know the domain id of the real frontend, and so will not be able
+          to map its data pages.  This modifies the backend to attempt to map
+          from both the tap domain and the real frontend.  This presents a
+          security risk, and so should ONLY be used for development
+          with the blktap.  This option will be removed as the block drivers are
+          modified to use grant tables.
+
+config XEN_BLKDEV_GRANT
+        bool "Grant table substrate for block drivers"
+        depends on !XEN_BLKDEV_TAP_BE
+        default y
+        help
+          This introduces the use of grant tables as a data exchange mechanism
+          between the frontend and backend block drivers. This currently
+          conflicts with the block tap.
+
+config XEN_NETDEV_BACKEND
+       bool "Network-device backend driver"
+       depends on XEN_PHYSDEV_ACCESS
+       default y
+       help
+         The network-device backend driver allows the kernel to export its
+         network devices to other guests via a high-performance shared-memory
+         interface.
+
+config XEN_BLKDEV_FRONTEND
+       bool "Block-device frontend driver"
+       default y
+       help
+         The block-device frontend driver allows the kernel to access block
+         devices mounted within another guest OS. Unless you are building a
+         dedicated device-driver domain, or your master control domain
+         (domain 0), then you almost certainly want to say Y here.
+
+config XEN_NETDEV_FRONTEND
+       bool "Network-device frontend driver"
+       default y
+       help
+         The network-device frontend driver allows the kernel to access
+         network interfaces within another guest OS. Unless you are building a
+         dedicated device-driver domain, or your master control domain
+         (domain 0), then you almost certainly want to say Y here.
+
+config XEN_NETDEV_GRANT_TX
+        bool "Grant table substrate for net drivers tx path (DANGEROUS)"
+        default n
+        help
+          This introduces the use of grant tables as a data exchange mechanism
+          between the frontend and backend network drivers.
+
+config XEN_NETDEV_GRANT_RX
+        bool "Grant table substrate for net drivers rx path (DANGEROUS)"
+        default n
+        help
+          This introduces the use of grant tables as a data exchange mechanism
+          between the frontend and backend network drivers.
+
+config XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER
+       bool "Pipelined transmitter (DANGEROUS)"
+       depends on XEN_NETDEV_FRONTEND
+       default n
+       help
+         The driver will assume that the backend is pipelining packets for
+         transmission: whenever packets are pending in the remote backend,
+         the driver will not send asynchronous notifications when it queues
+         additional packets for transmission.
+         If the backend is a dumb domain, such as a transparent Ethernet
+         bridge with no local IP interface, it is safe to say Y here to get
+         slightly lower network overhead.
+         If the backend has a local IP interface; or may be doing smart things
+         like reassembling packets to perform firewall filtering; or if you
+         are unsure; or if you experience network hangs when this option is
+         enabled; then you must say N here.
+
+config XEN_BLKDEV_TAP
+       bool "Block device tap driver"
+       default n
+       help
+         This driver allows a VM to interact on block device channels
+         to other VMs.  Block messages may be passed through or redirected
+         to a character device, allowing device prototyping in application
+         space.  Odds are that you want to say N here.
+
+config XEN_SHADOW_MODE
+       bool "Fake shadow mode"
+       default n
+    help
+      fakes out a shadow mode kernel
+
+
+config XEN_SCRUB_PAGES
+       bool "Scrub memory before freeing it to Xen"
+       default y
+       help
+         Erase memory contents before freeing it back to Xen's global
+         pool. This ensures that any secrets contained within that
+         memory (e.g., private keys) cannot be found by other guests that
+         may be running on the machine. Most people will want to say Y here.
+         If security is not a concern then you may increase performance by
+         saying N.
+endmenu
+
+config XEN
+       boolean
+
+config HAVE_ARCH_DEV_ALLOC_SKB
+       bool
+       default y
+
+config NO_IDLE_HZ
+       bool
+       default y
+
+#Placeholder
+#source "drivers/xen/Kconfig.drivers"
+
+#if XEN_PRIVILEGED_GUEST
+#menu "Power management options"
+#source "drivers/acpi/Kconfig"
+#endmenu
+#endif
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig.debug
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig.debug
@@ -0,0 +1,129 @@
+menu "Kernel hacking"
+
+source "lib/Kconfig.debug"
+
+# X86
+config EARLY_PRINTK
+       bool "Early printk" if EMBEDDED && DEBUG_KERNEL
+       default y
+       depends on X86
+       help
+         Write kernel log output directly into the VGA buffer or to a serial
+         port.
+
+         This is useful for kernel debugging when your machine crashes very
+         early before the console code is initialized. For normal operation
+         it is not recommended because it looks ugly and doesn't cooperate
+         with klogd/syslogd or the X server. You should normally say N here,
+         unless you want to debug such a crash.
+
+config DEBUG_STACKOVERFLOW
+       bool "Check for stack overflows"
+       depends on DEBUG_KERNEL && X86
+
+config KPROBES
+       bool "Kprobes"
+       depends on DEBUG_KERNEL && X86
+       help
+         Kprobes allows you to trap at almost any kernel address and
+         execute a callback function.  register_kprobe() establishes
+         a probepoint and specifies the callback.  Kprobes is useful
+         for kernel debugging, non-intrusive instrumentation and testing.
+         If in doubt, say "N".
+
+config DEBUG_STACK_USAGE
+       bool "Stack utilization instrumentation"
+       depends on DEBUG_KERNEL && X86
+       help
+         Enables the display of the minimum amount of free stack which each
+         task has ever had available in the sysrq-T and sysrq-P debug output.
+
+         This option will slow down process creation somewhat.
+
+comment "Page alloc debug is incompatible with Software Suspend on i386"
+       depends on DEBUG_KERNEL && SOFTWARE_SUSPEND && X86
+
+config DEBUG_PAGEALLOC
+       bool "Page alloc debugging"
+       depends on DEBUG_KERNEL && !SOFTWARE_SUSPEND && X86
+       help
+         Unmap pages from the kernel linear mapping after free_pages().
+         This results in a large slowdown, but helps to find certain types
+         of memory corruptions.
+
+config 4KSTACKS
+       bool "Use 4Kb for kernel stacks instead of 8Kb"
+       depends on DEBUG_KERNEL && X86
+       help
+         If you say Y here the kernel will use a 4Kb stacksize for the
+         kernel stack attached to each process/thread. This facilitates
+         running more threads on a system and also reduces the pressure
+         on the VM subsystem for higher order allocations. This option
+         will also use IRQ stacks to compensate for the reduced stackspace.
+
+config X86_FIND_SMP_CONFIG
+       bool
+       depends on X86_LOCAL_APIC || X86_VOYAGER && X86
+       default y
+
+config X86_MPPARSE
+       bool
+       depends on X86_LOCAL_APIC && !X86_VISWS && X86
+       default y
+
+# X86_64
+
+# !SMP for now because the context switch early causes GPF in segment reloading
+# and the GS base checking does the wrong thing then, causing a hang.
+config CHECKING
+       bool "Additional run-time checks"
+       depends on DEBUG_KERNEL && !SMP && X86_64
+       help
+         Enables some internal consistency checks for kernel debugging.
+         You should normally say N.
+
+config INIT_DEBUG
+       bool "Debug __init statements"
+       depends on DEBUG_KERNEL && X86_64
+       help
+         Fill __init and __initdata at the end of boot. This helps debugging
+         illegal uses of __init and __initdata after initialization.
+
+config IOMMU_DEBUG
+       depends on GART_IOMMU && DEBUG_KERNEL && X86_64
+       bool "Enable IOMMU debugging"
+       help
+         Force the IOMMU to on even when you have less than 4GB of
+        memory and add debugging code. On overflow always panic. And
+        allow to enable IOMMU leak tracing. Can be disabled at boot
+        time with iommu=noforce. This will also enable scatter gather
+        list merging.  Currently not recommended for production
+        code. When you use it make sure you have a big enough
+        IOMMU/AGP aperture.  Most of the options enabled by this can
+        be set more finegrained using the iommu= command line
+        options. See Documentation/x86_64/boot-options.txt for more
+        details.
+
+config IOMMU_LEAK
+       bool "IOMMU leak tracing"
+       depends on DEBUG_KERNEL && X86_64
+       depends on IOMMU_DEBUG
+       help
+         Add a simple leak tracer to the IOMMU code. This is useful when you
+        are debugging a buggy device driver that leaks IOMMU mappings.
+
+#config X86_REMOTE_DEBUG
+#       bool "kgdb debugging stub"
+
+# X86 & X86_64
+config KPROBES
+       bool "Kprobes"
+       depends on DEBUG_KERNEL
+       help
+         Kprobes allows you to trap at almost any kernel address and
+         execute a callback function.  register_kprobe() establishes
+         a probepoint and specifies the callback.  Kprobes is useful
+         for kernel debugging, non-intrusive instrumentation and testing.
+         If in doubt, say "N".
+
+endmenu
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig.drivers
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/Kconfig.drivers
@@ -0,0 +1,94 @@
+# arch/i386/mach-xen/Kconfig.drivers
+
+menu "Device Drivers"
+
+source "drivers/base/Kconfig"
+
+if XEN_PHYSDEV_ACCESS
+source "drivers/mtd/Kconfig"
+source "drivers/parport/Kconfig"
+source "drivers/pnp/Kconfig"
+endif
+
+source "drivers/block/Kconfig"
+
+if XEN_PHYSDEV_ACCESS
+source "drivers/ide/Kconfig"
+endif
+
+source "drivers/scsi/Kconfig"
+
+if XEN_PHYSDEV_ACCESS
+source "drivers/cdrom/Kconfig"
+endif
+
+source "drivers/md/Kconfig"
+
+if XEN_PHYSDEV_ACCESS
+source "drivers/message/fusion/Kconfig"
+source "drivers/ieee1394/Kconfig"
+source "drivers/message/i2o/Kconfig"
+endif
+
+source "net/Kconfig"
+
+if XEN_PHYSDEV_ACCESS
+source "drivers/isdn/Kconfig"
+source "drivers/telephony/Kconfig"
+source "drivers/input/Kconfig"
+source "drivers/char/Kconfig"
+source "drivers/i2c/Kconfig"
+source "drivers/w1/Kconfig"
+source "drivers/misc/Kconfig"
+source "drivers/media/Kconfig"
+source "drivers/video/Kconfig"
+source "sound/Kconfig"
+source "drivers/usb/Kconfig"
+source "drivers/mmc/Kconfig"
+source "drivers/infiniband/Kconfig"
+endif
+
+if !XEN_PHYSDEV_ACCESS
+
+menu "Character devices"
+
+config UNIX98_PTYS
+       bool
+       default y
+
+config LEGACY_PTYS
+       bool "Legacy (BSD) PTY support"
+       default y
+       ---help---
+         A pseudo terminal (PTY) is a software device consisting of two
+         halves: a master and a slave. The slave device behaves identical to
+         a physical terminal; the master device is used by a process to
+         read data from and write data to the slave, thereby emulating a
+         terminal. Typical programs for the master side are telnet servers
+         and xterms.
+
+         Linux has traditionally used the BSD-like names /dev/ptyxx
+         for masters and /dev/ttyxx for slaves of pseudo
+         terminals. This scheme has a number of problems, including
+         security.  This option enables these legacy devices; on most
+         systems, it is safe to say N.
+
+
+config LEGACY_PTY_COUNT
+       int "Maximum number of legacy PTY in use"
+       depends on LEGACY_PTYS
+       range 1 256
+       default "256"
+       ---help---
+         The maximum number of legacy PTYs that can be used at any one time.
+         The default is 256, and should be more than enough.  Embedded
+         systems may want to reduce this to save memory.
+
+         When not in use, each legacy PTY occupies 12 bytes on 32-bit
+         architectures and 24 bytes on 64-bit architectures.
+
+endmenu
+
+endif
+
+endmenu
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/acpi/boot.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/acpi/boot.c
@@ -0,0 +1,912 @@
+/*
+ *  boot.c - Architecture-Specific Low-Level ACPI Boot Support
+ *
+ *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@xxxxxxxxx>
+ *  Copyright (C) 2001 Jun Nakajima <jun.nakajima@xxxxxxxxx>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <linux/init.h>
+#include <linux/config.h>
+#include <linux/acpi.h>
+#include <linux/efi.h>
+#include <linux/irq.h>
+#include <linux/module.h>
+
+#include <asm/pgtable.h>
+#include <asm/io_apic.h>
+#include <asm/apic.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/mpspec.h>
+#ifdef CONFIG_XEN
+#include <asm/fixmap.h>
+#endif
+
+void (*pm_power_off)(void) = NULL;
+
+#ifdef CONFIG_X86_64
+
+static inline void  acpi_madt_oem_check(char *oem_id, char *oem_table_id) { }
+extern void __init clustered_apic_check(void);
+static inline int ioapic_setup_disabled(void) { return 0; }
+#include <asm/proto.h>
+
+#else  /* X86 */
+
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <mach_apic.h>
+#include <mach_mpparse.h>
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#endif /* X86 */
+
+#define BAD_MADT_ENTRY(entry, end) (                                       \
+               (!entry) || (unsigned long)entry + sizeof(*entry) > end ||  \
+               ((acpi_table_entry_header *)entry)->length != sizeof(*entry))
+
+#define PREFIX                 "ACPI: "
+
+#ifdef CONFIG_ACPI_PCI
+int acpi_noirq __initdata;     /* skip ACPI IRQ initialization */
+int acpi_pci_disabled __initdata; /* skip ACPI PCI scan and IRQ initialization */
+#else
+int acpi_noirq __initdata = 1;
+int acpi_pci_disabled __initdata = 1;
+#endif
+int acpi_ht __initdata = 1;    /* enable HT */
+
+int acpi_lapic;
+int acpi_ioapic;
+int acpi_strict;
+EXPORT_SYMBOL(acpi_strict);
+
+acpi_interrupt_flags acpi_sci_flags __initdata;
+int acpi_sci_override_gsi __initdata;
+int acpi_skip_timer_override __initdata;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
+#endif
+
+#ifndef __HAVE_ARCH_CMPXCHG
+#warning ACPI uses CMPXCHG, i486 and later hardware
+#endif
+
+#define MAX_MADT_ENTRIES       256
+u8 x86_acpiid_to_apicid[MAX_MADT_ENTRIES] =
+                       { [0 ... MAX_MADT_ENTRIES-1] = 0xff };
+EXPORT_SYMBOL(x86_acpiid_to_apicid);
+
+/* --------------------------------------------------------------------------
+                              Boot-time Configuration
+   -------------------------------------------------------------------------- */
+
+/*
+ * The default interrupt routing model is PIC (8259).  This gets
+ * overridden if IOAPICs are enumerated (below).
+ */
+enum acpi_irq_model_id         acpi_irq_model = ACPI_IRQ_MODEL_PIC;
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
+
+/* rely on all ACPI tables being in the direct mapping */
+char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
+{
+       if (!phys_addr || !size)
+       return NULL;
+
+       if (phys_addr < (end_pfn_map << PAGE_SHIFT))
+               return __va(phys_addr);
+
+       return NULL;
+}
+
+#else
+
+/*
+ * Temporarily use the virtual area starting from FIX_ACPI_END,
+ * to map the target physical address. The problem is that set_fixmap()
+ * provides a single page, and it is possible that the page is not
+ * sufficient.
+ * By using this area, we can map up to FIX_ACPI_END - FIX_ACPI_BEGIN + 1
+ * pages temporarily, i.e. until the next __acpi_map_table() call.
+ *
+ * Important Safety Note:  The fixmap slot numbers are *subtracted*
+ * from the fixed base.  That's why we start at FIX_ACPI_END and
+ * count idx down while incrementing the phys address.
+ */
+char *__acpi_map_table(unsigned long phys, unsigned long size)
+{
+       unsigned long base, offset, mapped_size;
+       int idx;
+
+#ifndef CONFIG_XEN
+       if (phys + size < 8*1024*1024) 
+               return __va(phys); 
+#endif
+
+       offset = phys & (PAGE_SIZE - 1);
+       mapped_size = PAGE_SIZE - offset;
+       set_fixmap(FIX_ACPI_END, phys);
+       base = fix_to_virt(FIX_ACPI_END);
+
+       /*
+        * Most cases can be covered by the below.
+        */
+       idx = FIX_ACPI_END;
+       while (mapped_size < size) {
+               if (--idx < FIX_ACPI_BEGIN)
+                       return NULL;    /* cannot handle this */
+               phys += PAGE_SIZE;
+               set_fixmap(idx, phys);
+               mapped_size += PAGE_SIZE;
+       }
+
+       return ((unsigned char *) base + offset);
+}
+#endif
+
+#ifdef CONFIG_PCI_MMCONFIG
+static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size)
+{
+       struct acpi_table_mcfg *mcfg;
+
+       if (!phys_addr || !size)
+               return -EINVAL;
+
+       mcfg = (struct acpi_table_mcfg *) __acpi_map_table(phys_addr, size);
+       if (!mcfg) {
+               printk(KERN_WARNING PREFIX "Unable to map MCFG\n");
+               return -ENODEV;
+       }
+
+       if (mcfg->base_reserved) {
+               printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n");
+               return -ENODEV;
+       }
+
+       pci_mmcfg_base_addr = mcfg->base_address;
+
+       return 0;
+}
+#else
+#define        acpi_parse_mcfg NULL
+#endif /* !CONFIG_PCI_MMCONFIG */
+
+#ifdef CONFIG_X86_LOCAL_APIC
+static int __init
+acpi_parse_madt (
+       unsigned long           phys_addr,
+       unsigned long           size)
+{
+       struct acpi_table_madt  *madt = NULL;
+
+       if (!phys_addr || !size)
+               return -EINVAL;
+
+       madt = (struct acpi_table_madt *) __acpi_map_table(phys_addr, size);
+       if (!madt) {
+               printk(KERN_WARNING PREFIX "Unable to map MADT\n");
+               return -ENODEV;
+       }
+
+       if (madt->lapic_address) {
+               acpi_lapic_addr = (u64) madt->lapic_address;
+
+               printk(KERN_DEBUG PREFIX "Local APIC address 0x%08x\n",
+                       madt->lapic_address);
+       }
+
+       acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
+       
+       return 0;
+}
+
+
+static int __init
+acpi_parse_lapic (
+       acpi_table_entry_header *header, const unsigned long end)
+{
+       struct acpi_table_lapic *processor = NULL;
+
+       processor = (struct acpi_table_lapic*) header;
+
+       if (BAD_MADT_ENTRY(processor, end))
+               return -EINVAL;
+
+       acpi_table_print_madt_entry(header);
+
+       /* no utility in registering a disabled processor */
+       if (processor->flags.enabled == 0)
+               return 0;
+
+       x86_acpiid_to_apicid[processor->acpi_id] = processor->id;
+
+       mp_register_lapic (
+               processor->id,                                     /* APIC ID */
+               processor->flags.enabled);                        /* Enabled? */
+
+       return 0;
+}
+
+static int __init
+acpi_parse_lapic_addr_ovr (
+       acpi_table_entry_header *header, const unsigned long end)
+{
+       struct acpi_table_lapic_addr_ovr *lapic_addr_ovr = NULL;
+
+       lapic_addr_ovr = (struct acpi_table_lapic_addr_ovr*) header;
+
+       if (BAD_MADT_ENTRY(lapic_addr_ovr, end))
+               return -EINVAL;
+
+       acpi_lapic_addr = lapic_addr_ovr->address;
+
+       return 0;
+}
+
+static int __init
+acpi_parse_lapic_nmi (
+       acpi_table_entry_header *header, const unsigned long end)
+{
+       struct acpi_table_lapic_nmi *lapic_nmi = NULL;
+
+       lapic_nmi = (struct acpi_table_lapic_nmi*) header;
+
+       if (BAD_MADT_ENTRY(lapic_nmi, end))
+               return -EINVAL;
+
+       acpi_table_print_madt_entry(header);
+
+       if (lapic_nmi->lint != 1)
+               printk(KERN_WARNING PREFIX "NMI not connected to LINT 1!\n");
+
+       return 0;
+}
+
+
+#endif /*CONFIG_X86_LOCAL_APIC*/
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER)
+
+static int __init
+acpi_parse_ioapic (
+       acpi_table_entry_header *header, const unsigned long end)
+{
+       struct acpi_table_ioapic *ioapic = NULL;
+
+       ioapic = (struct acpi_table_ioapic*) header;
+
+       if (BAD_MADT_ENTRY(ioapic, end))
+               return -EINVAL;
+ 
+       acpi_table_print_madt_entry(header);
+
+       mp_register_ioapic (
+               ioapic->id,
+               ioapic->address,
+               ioapic->global_irq_base);
+ 
+       return 0;
+}
+
+/*
+ * Parse Interrupt Source Override for the ACPI SCI
+ */
+static void
+acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+{
+       if (trigger == 0)       /* compatible SCI trigger is level */
+               trigger = 3;
+
+       if (polarity == 0)      /* compatible SCI polarity is low */
+               polarity = 3;
+
+       /* Command-line over-ride via acpi_sci= */
+       if (acpi_sci_flags.trigger)
+               trigger = acpi_sci_flags.trigger;
+
+       if (acpi_sci_flags.polarity)
+               polarity = acpi_sci_flags.polarity;
+
+       /*
+        * mp_config_acpi_legacy_irqs() already setup IRQs < 16
+        * If GSI is < 16, this will update its flags,
+        * else it will create a new mp_irqs[] entry.
+        */
+       mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+
+       /*
+        * stash over-ride to indicate we've been here
+        * and for later update of acpi_fadt
+        */
+       acpi_sci_override_gsi = gsi;
+       return;
+}
+
+static int __init
+acpi_parse_int_src_ovr (
+       acpi_table_entry_header *header, const unsigned long end)
+{
+       struct acpi_table_int_src_ovr *intsrc = NULL;
+
+       intsrc = (struct acpi_table_int_src_ovr*) header;
+
+       if (BAD_MADT_ENTRY(intsrc, end))
+               return -EINVAL;
+
+       acpi_table_print_madt_entry(header);
+
+       if (intsrc->bus_irq == acpi_fadt.sci_int) {
+               acpi_sci_ioapic_setup(intsrc->global_irq,
+                       intsrc->flags.polarity, intsrc->flags.trigger);
+               return 0;
+       }
+
+       if (acpi_skip_timer_override &&
+               intsrc->bus_irq == 0 && intsrc->global_irq == 2) {
+                       printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
+                       return 0;
+       }
+
+       mp_override_legacy_irq (
+               intsrc->bus_irq,
+               intsrc->flags.polarity,
+               intsrc->flags.trigger,
+               intsrc->global_irq);
+
+       return 0;
+}
+
+
+static int __init
+acpi_parse_nmi_src (
+       acpi_table_entry_header *header, const unsigned long end)
+{
+       struct acpi_table_nmi_src *nmi_src = NULL;
+
+       nmi_src = (struct acpi_table_nmi_src*) header;
+
+       if (BAD_MADT_ENTRY(nmi_src, end))
+               return -EINVAL;
+
+       acpi_table_print_madt_entry(header);
+
+       /* TBD: Support nmi_src entries? */
+
+       return 0;
+}
+
+#endif /* CONFIG_X86_IO_APIC */
+
+#ifdef CONFIG_ACPI_BUS
+
+/*
+ * acpi_pic_sci_set_trigger()
+ * 
+ * use ELCR to set PIC-mode trigger type for SCI
+ *
+ * If a PIC-mode SCI is not recognized or gives spurious IRQ7's
+ * it may require Edge Trigger -- use "acpi_sci=edge"
+ *
+ * Port 0x4d0-4d1 are ELCR1 and ELCR2, the Edge/Level Control Registers
+ * for the 8259 PIC.  bit[n] = 1 means irq[n] is Level, otherwise Edge.
+ * ELCR1 is IRQ's 0-7 (IRQ 0, 1, 2 must be 0)
+ * ELCR2 is IRQ's 8-15 (IRQ 8, 13 must be 0)
+ */
+
+void __init
+acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
+{
+       unsigned int mask = 1 << irq;
+       unsigned int old, new;
+
+       /* Real old ELCR mask */
+       old = inb(0x4d0) | (inb(0x4d1) << 8);
+
+       /*
+        * If we use ACPI to set PCI irq's, then we should clear ELCR
+        * since we will set it correctly as we enable the PCI irq
+        * routing.
+        */
+       new = acpi_noirq ? old : 0;
+
+       /*
+        * Update SCI information in the ELCR, it isn't in the PCI
+        * routing tables..
+        */
+       switch (trigger) {
+       case 1: /* Edge - clear */
+               new &= ~mask;
+               break;
+       case 3: /* Level - set */
+               new |= mask;
+               break;
+       }
+
+       if (old == new)
+               return;
+
+       printk(PREFIX "setting ELCR to %04x (from %04x)\n", new, old);
+       outb(new, 0x4d0);
+       outb(new >> 8, 0x4d1);
+}
+
+
+#endif /* CONFIG_ACPI_BUS */
+
+int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
+{
+#ifdef CONFIG_X86_IO_APIC
+       if (use_pci_vector() && !platform_legacy_irq(gsi))
+               *irq = IO_APIC_VECTOR(gsi);
+       else
+#endif
+               *irq = gsi;
+       return 0;
+}
+
+unsigned int acpi_register_gsi(u32 gsi, int edge_level, int active_high_low)
+{
+       unsigned int irq;
+       unsigned int plat_gsi = gsi;
+
+#ifdef CONFIG_X86_IO_APIC
+       if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
+               plat_gsi = mp_register_gsi(gsi, edge_level, active_high_low);
+       }
+#endif
+       acpi_gsi_to_irq(plat_gsi, &irq);
+       return irq;
+}
+EXPORT_SYMBOL(acpi_register_gsi);
+
+/*
+ *  ACPI based hotplug support for CPU
+ */
+#ifdef CONFIG_ACPI_HOTPLUG_CPU
+/* CPU hotplug hook, not implemented yet: unconditionally fails. */
+int acpi_map_lsapic(acpi_handle handle, int *pcpu)
+{
+       return -EINVAL;         /* TBD */
+}
+EXPORT_SYMBOL(acpi_map_lsapic);
+
+
+/* CPU hotplug hook, not implemented yet: unconditionally fails. */
+int acpi_unmap_lsapic(int cpu)
+{
+       return -EINVAL;         /* TBD */
+}
+EXPORT_SYMBOL(acpi_unmap_lsapic);
+#endif /* CONFIG_ACPI_HOTPLUG_CPU */
+
+/*
+ * Look for the "RSD PTR " signature inside a physical memory window.
+ *
+ * @start:  physical address where the window begins
+ * @length: size of the window in bytes
+ *
+ * Only 16-byte aligned offsets are examined.  Returns the physical
+ * address of the first match, or 0 when the signature is not present.
+ */
+static unsigned long __init
+acpi_scan_rsdp (
+       unsigned long           start,
+       unsigned long           length)
+{
+       const unsigned long     sig_bytes = sizeof("RSD PTR ") - 1;
+       const char              *window = (const char *) isa_bus_to_virt(start);
+       unsigned long           pos;
+
+       for (pos = 0; pos < length; pos += 16)
+               if (!strncmp(window + pos, "RSD PTR ", sig_bytes))
+                       return start + pos;
+
+       return 0;
+}
+
+/*
+ * acpi_table_parse() handler for the ACPI_BOOT (SBF) table: records the
+ * table's sbf_cmos field in sbf_port.
+ *
+ * Returns 0 on success, -EINVAL for a zero address or size, and -ENODEV
+ * when the table cannot be mapped.
+ */
+static int __init acpi_parse_sbf(unsigned long phys_addr, unsigned long size)
+{
+       struct acpi_table_sbf *table;
+
+       if (phys_addr == 0 || size == 0)
+               return -EINVAL;
+
+       table = (struct acpi_table_sbf *) __acpi_map_table(phys_addr, size);
+       if (table == NULL) {
+               printk(KERN_WARNING PREFIX "Unable to map SBF\n");
+               return -ENODEV;
+       }
+
+       /* Save CMOS port */
+       sbf_port = table->sbf_cmos;
+
+       return 0;
+}
+
+
+#ifdef CONFIG_HPET_TIMER
+
+/*
+ * acpi_table_parse() handler for the HPET table: stores the timer block's
+ * base address (vxtime.hpet_address on x86_64, hpet_address otherwise) and
+ * logs it.
+ *
+ * Returns 0 on success, -EINVAL for a zero address or size, -ENODEV when
+ * the table cannot be mapped, and -1 when the timer is not memory mapped.
+ */
+static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
+{
+       struct acpi_table_hpet *tbl;
+
+       if (phys == 0 || size == 0)
+               return -EINVAL;
+
+       tbl = (struct acpi_table_hpet *) __acpi_map_table(phys, size);
+       if (tbl == NULL) {
+               printk(KERN_WARNING PREFIX "Unable to map HPET\n");
+               return -ENODEV;
+       }
+
+       /* Only a memory-mapped register block is usable */
+       if (tbl->addr.space_id != ACPI_SPACE_MEM) {
+               printk(KERN_WARNING PREFIX "HPET timers must be located in "
+                      "memory.\n");
+               return -1;
+       }
+
+#ifdef CONFIG_X86_64
+       /* 64-bit base address: combine the low and high halves */
+       vxtime.hpet_address = tbl->addr.addrl |
+               ((long) tbl->addr.addrh << 32);
+
+       printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
+              tbl->id, vxtime.hpet_address);
+#else  /* X86 */
+       {
+               extern unsigned long hpet_address;
+
+               /* 32-bit build: only the low half is recorded */
+               hpet_address = tbl->addr.addrl;
+               printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
+                       tbl->id, hpet_address);
+       }
+#endif /* X86 */
+
+       return 0;
+}
+#else
+#define        acpi_parse_hpet NULL
+#endif
+
+#ifdef CONFIG_X86_PM_TIMER
+extern u32 pmtmr_ioport;
+#endif
+
+/*
+ * acpi_table_parse() handler for the FADT.
+ *
+ * Copies the fields that early boot code needs out of the mapped table:
+ *   - acpi_fadt.sci_int                       (CONFIG_ACPI_INTERPRETER)
+ *   - acpi_fadt.revision and
+ *     acpi_fadt.force_apic_physical_destination_mode  (CONFIG_ACPI_BUS)
+ *   - pmtmr_ioport, the ACPI PM timer I/O port        (CONFIG_X86_PM_TIMER)
+ *
+ * Always returns 0, including when the table cannot be mapped or when a
+ * rev. 2 PM timer block is not in system I/O space (pmtmr_ioport is then
+ * left untouched).
+ */
+static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
+{
+       struct fadt_descriptor_rev2 *fadt = NULL;
+
+       fadt = (struct fadt_descriptor_rev2*) __acpi_map_table(phys,size);
+       if(!fadt) {
+               printk(KERN_WARNING PREFIX "Unable to map FADT\n");
+               return 0;
+       }
+
+#ifdef CONFIG_ACPI_INTERPRETER
+       /* initialize sci_int early for INT_SRC_OVR MADT parsing */
+       acpi_fadt.sci_int = fadt->sci_int;
+#endif
+
+#ifdef CONFIG_ACPI_BUS
+       /* initialize rev and apic_phys_dest_mode for x86_64 genapic */
+       acpi_fadt.revision = fadt->revision;
+       acpi_fadt.force_apic_physical_destination_mode = fadt->force_apic_physical_destination_mode;
+#endif
+
+#ifdef CONFIG_X86_PM_TIMER
+       /* detect the location of the ACPI PM Timer */
+       if (fadt->revision >= FADT2_REVISION_ID) {
+               /* FADT rev. 2 */
+               if (fadt->xpm_tmr_blk.address_space_id != ACPI_ADR_SPACE_SYSTEM_IO)
+                       return 0;
+
+               pmtmr_ioport = fadt->xpm_tmr_blk.address;
+       } else {
+               /* FADT rev. 1 */
+               pmtmr_ioport = fadt->V1_pm_tmr_blk;
+       }
+       if (pmtmr_ioport)
+               printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", pmtmr_ioport);
+#endif
+       return 0;
+}
+
+
+/*
+ * Locate the ACPI RSDP and return its physical address.
+ *
+ * With EFI enabled, an address published in efi.acpi20 (preferred) or
+ * efi.acpi is returned directly.  Otherwise the first 0x400 bytes of
+ * physical memory and then the 0xE0000-0xFFFFF window are scanned for the
+ * signature, and the result is installed in the FIX_ACPI_RSDP_PAGE fixmap
+ * slot before being returned.  The result is 0 when both scans fail.
+ */
+unsigned long __init
+acpi_find_rsdp (void)
+{
+       unsigned long           rsdp_phys;
+
+       if (efi_enabled) {
+               if (efi.acpi20)
+                       return __pa(efi.acpi20);
+               if (efi.acpi)
+                       return __pa(efi.acpi);
+       }
+
+       /* Low memory first, upper memory (E0000-FFFFF) as the fallback */
+       rsdp_phys = acpi_scan_rsdp (0, 0x400);
+       if (rsdp_phys == 0)
+               rsdp_phys = acpi_scan_rsdp (0xE0000, 0x20000);
+
+       set_fixmap(FIX_ACPI_RSDP_PAGE, rsdp_phys);
+
+       return rsdp_phys;
+}
+
+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Parse LAPIC entries in MADT
+ * returns 0 on success, < 0 on error
+ */
+static int __init
+acpi_parse_madt_lapic_entries(void)
+{
+       int count;
+
+       /*
+        * Note that the LAPIC address is obtained from the MADT (32-bit value)
+        * and (optionally) overridden by a LAPIC_ADDR_OVR entry (64-bit value).
+        */
+
+       count = acpi_table_parse_madt(ACPI_MADT_LAPIC_ADDR_OVR, acpi_parse_lapic_addr_ovr, 0);
+       if (count < 0) {
+               printk(KERN_ERR PREFIX "Error parsing LAPIC address override entry\n");
+               return count;
+       }
+
+       /* Register whichever address survived the optional override */
+       mp_register_lapic_address(acpi_lapic_addr);
+
+       /* At least one LAPIC entry is mandatory */
+       count = acpi_table_parse_madt(ACPI_MADT_LAPIC, acpi_parse_lapic,
+                                      MAX_APICS);
+       if (!count) {
+               printk(KERN_ERR PREFIX "No LAPIC entries present\n");
+               /* TBD: Cleanup to allow fallback to MPS */
+               return -ENODEV;
+       }
+       else if (count < 0) {
+               printk(KERN_ERR PREFIX "Error parsing LAPIC entry\n");
+               /* TBD: Cleanup to allow fallback to MPS */
+               return count;
+       }
+
+       /* LAPIC NMI entries are optional; only a parse error is fatal */
+       count = acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi, 0);
+       if (count < 0) {
+               printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
+               /* TBD: Cleanup to allow fallback to MPS */
+               return count;
+       }
+       return 0;
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_ACPI_INTERPRETER)
+/*
+ * Parse IOAPIC related entries in MADT
+ * returns 0 on success, < 0 on error
+ */
+static int __init
+acpi_parse_madt_ioapic_entries(void)
+{
+       int count;
+
+       /*
+        * ACPI interpreter is required to complete interrupt setup,
+        * so if it is off, don't enumerate the io-apics with ACPI.
+        * If MPS is present, it will handle them,
+        * otherwise the system will stay in PIC mode
+        */
+       if (acpi_disabled || acpi_noirq) {
+               return -ENODEV;
+       }
+
+       /*
+        * if "noapic" boot option, don't look for IO-APICs
+        */
+       if (skip_ioapic_setup) {
+               printk(KERN_INFO PREFIX "Skipping IOAPIC probe "
+                       "due to 'noapic' option.\n");
+               return -ENODEV;
+       }
+
+       /* At least one IOAPIC entry is mandatory */
+       count = acpi_table_parse_madt(ACPI_MADT_IOAPIC, acpi_parse_ioapic, MAX_IO_APICS);
+       if (!count) {
+               printk(KERN_ERR PREFIX "No IOAPIC entries present\n");
+               return -ENODEV;
+       }
+       else if (count < 0) {
+               printk(KERN_ERR PREFIX "Error parsing IOAPIC entry\n");
+               return count;
+       }
+
+       /* Interrupt source overrides are optional; only errors are fatal */
+       count = acpi_table_parse_madt(ACPI_MADT_INT_SRC_OVR, acpi_parse_int_src_ovr, NR_IRQ_VECTORS);
+       if (count < 0) {
+               printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n");
+               /* TBD: Cleanup to allow fallback to MPS */
+               return count;
+       }
+
+       /*
+        * If BIOS did not supply an INT_SRC_OVR for the SCI
+        * pretend we got one so we can set the SCI flags.
+        */
+       if (!acpi_sci_override_gsi)
+               acpi_sci_ioapic_setup(acpi_fadt.sci_int, 0, 0);
+
+       /* Fill in identity legacy mappings where no override */
+       mp_config_acpi_legacy_irqs();
+
+       /* NMI source entries are optional; only errors are fatal */
+       count = acpi_table_parse_madt(ACPI_MADT_NMI_SRC, acpi_parse_nmi_src, NR_IRQ_VECTORS);
+       if (count < 0) {
+               printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
+               /* TBD: Cleanup to allow fallback to MPS */
+               return count;
+       }
+
+       return 0;
+}
+#else
+/* Stub for builds without IO-APIC + ACPI interpreter: always reports failure. */
+static inline int acpi_parse_madt_ioapic_entries(void) { return -1; }
+#endif /* !(CONFIG_X86_IO_APIC && CONFIG_ACPI_INTERPRETER) */
+
+
+/*
+ * Parse the MADT, if present, and enumerate local APICs and IO-APICs.
+ *
+ * Side effects (CONFIG_X86_LOCAL_APIC only; otherwise a no-op):
+ *   - acpi_lapic = 1 once the LAPIC entries parsed cleanly
+ *   - when the IO-APIC entries parsed cleanly as well: acpi_irq_model is
+ *     set to ACPI_IRQ_MODEL_IOAPIC, acpi_ioapic = 1, smp_found_config = 1
+ *   - ACPI is disabled altogether when either parser returns -EINVAL
+ */
+static void __init
+acpi_process_madt(void)
+{
+#ifdef CONFIG_X86_LOCAL_APIC
+       int count, error;
+
+       count = acpi_table_parse(ACPI_APIC, acpi_parse_madt);
+       if (count >= 1) {
+
+               /*
+                * Parse MADT LAPIC entries
+                */
+               error = acpi_parse_madt_lapic_entries();
+               if (!error) {
+                       acpi_lapic = 1;
+
+                       /*
+                        * Parse MADT IO-APIC entries
+                        */
+                       error = acpi_parse_madt_ioapic_entries();
+                       if (!error) {
+                               acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
+                               acpi_irq_balance_set(NULL);
+                               acpi_ioapic = 1;
+
+                               smp_found_config = 1;
+                               clustered_apic_check();
+                       }
+               }
+               if (error == -EINVAL) {
+                       /*
+                        * Dell Precision Workstation 410, 610 come here.
+                        */
+                       printk(KERN_ERR PREFIX "Invalid BIOS MADT, disabling ACPI\n");
+                       disable_acpi();
+               }
+       }
+#endif
+       return;
+}
+
+/*
+ * acpi_boot_table_init() and acpi_boot_init()
+ *  called from setup_arch(), always.
+ *     1. checksums all tables
+ *     2. enumerates lapics
+ *     3. enumerates io-apics
+ *
+ * acpi_table_init() is separate to allow reading SRAT without
+ * other side effects.
+ *
+ * side effects of acpi_boot_init:
+ *     acpi_lapic = 1 if LAPIC found
+ *     acpi_ioapic = 1 if IOAPIC found
+ *     if (acpi_lapic && acpi_ioapic) smp_found_config = 1;
+ *     if acpi_blacklisted() acpi_disabled = 1;
+ *     acpi_irq_model=...
+ *     ...
+ *
+ * return value: (currently ignored)
+ *     0: success
+ *     !0: failure
+ */
+
+int __init
+acpi_boot_table_init(void)
+{
+       extern int acpi_force;
+       int rc;
+
+       /*
+        * Nothing to do when ACPI is disabled, except for acpi=ht, which
+        * still needs the tables in order to enumerate LAPICs.
+        */
+       if (acpi_disabled && !acpi_ht)
+               return 1;
+
+       /* Bring up the boot-time table parser; give up on ACPI if it fails */
+       rc = acpi_table_init();
+       if (rc) {
+               disable_acpi();
+               return rc;
+       }
+
+#ifdef __i386__
+       check_acpi_pci();
+#endif
+
+       acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
+
+       /* A blacklisted BIOS turns ACPI off unless acpi=force was given */
+       rc = acpi_blacklisted();
+       if (!rc)
+               return 0;
+
+       if (acpi_force) {
+               printk(KERN_WARNING PREFIX "acpi=force override\n");
+               return 0;
+       }
+
+       printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
+       disable_acpi();
+       return rc;
+}
+
+
+/*
+ * Second stage of ACPI boot-time setup: parses the SBF, FADT, MADT, HPET
+ * and MCFG tables, in that order.
+ *
+ * Returns 1 without doing anything when ACPI is disabled (and acpi=ht is
+ * not in effect), 0 otherwise.  The results of the individual table
+ * parsers are not checked here.
+ */
+int __init acpi_boot_init(void)
+{
+       /*
+        * If acpi_disabled, bail out
+        * One exception: acpi=ht continues far enough to enumerate LAPICs
+        */
+       if (acpi_disabled && !acpi_ht)
+                return 1;
+
+       /* NOTE(review): acpi_boot_table_init() parses ACPI_BOOT as well */
+       acpi_table_parse(ACPI_BOOT, acpi_parse_sbf);
+
+       /*
+        * set sci_int and PM timer address
+        */
+       acpi_table_parse(ACPI_FADT, acpi_parse_fadt);
+
+       /*
+        * Process the Multiple APIC Description Table (MADT), if present
+        */
+       acpi_process_madt();
+
+       acpi_table_parse(ACPI_HPET, acpi_parse_hpet);
+       acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg);
+
+       return 0;
+}
+
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/acpi/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/acpi/Makefile
@@ -0,0 +1,13 @@
+# boot.o is built from the boot.c in this directory; objects listed in
+# c-obj-* are built from sources symlinked in from arch/i386/kernel/acpi.
+obj-$(CONFIG_ACPI_BOOT)                        := boot.o
+c-obj-$(CONFIG_X86_IO_APIC)            += earlyquirk.o
+# NOTE(review): the link rule below creates a .c file for every c-obj
+# entry, so this expects wakeup.c to exist in the shared directory - confirm.
+c-obj-$(CONFIG_ACPI_SLEEP)             += sleep.o wakeup.o
+
+# Extra sources to symlink without building an object from them (none).
+c-link                                  :=
+
+# Create $(obj)/<name>.c as a symlink to the file of the same name in
+# arch/i386/kernel/acpi.
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+       @ln -fsn $(srctree)/arch/i386/kernel/acpi/$(notdir $@) $@
+
+# s-obj-y is not assigned anywhere in this Makefile, so it expands to
+# nothing here.
+obj-y  += $(c-obj-y) $(s-obj-y)
+
+# Remove the generated symlinks on "make clean".
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
+clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link))
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/apic.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/apic.c
@@ -0,0 +1,83 @@
+/*
+ *     Local APIC handling, local APIC timers
+ *
+ *     (c) 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx>
+ *
+ *     Fixes
+ *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
+ *                                     thanks to Eric Gilmore
+ *                                     and Rolf G. Tews
+ *                                     for testing these extensively.
+ *     Maciej W. Rozycki       :       Various updates and fixes.
+ *     Mikael Pettersson       :       Power Management for UP-APIC.
+ *     Pavel Machek and
+ *     Mikael Pettersson       :       PM converted to driver model.
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+
+#include <asm/atomic.h>
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/desc.h>
+#include <asm/arch_hooks.h>
+#include <asm/hpet.h>
+
+#include <mach_apic.h>
+
+#include "io_ports.h"
+
+/*
+ * Debug level
+ */
+int apic_verbosity;
+
+/* Physical-mode broadcast destination: a fixed 0xff in this subarch. */
+int get_physical_broadcast(void)
+{
+       const int broadcast_id = 0xff;
+
+       return broadcast_id;
+}
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+/*
+ * Report an interrupt that arrived on an unexpected vector and acknowledge
+ * it at the local APIC.
+ *
+ * The ack is mandatory: each local APIC has only N irq slots per priority
+ * level, and an unacknowledged interrupt keeps one of them busy.  Enough
+ * of those can lock up the APIC completely.  Unexpected vectors currently
+ * occur only on SMP and APIC configurations.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+       printk("unexpected IRQ trap at vector %02x\n", irq);
+
+       ack_APIC_irq();
+}
+
+/*
+ * This initializes the IO-APIC and APIC hardware if this is
+ * a UP kernel.
+ */
+int __init APIC_init_uniprocessor (void)
+{
+#ifdef CONFIG_X86_IO_APIC
+       /*
+        * Set up the IO-APIC only when an SMP configuration was found, the
+        * setup has not been suppressed, and at least one IO-APIC exists.
+        */
+       if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+               setup_IO_APIC();
+#endif
+
+       /* Always reports success */
+       return 0;
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/common.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/common.c
@@ -0,0 +1,650 @@
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <asm/semaphore.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/msr.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+#ifdef CONFIG_X86_LOCAL_APIC
+#include <asm/mpspec.h>
+#include <asm/apic.h>
+#include <mach_apic.h>
+#endif
+#include <asm-xen/hypervisor.h>
+
+#include "cpu.h"
+
+DEFINE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);
+EXPORT_PER_CPU_SYMBOL(cpu_gdt_table);
+
+DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]);
+EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack);
+
+static int cachesize_override __initdata = -1;
+static int disable_x86_fxsr __initdata = 0;
+static int disable_x86_serial_nr __initdata = 1;
+
+struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
+
+extern void mcheck_init(struct cpuinfo_x86 *c);
+
+extern void machine_specific_modify_cpu_capabilities(struct cpuinfo_x86 *c);
+
+extern int disable_pse;
+
+/*
+ * Fallback c_init hook used while no vendor-specific cpu_dev is selected.
+ * A CPU without CPUID (cpuid_level == -1) is named after its family:
+ * "486" for family 4, "386" for family 3.  Nothing else is touched.
+ */
+static void default_init(struct cpuinfo_x86 * c)
+{
+       /* CPUID is available: nothing to do here */
+       if (c->cpuid_level != -1)
+               return;
+
+       /* No cpuid. It must be an ancient CPU */
+       switch (c->x86) {
+       case 4:
+               strcpy(c->x86_model_id, "486");
+               break;
+       case 3:
+               strcpy(c->x86_model_id, "386");
+               break;
+       }
+}
+
+static struct cpu_dev default_cpu = {
+       .c_init = default_init,
+};
+static struct cpu_dev * this_cpu = &default_cpu;
+
+/*
+ * "cachesize=" boot option: parses the integer argument into
+ * cachesize_override.  Always returns 1.
+ */
+static int __init cachesize_setup(char *str)
+{
+       char *cursor = str;
+
+       get_option(&cursor, &cachesize_override);
+
+       return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
+/*
+ * Read the processor name string from CPUID leaves 0x80000002..4 into
+ * c->x86_model_id and strip any leading blanks.
+ *
+ * Returns 0 when those leaves are not available (x86_model_id is left
+ * untouched), 1 otherwise.
+ */
+int __init get_model_name(struct cpuinfo_x86 *c)
+{
+       unsigned int *words;
+       char *src, *dst;
+       char *const last = &c->x86_model_id[48];
+
+       if (cpuid_eax(0x80000000) < 0x80000004)
+               return 0;
+
+       /* Three leaves of four registers each: 48 bytes of name */
+       words = (unsigned int *) c->x86_model_id;
+       cpuid(0x80000002, &words[0], &words[1], &words[2], &words[3]);
+       cpuid(0x80000003, &words[4], &words[5], &words[6], &words[7]);
+       cpuid(0x80000004, &words[8], &words[9], &words[10], &words[11]);
+       *last = 0;
+
+       /*
+        * Intel chips right-justify the string; shift it to the front of
+        * the buffer when it starts with blanks.
+        */
+       src = dst = &c->x86_model_id[0];
+       for (; *src == ' '; src++)
+               ;
+       if (src == dst)
+               return 1;
+
+       for (; *src; src++, dst++)
+               *dst = *src;
+       for (; dst <= last; dst++)
+               *dst = '\0';    /* Zero-pad the rest */
+
+       return 1;
+}
+
+
+/*
+ * Print the cache sizes reported by the extended CPUID leaves and record
+ * a size in c->x86_cache_size.
+ *
+ * Leaf 0x80000005 gives the L1 sizes (their sum is stored first); leaf
+ * 0x80000006 gives the L2 size, which replaces the stored value when it
+ * is non-zero.  The L2 size can be adjusted by the vendor's c_size_cache
+ * hook and overridden with the "cachesize=" boot option.
+ */
+void __init display_cacheinfo(struct cpuinfo_x86 *c)
+{
+       unsigned int n, dummy, ecx, edx, l2size;
+
+       n = cpuid_eax(0x80000000);
+
+       if (n >= 0x80000005) {
+               cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
+               printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
+                       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+               c->x86_cache_size=(ecx>>24)+(edx>>24);
+       }
+
+       if (n < 0x80000006)     /* Some chips just have a large L1. */
+               return;
+
+       ecx = cpuid_ecx(0x80000006);
+       l2size = ecx >> 16;
+
+       /* do processor-specific cache resizing */
+       if (this_cpu->c_size_cache)
+               l2size = this_cpu->c_size_cache(c,l2size);
+
+       /* Allow user to override all this if necessary. */
+       if (cachesize_override != -1)
+               l2size = cachesize_override;
+
+       if ( l2size == 0 )
+               return;         /* Again, no L2 cache is possible */
+
+       c->x86_cache_size = l2size;
+
+       printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
+              l2size, ecx & 0xFF);
+}
+
+/* Naming convention should be: <Name> [(<Codename>)] */
+/* This table is only used if init_<vendor>() below doesn't set the name; */
+/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
+
+/* Look up CPU names by table lookup. */
+/*
+ * Look up the model name of @c in the current vendor's c_models table.
+ *
+ * Returns NULL when the model number is 16 or above, when no vendor is
+ * selected, or when the table has no entry for the CPU's family.  The
+ * table is terminated by an entry whose family is 0.
+ */
+static char __init *table_lookup_model(struct cpuinfo_x86 *c)
+{
+       struct cpu_model_info *entry;
+
+       /* Range check on the model, and a vendor must be selected */
+       if (c->x86_model >= 16 || !this_cpu)
+               return NULL;
+
+       for (entry = this_cpu->c_models; entry && entry->family; entry++)
+               if (entry->family == c->x86)
+                       return entry->model_names[c->x86_model];
+
+       return NULL;            /* Not found */
+}
+
+
+/*
+ * Match c->x86_vendor_id against the identification strings of every
+ * registered cpu_dev and store the index of the first match in
+ * c->x86_vendor.
+ *
+ * @early: when zero, this_cpu is switched to the matching cpu_dev too.
+ *
+ * Nothing is modified when no registered vendor matches.
+ */
+void __init get_cpu_vendor(struct cpuinfo_x86 *c, int early)
+{
+       char *id = c->x86_vendor_id;
+       int idx;
+
+       for (idx = 0; idx < X86_VENDOR_NUM; idx++) {
+               struct cpu_dev *dev = cpu_devs[idx];
+
+               /* Slot not populated */
+               if (!dev)
+                       continue;
+
+               /* Neither the primary nor the optional second ident fits */
+               if (strcmp(id, dev->c_ident[0]) &&
+                   !(dev->c_ident[1] && !strcmp(id, dev->c_ident[1])))
+                       continue;
+
+               c->x86_vendor = idx;
+               if (!early)
+                       this_cpu = dev;
+               break;
+       }
+}
+
+
+/* "nofxsr" boot option: sets disable_x86_fxsr.  Always returns 1. */
+static int __init x86_fxsr_setup(char * s)
+{
+       disable_x86_fxsr = 1;
+
+       return 1;
+}
+__setup("nofxsr", x86_fxsr_setup);
+
+
+/* Standard macro to see if a specific flag is changeable */
+/*
+ * Returns non-zero when the EFLAGS bit(s) in @flag can be toggled.
+ *
+ * The asm saves EFLAGS on the stack, copies the current value into both
+ * f1 and f2, writes back f1 with @flag inverted, reads EFLAGS again into
+ * f1, and finally restores the saved EFLAGS.  The bit is changeable when
+ * the value read back differs from the original (f2) in @flag.
+ */
+static inline int flag_is_changeable_p(u32 flag)
+{
+       u32 f1, f2;
+
+       asm("pushfl\n\t"                /* save original EFLAGS */
+           "pushfl\n\t"
+           "popl %0\n\t"               /* f1 = EFLAGS */
+           "movl %0,%1\n\t"            /* f2 = original value */
+           "xorl %2,%0\n\t"            /* flip the tested bit(s) in f1 */
+           "pushl %0\n\t"
+           "popfl\n\t"                 /* try to load the modified value */
+           "pushfl\n\t"
+           "popl %0\n\t"               /* f1 = what EFLAGS holds now */
+           "popfl\n\t"                 /* restore original EFLAGS */
+           : "=&r" (f1), "=&r" (f2)
+           : "ir" (flag));
+
+       return ((f1^f2) & flag) != 0;
+}
+
+
+/* Probe for the CPUID instruction */
+/*
+ * CPUID is considered available when the ID bit of EFLAGS can be
+ * toggled.
+ */
+static int __init have_cpuid_p(void)
+{
+       int id_toggles = flag_is_changeable_p(X86_EFLAGS_ID);
+
+       return id_toggles;
+}
+
+/* Do minimum CPU detection early.
+   Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
+   The others are not touched to avoid unwanted side effects. */
+static void __init early_cpu_detect(void)
+{
+       struct cpuinfo_x86 *c = &boot_cpu_data;
+
+       /* Default, used when CPUID gives no better value below */
+       c->x86_cache_alignment = 32;
+
+       if (!have_cpuid_p())
+               return;
+
+       /* Get vendor name */
+       /* The three id registers are stored at offsets 0, 8 and 4 */
+       cpuid(0x00000000, &c->cpuid_level,
+             (int *)&c->x86_vendor_id[0],
+             (int *)&c->x86_vendor_id[8],
+             (int *)&c->x86_vendor_id[4]);
+
+       /* early == 1: record the vendor but leave this_cpu alone */
+       get_cpu_vendor(c, 1);
+
+       /* Family defaults to 4 when leaf 1 is not available */
+       c->x86 = 4;
+       if (c->cpuid_level >= 0x00000001) {
+               u32 junk, tfms, cap0, misc;
+               cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
+               /* tfms: family in bits 11:8, model in 7:4, stepping in 3:0 */
+               c->x86 = (tfms >> 8) & 15;
+               c->x86_model = (tfms >> 4) & 15;
+               if (c->x86 == 0xf) {
+                       /* Family 0xf: add the extended family/model fields */
+                       c->x86 += (tfms >> 20) & 0xff;
+                       c->x86_model += ((tfms >> 16) & 0xF) << 4;
+               }
+               c->x86_mask = tfms & 15;
+               /*
+                * Bit 19 of cap0 gates the use of misc bits 15:8, which are
+                * scaled by 8 to give the alignment (presumably the CLFLUSH
+                * feature bit and line size - confirm against the CPUID spec).
+                */
+               if (cap0 & (1<<19))
+                       c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
+       }
+
+       early_intel_workaround(c);
+
+#ifdef CONFIG_X86_HT
+       /* Bits 31:24 of CPUID(1).EBX are taken as the physical processor id */
+       phys_proc_id[smp_processor_id()] = (cpuid_ebx(1) >> 24) & 0xff;
+#endif
+}
+
+/*
+ * Vendor-independent identification via CPUID: fills in the vendor id,
+ * cpuid_level, family/model/stepping, capability words 0, 4, 1 and 6, and
+ * (when available) the model name string.  Does nothing when the CPU has
+ * no CPUID instruction.
+ */
+void __init generic_identify(struct cpuinfo_x86 * c)
+{
+       u32 tfms, xlvl;
+       int junk;
+
+       if (have_cpuid_p()) {
+               /* Get vendor name */
+               cpuid(0x00000000, &c->cpuid_level,
+                     (int *)&c->x86_vendor_id[0],
+                     (int *)&c->x86_vendor_id[8],
+                     (int *)&c->x86_vendor_id[4]);
+               
+               /* early == 0: this also selects this_cpu on a match */
+               get_cpu_vendor(c, 0);
+               /* Initialize the standard set of capabilities */
+               /* Note that the vendor-specific code below might override */
+       
+               /* Intel-defined flags: level 0x00000001 */
+               if ( c->cpuid_level >= 0x00000001 ) {
+                       u32 capability, excap;
+                       cpuid(0x00000001, &tfms, &junk, &excap, &capability);
+                       c->x86_capability[0] = capability;
+                       c->x86_capability[4] = excap;
+                       /* family in bits 11:8, model in 7:4, stepping in 3:0 */
+                       c->x86 = (tfms >> 8) & 15;
+                       c->x86_model = (tfms >> 4) & 15;
+                       if (c->x86 == 0xf) {
+                               /* add the extended family/model fields */
+                               c->x86 += (tfms >> 20) & 0xff;
+                               c->x86_model += ((tfms >> 16) & 0xF) << 4;
+                       } 
+                       c->x86_mask = tfms & 15;
+               } else {
+                       /* Have CPUID level 0 only - unheard of */
+                       c->x86 = 4;
+               }
+
+               /* AMD-defined flags: level 0x80000001 */
+               xlvl = cpuid_eax(0x80000000);
+               /* Extended leaves are trusted only if the value looks sane */
+               if ( (xlvl & 0xffff0000) == 0x80000000 ) {
+                       if ( xlvl >= 0x80000001 ) {
+                               c->x86_capability[1] = cpuid_edx(0x80000001);
+                               c->x86_capability[6] = cpuid_ecx(0x80000001);
+                       }
+                       if ( xlvl >= 0x80000004 )
+                               get_model_name(c); /* Default name */
+               }
+       }
+}
+
+/*
+ * Turn off the processor serial number when the CPU advertises it
+ * (X86_FEATURE_PN) and the "serialnumber" boot option has not been given.
+ * This sets bit 21 of MSR_IA32_BBL_CR_CTL, clears the feature flag and
+ * re-reads cpuid_level.
+ */
+static void __init squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+       unsigned long lo, hi;
+
+       if (!cpu_has(c, X86_FEATURE_PN) || !disable_x86_serial_nr)
+               return;
+
+       /* Disable processor serial number */
+       rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+       lo |= 0x200000;
+       wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+       printk(KERN_NOTICE "CPU serial number disabled.\n");
+       clear_bit(X86_FEATURE_PN, c->x86_capability);
+
+       /* Disabling the serial number may affect the cpuid level */
+       c->cpuid_level = cpuid_eax(0);
+}
+
+/*
+ * "serialnumber" boot option: clears disable_x86_serial_nr so the
+ * processor serial number is left enabled.  Always returns 1.
+ */
+static int __init x86_serial_nr_setup(char *s)
+{
+       disable_x86_serial_nr = 0;
+
+       return 1;
+}
+__setup("serialnumber", x86_serial_nr_setup);
+
+
+
+/*
+ * This does the hard work of actually picking apart the CPU stuff...
+ */
+/*
+ * Fill in *c for the current CPU.
+ *
+ * Order of operations: reset the fields to "unknown", run the generic
+ * CPUID-based identification, then the vendor's c_identify and c_init
+ * hooks, then apply the boot-option driven feature masks (TSC, FXSR/XMM,
+ * PSE), pick a model name if none was set, and let the sub-architecture
+ * adjust the capabilities.  For every CPU other than the boot CPU the
+ * resulting capability words are ANDed into boot_cpu_data.
+ */
+void __init identify_cpu(struct cpuinfo_x86 *c)
+{
+       int i;
+
+       c->loops_per_jiffy = loops_per_jiffy;
+       c->x86_cache_size = -1;
+       c->x86_vendor = X86_VENDOR_UNKNOWN;
+       c->cpuid_level = -1;    /* CPUID not detected */
+       c->x86_model = c->x86_mask = 0; /* So far unknown... */
+       c->x86_vendor_id[0] = '\0'; /* Unset */
+       c->x86_model_id[0] = '\0';  /* Unset */
+       c->x86_num_cores = 1;
+       memset(&c->x86_capability, 0, sizeof c->x86_capability);
+
+       if (!have_cpuid_p()) {
+               /* First of all, decide if this is a 486 or higher */
+               /* It's a 486 if we can modify the AC flag */
+               if ( flag_is_changeable_p(X86_EFLAGS_AC) )
+                       c->x86 = 4;
+               else
+                       c->x86 = 3;
+       }
+
+       /* No-op without CPUID; otherwise fills vendor, family and caps */
+       generic_identify(c);
+
+       printk(KERN_DEBUG "CPU: After generic identify, caps:");
+       for (i = 0; i < NCAPINTS; i++)
+               printk(" %08lx", c->x86_capability[i]);
+       printk("\n");
+
+       /* Optional vendor-specific identification hook */
+       if (this_cpu->c_identify) {
+               this_cpu->c_identify(c);
+
+               printk(KERN_DEBUG "CPU: After vendor identify, caps:");
+               for (i = 0; i < NCAPINTS; i++)
+                       printk(" %08lx", c->x86_capability[i]);
+               printk("\n");
+       }
+
+       /*
+        * Vendor-specific initialization.  In this section we
+        * canonicalize the feature flags, meaning if there are
+        * features a certain CPU supports which CPUID doesn't
+        * tell us, CPUID claiming incorrect flags, or other bugs,
+        * we handle them here.
+        *
+        * At the end of this section, c->x86_capability better
+        * indicate the features this CPU genuinely supports!
+        */
+       if (this_cpu->c_init)
+               this_cpu->c_init(c);
+
+       /* Disable the PN if appropriate */
+       squash_the_stupid_serial_number(c);
+
+       /*
+        * The vendor-specific functions might have changed features.  Now
+        * we do "generic changes."
+        */
+
+       /* TSC disabled? */
+       if ( tsc_disable )
+               clear_bit(X86_FEATURE_TSC, c->x86_capability);
+
+       /* FXSR disabled? */
+       if (disable_x86_fxsr) {
+               clear_bit(X86_FEATURE_FXSR, c->x86_capability);
+               clear_bit(X86_FEATURE_XMM, c->x86_capability);
+       }
+
+       if (disable_pse)
+               clear_bit(X86_FEATURE_PSE, c->x86_capability);
+
+       /* If the model name is still unset, do table lookup. */
+       if ( !c->x86_model_id[0] ) {
+               char *p;
+               p = table_lookup_model(c);
+               if ( p )
+                       strcpy(c->x86_model_id, p);
+               else
+                       /* Last resort... */
+                       sprintf(c->x86_model_id, "%02x/%02x",
+                               c->x86_vendor, c->x86_model);
+       }
+
+       /* Sub-architecture hook, defined outside this file */
+       machine_specific_modify_cpu_capabilities(c);
+
+       /* Now the feature flags better reflect actual CPU features! */
+
+       printk(KERN_DEBUG "CPU: After all inits, caps:");
+       for (i = 0; i < NCAPINTS; i++)
+               printk(" %08lx", c->x86_capability[i]);
+       printk("\n");
+
+       /*
+        * On SMP, boot_cpu_data holds the common feature set between
+        * all CPUs; so make sure that we indicate which features are
+        * common between the CPUs.  The first time this routine gets
+        * executed, c == &boot_cpu_data.
+        */
+       if ( c != &boot_cpu_data ) {
+               /* AND the already accumulated flags with these */
+               for ( i = 0 ; i < NCAPINTS ; i++ )
+                       boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
+       }
+
+       /* Init Machine Check Exception if available. */
+#ifdef CONFIG_X86_MCE
+       mcheck_init(c);
+#endif
+}
+
+#ifdef CONFIG_X86_HT
+/*
+ * Detect Hyper-Threading and derive the package and core ids of the
+ * current CPU.
+ *
+ * Does nothing unless the CPU reports X86_FEATURE_HT without
+ * X86_FEATURE_CMP_LEGACY.  The sibling count comes from CPUID(1).EBX bits
+ * 23:16; when it is above 1, phys_proc_id[] and cpu_core_id[] for this
+ * CPU are computed by passing EBX bits 31:24 to phys_pkg_id() together
+ * with the number of low bits that make up the sibling (resp.
+ * thread-per-core) field.  smp_num_siblings ends up as siblings per core.
+ */
+void __init detect_ht(struct cpuinfo_x86 *c)
+{
+       u32     eax, ebx, ecx, edx;
+       int     index_msb, tmp;
+       int     cpu = smp_processor_id();
+
+       if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
+               return;
+
+       cpuid(1, &eax, &ebx, &ecx, &edx);
+       smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+       if (smp_num_siblings == 1) {
+               printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
+       } else if (smp_num_siblings > 1 ) {
+               index_msb = 31;
+
+               if (smp_num_siblings > NR_CPUS) {
+                       printk(KERN_WARNING "CPU: Unsupported number of the siblings %d\n", smp_num_siblings);
+                       smp_num_siblings = 1;
+                       return;
+               }
+
+               /*
+                * index_msb = position of the highest set bit, rounded up
+                * by one when the count is not a power of two, i.e.
+                * ceil(log2(smp_num_siblings)).
+                */
+               tmp = smp_num_siblings;
+               while ((tmp & 0x80000000 ) == 0) {
+                       tmp <<=1 ;
+                       index_msb--;
+               }
+               if (smp_num_siblings & (smp_num_siblings - 1))
+                       index_msb++;
+               phys_proc_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+
+               printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+                      phys_proc_id[cpu]);
+
+               /* From here on: siblings per core rather than per package */
+               smp_num_siblings = smp_num_siblings / c->x86_num_cores;
+
+               /*
+                * NOTE(review): if x86_num_cores exceeds the sibling count
+                * the division yields 0 and the loop below never ends;
+                * confirm callers guarantee num_cores <= siblings.
+                */
+               tmp = smp_num_siblings;
+               index_msb = 31;
+               while ((tmp & 0x80000000) == 0) {
+                       tmp <<=1 ;
+                       index_msb--;
+               }
+
+               if (smp_num_siblings & (smp_num_siblings - 1))
+                       index_msb++;
+
+               cpu_core_id[cpu] = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+
+               if (c->x86_num_cores > 1)
+                       printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+                              cpu_core_id[cpu]);
+       }
+}
+#endif
+
+/*
+ * Print a one-line description of the CPU: an optional vendor prefix, the
+ * model string (or "<family>86" when the model string is empty) and, when
+ * available, the stepping.
+ */
+void __init print_cpu_info(struct cpuinfo_x86 *c)
+{
+       char *vendor;
+
+       if (c->x86_vendor < X86_VENDOR_NUM)
+               vendor = this_cpu->c_vendor;
+       else
+               vendor = (c->cpuid_level >= 0) ? c->x86_vendor_id : NULL;
+
+       /* Skip the prefix when the model string already starts with it. */
+       if (vendor != NULL &&
+           strncmp(c->x86_model_id, vendor, strlen(vendor)) != 0)
+               printk("%s ", vendor);
+
+       if (c->x86_model_id[0] != '\0')
+               printk("%s", c->x86_model_id);
+       else
+               printk("%d86", c->x86);
+
+       if (c->x86_mask == 0 && c->cpuid_level < 0)
+               printk("\n");
+       else
+               printk(" stepping %02x\n", c->x86_mask);
+}
+
+cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
+
+/* This is hacky. :)
+ * We're emulating future behavior.
+ * In the future, the cpu-specific init functions will be called implicitly
+ * via the magic of initcalls.
+ * They will insert themselves into the cpu_devs structure.
+ * Then, when cpu_init() is called, we can just iterate over that array.
+ */
+
+extern int intel_cpu_init(void);
+extern int cyrix_init_cpu(void);
+extern int nsc_init_cpu(void);
+extern int amd_init_cpu(void);
+extern int centaur_init_cpu(void);
+extern int transmeta_init_cpu(void);
+extern int rise_init_cpu(void);
+extern int nexgen_init_cpu(void);
+extern int umc_init_cpu(void);
+
+/*
+ * Call every vendor-specific CPU init hook in a fixed order, then run
+ * early_cpu_detect().  With CONFIG_DEBUG_PAGEALLOC the PSE feature bit of
+ * the boot CPU is cleared afterwards and disable_pse is set.
+ */
+void __init early_cpu_init(void)
+{
+       intel_cpu_init();
+       cyrix_init_cpu();
+       nsc_init_cpu();
+       amd_init_cpu();
+       centaur_init_cpu();
+       transmeta_init_cpu();
+       rise_init_cpu();
+       nexgen_init_cpu();
+       umc_init_cpu();
+       /* Detection runs only after all vendor hooks above were called. */
+       early_cpu_detect();
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+       /* pse is not compatible with on-the-fly unmapping,
+        * disable it even if the cpus claim to support it.
+        */
+       clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+       disable_pse = 1;
+#endif
+}
+
+/*
+ * cpu_gdt_init - register a GDT with the hypervisor.
+ * @gdt_descr: descriptor giving the virtual address and size of the GDT.
+ *
+ * Walks the table one page at a time, recording the machine frame number
+ * of each page and making the page read-only, then passes the frame list
+ * and the entry count (size / 8) to HYPERVISOR_set_gdt().  A failing
+ * hypercall is fatal (BUG()).
+ */
+void __init cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
+{
+       /*
+        * NOTE(review): nothing below checks f against the 16 slots here;
+        * this assumes the GDT never spans more than 16 pages - confirm.
+        */
+       unsigned long frames[16];
+       unsigned long va;
+       int f;
+
+       for (va = gdt_descr->address, f = 0;
+            va < gdt_descr->address + gdt_descr->size;
+            va += PAGE_SIZE, f++) {
+               frames[f] = virt_to_machine(va) >> PAGE_SHIFT;
+               make_page_readonly((void *)va);
+       }
+       if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
+               BUG();
+       lgdt_finish();
+}
+
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ */
+void __init cpu_init (void)
+{
+       int cpu = smp_processor_id();
+       struct tss_struct * t = &per_cpu(init_tss, cpu);
+       struct thread_struct *thread = &current->thread;
+
+       /*
+        * Each CPU may pass through here only once; a second attempt is
+        * reported and the CPU then spins forever with interrupts enabled.
+        */
+       if (cpu_test_and_set(cpu, cpu_initialized)) {
+               printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
+               for (;;) local_irq_enable();
+       }
+       printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+       if (cpu_has_vme || cpu_has_de)
+               clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+       if (tsc_disable && cpu_has_tsc) {
+               printk(KERN_NOTICE "Disabling TSC...\n");
+               /**** FIX-HPA: DOES THIS REALLY BELONG HERE? ****/
+               clear_bit(X86_FEATURE_TSC, boot_cpu_data.x86_capability);
+               set_in_cr4(X86_CR4_TSD);
+       }
+
+       /*
+        * Set up the per-thread TLS descriptor cache:
+        */
+       memcpy(thread->tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN],
+              GDT_ENTRY_TLS_ENTRIES * 8);
+
+       /* Hand this CPU's GDT to the hypervisor (see cpu_gdt_init()). */
+       cpu_gdt_init(&cpu_gdt_descr[cpu]);
+
+       /*
+        * Delete NT
+        */
+       __asm__("pushfl ; andl $0xffffbfff,(%esp) ; popfl");
+
+       /*
+        * Set up and load the per-CPU TSS and LDT
+        */
+       atomic_inc(&init_mm.mm_count);
+       current->active_mm = &init_mm;
+       /* The task running cpu_init() must not own a user address space. */
+       if (current->mm)
+               BUG();
+       enter_lazy_tlb(&init_mm, current);
+
+       load_esp0(t, thread);
+
+       load_LDT(&init_mm.context);
+
+       /* Clear %fs and %gs. */
+       asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
+
+       /* Clear all 6 debug registers: */
+
+       /* Each register is zeroed through the set_debugreg hypercall. */
+#define CD(register) HYPERVISOR_set_debugreg(register, 0)
+
+       CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
+
+#undef CD
+
+       /*
+        * Force FPU initialization:
+        */
+       current_thread_info()->status = 0;
+       clear_used_math();
+       mxcsr_feature_mask_init();
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/Makefile
@@ -0,0 +1,31 @@
+#
+# Makefile for x86-compatible CPU details and quirks
+#
+
+CFLAGS += -Iarch/i386/kernel/cpu
+
+obj-y  :=      common.o
+c-obj-y        +=      proc.o
+
+c-obj-y        +=      amd.o
+c-obj-y        +=      cyrix.o
+c-obj-y        +=      centaur.o
+c-obj-y        +=      transmeta.o
+c-obj-y        +=      intel.o intel_cacheinfo.o
+c-obj-y        +=      rise.o
+c-obj-y        +=      nexgen.o
+c-obj-y        +=      umc.o
+
+#obj-$(CONFIG_X86_MCE) +=      ../../../../i386/kernel/cpu/mcheck/
+
+obj-$(CONFIG_MTRR)     +=      mtrr/
+#obj-$(CONFIG_CPU_FREQ)        +=      ../../../../i386/kernel/cpu/cpufreq/
+
+# Extra sources to symlink without building an object from them (none).
+c-link :=
+
+# Every c-obj-y object is compiled from a symlink that points at the
+# same-named source file in arch/i386/kernel/cpu.
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+       @ln -fsn $(srctree)/arch/i386/kernel/cpu/$(notdir $@) $@
+
+# Build the symlinked sources alongside the objects listed in obj-y.
+obj-y  += $(c-obj-y)
+
+# Remove the generated symlinks on clean.
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/mtrr/main.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/mtrr/main.c
@@ -0,0 +1,165 @@
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
+
+#include <asm/mtrr.h>
+#include "mtrr.h"
+
+/*
+ * generic_get_mtrr - read one memory-type range through the hypervisor.
+ * @reg:  index of the range register to query
+ * @base: out: first page frame of the range
+ * @size: out: length of the range in page frames
+ * @type: out: memory type of the range
+ *
+ * Issues a DOM0_READ_MEMTYPE request.  If the hypercall fails the outputs
+ * are set to zero instead of being copied from the uninitialised request
+ * structure on the stack.
+ */
+void generic_get_mtrr(unsigned int reg, unsigned long *base,
+                     unsigned int *size, mtrr_type * type)
+{
+       dom0_op_t op;
+
+       op.cmd = DOM0_READ_MEMTYPE;
+       op.u.read_memtype.reg = reg;
+       if (HYPERVISOR_dom0_op(&op) != 0) {
+               /* Nothing valid was returned: report an empty range. */
+               *size = 0;
+               *base = 0;
+               *type = 0;
+               return;
+       }
+
+       *size = op.u.read_memtype.nr_pfns;
+       *base = op.u.read_memtype.pfn;
+       *type = op.u.read_memtype.type;
+}
+
+struct mtrr_ops generic_mtrr_ops = {
+       .use_intel_if      = 1,
+       .get               = generic_get_mtrr,
+};
+
+struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
+unsigned int num_var_ranges;
+unsigned int *usage_table;
+
+/*
+ * Count the range registers the hypervisor exposes: probe indices 0, 1, 2,
+ * ... with DOM0_READ_MEMTYPE and stop at the first index that is rejected.
+ * The count is left in num_var_ranges.
+ */
+static void __init set_num_var_ranges(void)
+{
+       dom0_op_t probe;
+
+       num_var_ranges = 0;
+       for (;;) {
+               probe.cmd = DOM0_READ_MEMTYPE;
+               probe.u.read_memtype.reg = num_var_ranges;
+               if (HYPERVISOR_dom0_op(&probe) != 0)
+                       return;
+               num_var_ranges++;
+       }
+}
+
+/*
+ * Allocate usage_table with one counter per range register and zero every
+ * counter.  On allocation failure an error is logged and usage_table is
+ * left NULL.
+ */
+static void __init init_table(void)
+{
+       int idx = 0;
+       int count = num_var_ranges;
+
+       usage_table = kmalloc(count * sizeof *usage_table, GFP_KERNEL);
+       if (usage_table == NULL) {
+               printk(KERN_ERR "mtrr: could not allocate\n");
+               return;
+       }
+
+       while (idx < count)
+               usage_table[idx++] = 0;
+}
+
+/*
+ * mtrr_add_page - ask the hypervisor to add a memory-type range.
+ * @base:      first page frame of the range
+ * @size:      length of the range in page frames
+ * @type:      memory type to apply
+ * @increment: when non-zero, bump the usage count of the register used
+ *
+ * Returns the register index reported by the hypervisor, or the non-zero
+ * result of the hypercall if it failed.
+ */
+int mtrr_add_page(unsigned long base, unsigned long size,
+                 unsigned int type, char increment)
+{
+       dom0_op_t op;
+       int rc;
+
+       op.cmd = DOM0_ADD_MEMTYPE;
+       op.u.add_memtype.pfn     = base;
+       op.u.add_memtype.nr_pfns = size;
+       op.u.add_memtype.type    = type;
+
+       rc = HYPERVISOR_dom0_op(&op);
+       if (rc != 0)
+               return rc;
+
+       if (increment != 0)
+               usage_table[op.u.add_memtype.reg]++;
+
+       return op.u.add_memtype.reg;
+}
+
+/*
+ * mtrr_add - byte-granular front end to mtrr_add_page().
+ * @base:      start of the range in bytes; must be page aligned
+ * @size:      length of the range in bytes; must be a multiple of PAGE_SIZE
+ * @type:      memory type to apply
+ * @increment: passed through to mtrr_add_page()
+ *
+ * Returns -EINVAL for unaligned arguments, otherwise whatever
+ * mtrr_add_page() returns for the range converted to page frames.
+ */
+int
+mtrr_add(unsigned long base, unsigned long size, unsigned int type,
+        char increment)
+{
+       if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
+               printk(KERN_WARNING "mtrr: size and base must be multiples of 4 kiB\n");
+               printk(KERN_DEBUG "mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
+               return -EINVAL;
+       }
+       return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
+                            increment);
+}
+
+/*
+ * mtrr_del_page - drop one reference to a memory-type range register.
+ * @reg:  register index, or a negative value to look the register up by
+ *        @base and @size
+ * @base: first page frame of the range (only used when @reg < 0)
+ * @size: length of the range in page frames (only used when @reg < 0)
+ *
+ * Decrements the usage count of the register and, once the count reaches
+ * zero, asks the hypervisor to delete it with DOM0_DEL_MEMTYPE.
+ *
+ * Returns the register index on success, or -EINVAL when no matching
+ * register is found, @reg is out of range, or its count is already zero.
+ */
+int mtrr_del_page(int reg, unsigned long base, unsigned long size)
+{
+       int i, max;
+       mtrr_type ltype;
+       unsigned long lbase;
+       unsigned int lsize;
+       int error = -EINVAL;
+       dom0_op_t op;
+
+       max = num_var_ranges;
+       if (reg < 0) {
+               /*  Search for existing MTRR  */
+               for (i = 0; i < max; ++i) {
+                       mtrr_if->get(i, &lbase, &lsize, &ltype);
+                       if (lbase == base && lsize == size) {
+                               reg = i;
+                               break;
+                       }
+               }
+               if (reg < 0) {
+                       printk(KERN_DEBUG "mtrr: no MTRR for %lx000,%lx000 found\n", base,
+                              size);
+                       goto out;
+               }
+       }
+       /* usage_table[] has num_var_ranges entries; reject anything beyond. */
+       if (reg >= max) {
+               printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
+               goto out;
+       }
+       if (usage_table[reg] < 1) {
+               printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
+               goto out;
+       }
+       if (--usage_table[reg] < 1) {
+               /* Last user gone: fill in the del_memtype request fields. */
+               op.cmd = DOM0_DEL_MEMTYPE;
+               op.u.del_memtype.handle = 0;
+               op.u.del_memtype.reg    = reg;
+               (void)HYPERVISOR_dom0_op(&op);
+       }
+       error = reg;
+ out:
+       return error;
+}
+
+/*
+ * mtrr_del - byte-granular front end to mtrr_del_page().
+ * @reg:  register index, or negative to search by @base and @size
+ * @base: start of the range in bytes; must be page aligned
+ * @size: length of the range in bytes; must be a multiple of PAGE_SIZE
+ *
+ * Returns -EINVAL for unaligned arguments, otherwise whatever
+ * mtrr_del_page() returns for the range converted to page frames.
+ */
+int
+mtrr_del(int reg, unsigned long base, unsigned long size)
+{
+       if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
+               printk(KERN_INFO "mtrr: size and base must be multiples of 4 kiB\n");
+               printk(KERN_DEBUG "mtrr: size: 0x%lx  base: 0x%lx\n", size, base);
+               return -EINVAL;
+       }
+       return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
+}
+
+EXPORT_SYMBOL(mtrr_add);
+EXPORT_SYMBOL(mtrr_del);
+
+/*
+ * Initcall: set up MTRR bookkeeping.  Returns -ENODEV unless the domain
+ * was started with SIF_PRIVILEGED and the boot CPU advertises at least one
+ * of the MTRR-style features; otherwise counts the available registers,
+ * allocates the usage table and returns 0.
+ */
+static int __init mtrr_init(void)
+{
+       struct cpuinfo_x86 *cpu = &boot_cpu_data;
+       int have_mtrr;
+
+       if ((xen_start_info.flags & SIF_PRIVILEGED) == 0)
+               return -ENODEV;
+
+       have_mtrr = cpu_has(cpu, X86_FEATURE_MTRR) ||
+                   cpu_has(cpu, X86_FEATURE_K6_MTRR) ||
+                   cpu_has(cpu, X86_FEATURE_CYRIX_ARR) ||
+                   cpu_has(cpu, X86_FEATURE_CENTAUR_MCR);
+       if (!have_mtrr)
+               return -ENODEV;
+
+       set_num_var_ranges();
+       init_table();
+
+       return 0;
+}
+
+subsys_initcall(mtrr_init);
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/mtrr/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/cpu/mtrr/Makefile
@@ -0,0 +1,16 @@
+# main.o is built from the source in this directory; if.o is built from a
+# symlink to the same-named file in arch/i386/kernel/cpu/mtrr.
+obj-y  := main.o
+c-obj-y        := if.o
+
+c-link :=
+
+# Create the source symlinks; each one also depends on the mtrr.h symlink.
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): $(obj)/mtrr.h
+       @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/$(notdir $@) $@
+
+# The local sources depend on the mtrr.h symlink as well.
+$(patsubst %.o,$(obj)/%.c,$(obj-y)): $(obj)/mtrr.h
+
+# mtrr.h is a symlink to the header in arch/i386/kernel/cpu/mtrr.
+$(obj)/mtrr.h:
+       @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/mtrr.h $@
+
+obj-y  += $(c-obj-y)
+
+# Remove the generated source symlinks on clean.
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/entry.S
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/entry.S
@@ -0,0 +1,753 @@
+/*
+ *  linux/arch/i386/entry.S
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ */
+
+/*
+ * entry.S contains the system-call and fault low-level handling routines.
+ * This also contains the timer-interrupt handler, as well as all interrupts
+ * and faults that can result in a task-switch.
+ *
+ * NOTE: This code handles signal-recognition, which happens every time
+ * after a timer-interrupt and after each system call.
+ *
+ * I changed all the .align's to 4 (16 byte alignment), as that's faster
+ * on a 486.
+ *
+ * Stack layout in 'ret_from_system_call':
+ *     ptrace needs to have all regs on the stack.
+ *     if the order here is changed, it needs to be
+ *     updated in fork.c:copy_process, signal.c:do_signal,
+ *     ptrace.c and ptrace.h
+ *
+ *      0(%esp) - %ebx
+ *      4(%esp) - %ecx
+ *      8(%esp) - %edx
+ *       C(%esp) - %esi
+ *     10(%esp) - %edi
+ *     14(%esp) - %ebp
+ *     18(%esp) - %eax
+ *     1C(%esp) - %ds
+ *     20(%esp) - %es
+ *     24(%esp) - orig_eax
+ *     28(%esp) - %eip
+ *     2C(%esp) - %cs
+ *     30(%esp) - %eflags
+ *     34(%esp) - %oldesp
+ *     38(%esp) - %oldss
+ *
+ * "current" is in register %ebx during any slow entries.
+ */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+#include <asm/thread_info.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
+#include <asm/smp.h>
+#include <asm/page.h>
+#include "irq_vectors.h"
+#include <xen-public/xen.h>
+
+#define nr_syscalls ((syscall_table_size)/4)
+
+EBX            = 0x00
+ECX            = 0x04
+EDX            = 0x08
+ESI            = 0x0C
+EDI            = 0x10
+EBP            = 0x14
+EAX            = 0x18
+DS             = 0x1C
+ES             = 0x20
+ORIG_EAX       = 0x24
+EIP            = 0x28
+CS             = 0x2C
+EVENT_MASK     = 0x2E
+EFLAGS         = 0x30
+OLDESP         = 0x34
+OLDSS          = 0x38
+
+CF_MASK                = 0x00000001
+TF_MASK                = 0x00000100
+IF_MASK                = 0x00000200
+DF_MASK                = 0x00000400 
+NT_MASK                = 0x00004000
+VM_MASK                = 0x00020000
+
+/* Offsets into shared_info_t. */
+#define evtchn_upcall_pending          /* 0 */
+#define evtchn_upcall_mask             1
+
+#define sizeof_vcpu_shift              3
+
+#ifdef CONFIG_SMP
+#define preempt_disable(reg)   incl TI_preempt_count(reg)
+#define preempt_enable(reg)    decl TI_preempt_count(reg)
+#define XEN_GET_VCPU_INFO(reg) preempt_disable(%ebp)                   ; \
+                               movl TI_cpu(%ebp),reg                   ; \
+                               shl  $sizeof_vcpu_shift,reg             ; \
+                               addl HYPERVISOR_shared_info,reg
+#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%ebp)
+#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
+#else
+#define XEN_GET_VCPU_INFO(reg) movl HYPERVISOR_shared_info,reg
+#define XEN_PUT_VCPU_INFO(reg)
+#define XEN_PUT_VCPU_INFO_fixup
+#endif
+
+#define XEN_LOCKED_BLOCK_EVENTS(reg)   movb $1,evtchn_upcall_mask(reg)
+#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
+#define XEN_BLOCK_EVENTS(reg)  XEN_GET_VCPU_INFO(reg)                  ; \
+                               XEN_LOCKED_BLOCK_EVENTS(reg)            ; \
+                               XEN_PUT_VCPU_INFO(reg)
+/*
+ * Re-enable event delivery: load the vcpu_info pointer into reg, store 0
+ * to its evtchn_upcall_mask byte, then release the vcpu_info reference.
+ */
+#define XEN_UNBLOCK_EVENTS(reg)        XEN_GET_VCPU_INFO(reg)                  ; \
+                               XEN_LOCKED_UNBLOCK_EVENTS(reg)          ; \
+                               XEN_PUT_VCPU_INFO(reg)
+#define XEN_TEST_PENDING(reg)  testb $0xFF,evtchn_upcall_pending(reg)
+
+#ifdef CONFIG_PREEMPT
+#define preempt_stop           GET_THREAD_INFO(%ebp)                   ; \
+                               XEN_BLOCK_EVENTS(%esi)
+#else
+#define preempt_stop
+#define resume_kernel          restore_all
+#endif
+
+#define SAVE_ALL \
+       cld; \
+       pushl %es; \
+       pushl %ds; \
+       pushl %eax; \
+       pushl %ebp; \
+       pushl %edi; \
+       pushl %esi; \
+       pushl %edx; \
+       pushl %ecx; \
+       pushl %ebx; \
+       movl $(__USER_DS), %edx; \
+       movl %edx, %ds; \
+       movl %edx, %es;
+
+#define RESTORE_INT_REGS \
+       popl %ebx;      \
+       popl %ecx;      \
+       popl %edx;      \
+       popl %esi;      \
+       popl %edi;      \
+       popl %ebp;      \
+       popl %eax
+
+#define RESTORE_REGS   \
+       RESTORE_INT_REGS; \
+1:     popl %ds;       \
+2:     popl %es;       \
+.section .fixup,"ax";  \
+3:     movl $0,(%esp); \
+       jmp 1b;         \
+4:     movl $0,(%esp); \
+       jmp 2b;         \
+.previous;             \
+.section __ex_table,"a";\
+       .align 4;       \
+       .long 1b,3b;    \
+       .long 2b,4b;    \
+.previous
+
+
+#define RESTORE_ALL    \
+       RESTORE_REGS    \
+       addl $4, %esp;  \
+1:     iret;           \
+.section .fixup,"ax";   \
+2:     movl $(__USER_DS), %edx; \
+       movl %edx, %ds; \
+       movl %edx, %es; \
+       movl $11,%eax;  \
+       call do_exit;   \
+.previous;             \
+.section __ex_table,"a";\
+       .align 4;       \
+       .long 1b,2b;    \
+.previous
+
+
+/*
+ * Calls schedule_tail with %eax pushed on the stack, reloads the
+ * thread_info pointer into %ebp, restores %eax and joins the common
+ * system call exit path.
+ */
+ENTRY(ret_from_fork)
+       pushl %eax                      # pushed for schedule_tail, popped below
+       call schedule_tail
+       GET_THREAD_INFO(%ebp)           # syscall_exit reads TI_flags(%ebp)
+       popl %eax
+       jmp syscall_exit
+
+/*
+ * Return to user mode is not as complex as all this looks,
+ * but we want the default path for a system call return to
+ * go as quickly as possible which is why some of this is
+ * less clear than it otherwise should be.
+ */
+
+       # userspace resumption stub bypassing syscall exit tracing
+       ALIGN
+ret_from_exception:
+       preempt_stop
+ret_from_intr:
+       GET_THREAD_INFO(%ebp)
+       movl EFLAGS(%esp), %eax         # mix EFLAGS and CS
+       movb CS(%esp), %al
+       testl $(VM_MASK | 2), %eax
+       jz resume_kernel                # returning to kernel or vm86-space
+ENTRY(resume_userspace)
+       XEN_BLOCK_EVENTS(%esi)          # make sure we don't miss an interrupt
+                                       # setting need_resched or sigpending
+                                       # between sampling and the iret
+       movl TI_flags(%ebp), %ecx
+       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
+                                       # int/exception return?
+       jne work_pending
+       jmp restore_all
+
+#ifdef CONFIG_PREEMPT
+ENTRY(resume_kernel)
+       XEN_BLOCK_EVENTS(%esi)
+       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
+       jnz restore_all
+need_resched:
+       movl TI_flags(%ebp), %ecx       # need_resched set ?
+       testb $_TIF_NEED_RESCHED, %cl
+       jz restore_all
+       testb $0xFF,EVENT_MASK(%esp)    # interrupts off (exception path) ?
+       jnz restore_all
+       call preempt_schedule_irq
+       jmp need_resched
+#endif
+
+/* SYSENTER_RETURN points to after the "sysenter" instruction in
+   the vsyscall page.  See vsyscall-sysentry.S, which defines the symbol.  */
+
+       # sysenter call handler stub
+ENTRY(sysenter_entry)
+       movl TSS_sysenter_esp0(%esp),%esp
+sysenter_past_esp:
+       sti
+       pushl $(__USER_DS)
+       pushl %ebp
+       pushfl
+       pushl $(__USER_CS)
+       pushl $SYSENTER_RETURN
+
+/*
+ * Load the potential sixth argument from user stack.
+ * Careful about security.
+ */
+       cmpl $__PAGE_OFFSET-3,%ebp
+       jae syscall_fault
+1:     movl (%ebp),%ebp
+.section __ex_table,"a"
+       .align 4
+       .long 1b,syscall_fault
+.previous
+
+       pushl %eax
+       SAVE_ALL
+       GET_THREAD_INFO(%ebp)
+
+       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+       jnz syscall_trace_entry
+       cmpl $(nr_syscalls), %eax
+       jae syscall_badsys
+       call *sys_call_table(,%eax,4)
+       movl %eax,EAX(%esp)
+       cli
+       movl TI_flags(%ebp), %ecx
+       testw $_TIF_ALLWORK_MASK, %cx
+       jne syscall_exit_work
+/* if something modifies registers it must also disable sysexit */
+       movl EIP(%esp), %edx
+       movl OLDESP(%esp), %ecx
+       xorl %ebp,%ebp
+       sti
+       sysexit
+
+
+       # system call handler stub
+ENTRY(system_call)
+       pushl %eax                      # save orig_eax
+       SAVE_ALL
+       GET_THREAD_INFO(%ebp)
+                                       # system call tracing in operation
+       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+       jnz syscall_trace_entry
+       cmpl $(nr_syscalls), %eax       # unsigned range check on the number
+       jae syscall_badsys
+syscall_call:
+       call *sys_call_table(,%eax,4)   # table of 4-byte handler pointers
+       movl %eax,EAX(%esp)             # store the return value
+syscall_exit:
+       XEN_BLOCK_EVENTS(%esi)          # make sure we don't miss an interrupt
+                                       # setting need_resched or sigpending
+                                       # between sampling and the iret
+       movl TI_flags(%ebp), %ecx
+       testw $_TIF_ALLWORK_MASK, %cx   # current->work
+       jne syscall_exit_work
+/*
+ * Common return path.  EVENT_MASK(%esp) is the event mask byte saved in the
+ * frame (offset CS+2).  If events are masked now but were not masked in the
+ * saved frame, they are re-enabled via restore_all_enable_events before the
+ * registers are restored.
+ */
+restore_all:
+       testl $VM_MASK, EFLAGS(%esp)    # returning to vm86 mode?
+       jnz resume_vm86
+       movb EVENT_MASK(%esp), %al
+       notb %al                        # %al == ~saved_mask
+       XEN_GET_VCPU_INFO(%esi)
+       andb evtchn_upcall_mask(%esi),%al
+       andb $1,%al                     # %al == mask & ~saved_mask
+       jnz restore_all_enable_events   #     != 0 => reenable event delivery
+       XEN_PUT_VCPU_INFO(%esi)
+       RESTORE_ALL
+
+resume_vm86:
+       XEN_UNBLOCK_EVENTS(%esi)
+       RESTORE_REGS
+       movl %eax,(%esp)
+       movl $__HYPERVISOR_switch_vm86,%eax
+       int $0x82
+       ud2
+
+       # perform work that needs to be done immediately before resumption
+       ALIGN
+work_pending:
+       testb $_TIF_NEED_RESCHED, %cl
+       jz work_notifysig
+work_resched:
+       call schedule
+       XEN_BLOCK_EVENTS(%esi)          # make sure we don't miss an interrupt
+                                       # setting need_resched or sigpending
+                                       # between sampling and the iret
+       movl TI_flags(%ebp), %ecx
+       andl $_TIF_WORK_MASK, %ecx      # is there any work to be done other
+                                       # than syscall tracing?
+       jz restore_all
+       testb $_TIF_NEED_RESCHED, %cl
+       jnz work_resched
+
+work_notifysig:                                # deal with pending signals and
+                                       # notify-resume requests
+       testl $VM_MASK, EFLAGS(%esp)
+       movl %esp, %eax
+       jne work_notifysig_v86          # returning to kernel-space or
+                                       # vm86-space
+       xorl %edx, %edx
+       call do_notify_resume
+       jmp restore_all
+
+       ALIGN
+work_notifysig_v86:
+       pushl %ecx                      # save ti_flags for do_notify_resume
+       call save_v86_state             # %eax contains pt_regs pointer
+       popl %ecx
+       movl %eax, %esp
+       xorl %edx, %edx
+       call do_notify_resume
+       jmp restore_all
+
+       # perform syscall entry tracing
+       ALIGN
+syscall_trace_entry:
+       movl $-ENOSYS,EAX(%esp)         # result if the call is never made
+       movl %esp, %eax                 # arg 1: saved register frame
+       xorl %edx,%edx                  # arg 2: 0 on the entry path
+       call do_syscall_trace
+       movl ORIG_EAX(%esp), %eax       # reload the number from the frame
+       cmpl $(nr_syscalls), %eax
+       jnae syscall_call               # in range: dispatch the call
+       jmp syscall_exit                # out of range: leave -ENOSYS in place
+
+       # perform syscall exit tracing
+       # entered with %ecx = TI_flags, loaded by the code that jumps here
+       ALIGN
+syscall_exit_work:
+       testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+       jz work_pending                 # no tracing work: other pending work
+       XEN_UNBLOCK_EVENTS(%esi)        # could let do_syscall_trace() call
+                                       # schedule() instead
+       movl %esp, %eax                 # arg 1: saved register frame
+       movl $1, %edx                   # arg 2: 1 on the exit path
+       call do_syscall_trace
+       jmp resume_userspace
+
+       ALIGN
+syscall_fault:
+       pushl %eax                      # save orig_eax
+       SAVE_ALL
+       GET_THREAD_INFO(%ebp)
+       movl $-EFAULT,EAX(%esp)
+       jmp resume_userspace
+
+       ALIGN
+syscall_badsys:
+       movl $-ENOSYS,EAX(%esp)
+       jmp resume_userspace
+
+#if 0 /* XEN */
+/*
+ * Build the entry stubs and pointer table with
+ * some assembler magic.
+ */
+.data
+ENTRY(interrupt)
+.text
+
+vector=0
+ENTRY(irq_entries_start)
+.rept NR_IRQS
+       ALIGN
+1:     pushl $vector-256
+       jmp common_interrupt
+.data
+       .long 1b
+.text
+vector=vector+1
+.endr
+
+       ALIGN
+common_interrupt:
+       SAVE_ALL
+       movl %esp,%eax
+       call do_IRQ
+       jmp ret_from_intr
+
+#define BUILD_INTERRUPT(name, nr)      \
+ENTRY(name)                            \
+       pushl $nr-256;                  \
+       SAVE_ALL                        \
+       movl %esp,%eax;                 \
+       call smp_/**/name;              \
+       jmp ret_from_intr;
+
+/* The include is where all of the SMP etc. interrupts come from */
+#include "entry_arch.h"
+#endif /* XEN */
+
+/*
+ * divide_error pushes a zero error code and its handler address, then
+ * falls into the common exception path.
+ *
+ * error_code: on entry the top of stack holds the handler address and,
+ * below it, the error code.  After the pushes here these two words sit in
+ * the ES and ORIG_EAX slots of the register frame; they are read out and
+ * replaced with the real %es and -1.  The handler is then called with
+ * %eax = frame pointer and %edx = error code.
+ */
+ENTRY(divide_error)
+       pushl $0                        # no error code
+       pushl $do_divide_error
+       ALIGN
+error_code:
+       pushl %ds
+       pushl %eax
+       xorl %eax, %eax
+       pushl %ebp
+       pushl %edi
+       pushl %esi
+       pushl %edx
+       decl %eax                       # eax = -1
+       pushl %ecx
+       pushl %ebx
+       cld
+       movl %es, %ecx
+       movl ES(%esp), %edi             # get the function address
+       movl ORIG_EAX(%esp), %edx       # get the error code
+       movl %eax, ORIG_EAX(%esp)       # orig_eax slot = -1
+       movl %ecx, ES(%esp)             # ES slot = the real %es
+       movl $(__USER_DS), %ecx
+       movl %ecx, %ds
+       movl %ecx, %es
+       movl %esp,%eax                  # pt_regs pointer
+       call *%edi
+       jmp ret_from_exception
+
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+/*
+ * Event upcall entry.  Builds a register frame, then checks whether the
+ * interrupted EIP lies in [scrit, ecrit).  If so the two frames are merged
+ * by critical_region_fixup; otherwise evtchn_do_upcall is called with a
+ * pointer to the frame and the handler leaves through ret_from_intr.
+ */
+ENTRY(hypervisor_callback)
+       pushl %eax                      # fills the orig_eax slot
+       SAVE_ALL
+       movl EIP(%esp),%eax             # interrupted instruction pointer
+       cmpl $scrit,%eax
+       jb   11f                        # below the critical region
+       cmpl $ecrit,%eax
+       jb   critical_region_fixup      # inside [scrit, ecrit)
+11:    push %esp                       # argument: pointer to the frame
+       call evtchn_do_upcall
+       add  $4,%esp                    # drop the argument
+       jmp  ret_from_intr
+
+        ALIGN
+restore_all_enable_events:  
+       XEN_LOCKED_UNBLOCK_EVENTS(%esi)
+scrit: /**** START OF CRITICAL REGION ****/
+       XEN_TEST_PENDING(%esi)
+       jnz  14f                        # process more events if necessary...
+       XEN_PUT_VCPU_INFO(%esi)
+       RESTORE_ALL
+14:    XEN_LOCKED_BLOCK_EVENTS(%esi)
+       XEN_PUT_VCPU_INFO(%esi)
+       jmp  11b
+ecrit:  /**** END OF CRITICAL REGION ****/
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. We do this quickly using the lookup table
+# 'critical_fixup_table'. For each byte offset in the critical region, it
+# provides the number of bytes which have already been popped from the
+# interrupted stack frame. 
+/*
+ * Entered from hypervisor_callback with %eax = interrupted EIP, which lies
+ * in [scrit, ecrit).  Looks up how many bytes the interrupted code had
+ * already popped, copies that many bytes from the bottom of the new frame
+ * over the top of the old one, and restarts the handler at 11b on the
+ * merged frame.
+ */
+critical_region_fixup:
+       addl $critical_fixup_table-scrit,%eax
+       movzbl (%eax),%eax              # %eax contains num bytes popped
+       cmpb $0xff,%al                  # 0xff => vcpu_info critical region
+       jne  15f
+       GET_THREAD_INFO(%ebp)
+       XEN_PUT_VCPU_INFO(%esi)         # abort vcpu_info critical region
+        xorl %eax,%eax
+15:    mov  %esp,%esi
+       add  %eax,%esi                  # %esi points at end of src region
+       mov  %esp,%edi
+       add  $0x34,%edi                 # %edi points at end of dst region
+                                       # (0x34 = OLDESP: one frame, ebx..eflags)
+       mov  %eax,%ecx
+       shr  $2,%ecx                    # convert bytes to words
+       je   17f                        # skip loop if nothing to copy
+16:    subl $4,%esi                    # pre-decrementing copy loop
+       subl $4,%edi
+       movl (%esi),%eax
+       movl %eax,(%edi)
+       loop 16b
+17:    movl %edi,%esp                  # final %edi is top of merged stack
+       jmp  11b
+
+critical_fixup_table:
+       .byte 0xff,0xff,0xff            # testb $0xff,(%esi) = XEN_TEST_PENDING
+       .byte 0xff,0xff                 # jnz  14f
+       XEN_PUT_VCPU_INFO_fixup
+       .byte 0x00                      # pop  %ebx
+       .byte 0x04                      # pop  %ecx
+       .byte 0x08                      # pop  %edx
+       .byte 0x0c                      # pop  %esi
+       .byte 0x10                      # pop  %edi
+       .byte 0x14                      # pop  %ebp
+       .byte 0x18                      # pop  %eax
+       .byte 0x1c                      # pop  %ds
+       .byte 0x20                      # pop  %es
+       .byte 0x24,0x24,0x24            # add  $4,%esp
+       .byte 0x28                      # iret
+       .byte 0xff,0xff,0xff,0xff       # movb $1,1(%esi)
+       XEN_PUT_VCPU_INFO_fixup
+       .byte 0x00,0x00                 # jmp  11b
+
+# Hypervisor uses this for application faults while it executes.
+/*
+ * Pops %ds, %es, %fs and %gs from the stack, reserves the orig_eax slot,
+ * builds a register frame and leaves through ret_from_exception.  Each pop
+ * has an __ex_table entry: if it faults, the fixup code stores 0 in the
+ * stack slot and retries the same pop.
+ */
+ENTRY(failsafe_callback)
+1:     popl %ds
+2:     popl %es
+3:     popl %fs
+4:     popl %gs
+       subl $4,%esp                    # leave room for the orig_eax slot
+       SAVE_ALL
+       jmp  ret_from_exception
+.section .fixup,"ax";  \
+6:     movl $0,(%esp); \
+       jmp 1b;         \
+7:     movl $0,(%esp); \
+       jmp 2b;         \
+8:     movl $0,(%esp); \
+       jmp 3b;         \
+9:     movl $0,(%esp); \
+       jmp 4b;         \
+.previous;             \
+.section __ex_table,"a";\
+       .align 4;       \
+       .long 1b,6b;    \
+       .long 2b,7b;    \
+       .long 3b,8b;    \
+       .long 4b,9b;    \
+.previous
+
+ENTRY(coprocessor_error)
+       pushl $0
+       pushl $do_coprocessor_error
+       jmp error_code
+
+ENTRY(simd_coprocessor_error)
+       pushl $0
+       pushl $do_simd_coprocessor_error
+       jmp error_code
+
+ENTRY(device_not_available)
+       pushl $-1                       # mark this as an int
+       SAVE_ALL
+       preempt_stop
+       call math_state_restore
+       jmp ret_from_exception
+
+/*
+ * Debug traps and NMI can happen at the one SYSENTER instruction
+ * that sets up the real kernel stack. Check here, since we can't
+ * allow the wrong stack to be used.
+ *
+ * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have
+ * already pushed 3 words if it hits on the sysenter instruction:
+ * eflags, cs and eip.
+ *
+ * We just load the right stack, and push the three (known) values
+ * by hand onto the new stack - while updating the return eip past
+ * the instruction that would have done it for sysenter.
+ */
+#define FIX_STACK(offset, ok, label)           \
+       cmpw $__KERNEL_CS,4(%esp);              \
+       jne ok;                                 \
+label:                                         \
+       movl TSS_sysenter_esp0+offset(%esp),%esp;       \
+       pushfl;                                 \
+       pushl $__KERNEL_CS;                     \
+       pushl $sysenter_past_esp
+
+ENTRY(debug)
+       cmpl $sysenter_entry,(%esp)
+       jne debug_stack_correct
+       FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
+debug_stack_correct:
+       pushl $-1                       # mark this as an int
+       SAVE_ALL
+       xorl %edx,%edx                  # error code 0
+       movl %esp,%eax                  # pt_regs pointer
+       call do_debug
+       jmp ret_from_exception
+
+#if 0 /* XEN */
+/*
+ * NMI is doubly nasty. It can happen _while_ we're handling
+ * a debug fault, and the debug fault hasn't yet been able to
+ * clear up the stack. So we first check whether we got  an
+ * NMI on the sysenter entry path, but after that we need to
+ * check whether we got an NMI on the debug path where the debug
+ * fault happened on the sysenter path.
+ */
+ENTRY(nmi)
+       cmpl $sysenter_entry,(%esp)
+       je nmi_stack_fixup
+       pushl %eax
+       movl %esp,%eax
+       /* Do not access memory above the end of our stack page,
+        * it might not exist.
+        */
+       andl $(THREAD_SIZE-1),%eax
+       cmpl $(THREAD_SIZE-20),%eax
+       popl %eax
+       jae nmi_stack_correct
+       cmpl $sysenter_entry,12(%esp)
+       je nmi_debug_stack_check
+nmi_stack_correct:
+       pushl %eax
+       SAVE_ALL
+       xorl %edx,%edx          # zero error code
+       movl %esp,%eax          # pt_regs pointer
+       call do_nmi
+       RESTORE_ALL
+
+nmi_stack_fixup:
+       FIX_STACK(12,nmi_stack_correct, 1)
+       jmp nmi_stack_correct
+nmi_debug_stack_check:
+       cmpw $__KERNEL_CS,16(%esp)
+       jne nmi_stack_correct
+       cmpl $debug - 1,(%esp)
+       jle nmi_stack_correct
+       cmpl $debug_esp_fix_insn,(%esp)
+       jle nmi_debug_stack_fixup
+nmi_debug_stack_fixup:
+       FIX_STACK(24,nmi_stack_correct, 1)
+       jmp nmi_stack_correct
+#endif /* XEN */
+
+ENTRY(int3)
+       pushl $-1                       # mark this as an int
+       SAVE_ALL
+       xorl %edx,%edx          # zero error code
+       movl %esp,%eax          # pt_regs pointer
+       call do_int3
+       jmp ret_from_exception
+
+ENTRY(overflow)
+       pushl $0
+       pushl $do_overflow
+       jmp error_code
+
+ENTRY(bounds)
+       pushl $0
+       pushl $do_bounds
+       jmp error_code
+
+ENTRY(invalid_op)
+       pushl $0
+       pushl $do_invalid_op
+       jmp error_code
+
+ENTRY(coprocessor_segment_overrun)
+       pushl $0
+       pushl $do_coprocessor_segment_overrun
+       jmp error_code
+
+ENTRY(invalid_TSS)
+       pushl $do_invalid_TSS
+       jmp error_code
+
+ENTRY(segment_not_present)
+       pushl $do_segment_not_present
+       jmp error_code
+
+ENTRY(stack_segment)
+       pushl $do_stack_segment
+       jmp error_code
+
+ENTRY(general_protection)
+       pushl $do_general_protection
+       jmp error_code
+
+ENTRY(alignment_check)
+       pushl $do_alignment_check
+       jmp error_code
+
+# This handler is special, because it gets an extra value on its stack,
+# which is the linear faulting address.
+# fastcall register usage:  %eax = pt_regs, %edx = error code,
+#                          %ecx = fault address
+/*
+ * Same frame construction as error_code, except that the word in the ES
+ * slot is the faulting address (loaded into %ecx) and the handler,
+ * do_page_fault, is called directly.
+ */
+ENTRY(page_fault)
+       pushl %ds
+       pushl %eax
+       xorl %eax, %eax
+       pushl %ebp
+       pushl %edi
+       pushl %esi
+       pushl %edx
+       decl %eax                       /* eax = -1 */
+       pushl %ecx
+       pushl %ebx
+       cld
+       movl %es,%edi
+       movl ES(%esp), %ecx             /* get the faulting address */
+       movl ORIG_EAX(%esp), %edx       /* get the error code */
+       movl %eax, ORIG_EAX(%esp)       /* orig_eax slot = -1 */
+       movl %edi, ES(%esp)             /* ES slot = the real %es */
+       /* NOTE(review): error_code loads __USER_DS here - confirm intent. */
+       movl $(__KERNEL_DS),%eax
+       movl %eax, %ds
+       movl %eax, %es
+       movl %esp,%eax                  /* pt_regs pointer */
+       call do_page_fault
+       jmp ret_from_exception
+
+#ifdef CONFIG_X86_MCE
+ENTRY(machine_check)
+       pushl $0
+       pushl machine_check_vector
+       jmp error_code
+#endif
+
+ENTRY(fixup_4gb_segment)
+       pushl $do_fixup_4gb_segment
+       jmp error_code
+
+#include "syscall_table.S"
+
+syscall_table_size=(.-sys_call_table)
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/head.S
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/head.S
@@ -0,0 +1,198 @@
+
+#include <linux/config.h>
+
+.section __xen_guest                   /* key=value description string; presumably read by the Xen loader (TODO confirm) */
+       .ascii  "GUEST_OS=linux,GUEST_VER=2.6"  /* .ascii appends no terminator, so the pieces form one string */
+       .ascii  ",XEN_VER=3.0"
+       .ascii  ",VIRT_BASE=0xC0000000"
+#ifdef CONFIG_X86_PAE
+       .ascii  ",PAE=yes"
+#else
+       .ascii  ",PAE=no"
+#endif
+       .ascii  ",LOADER=generic"
+       .byte   0                       /* NUL terminator for the concatenated string */
+
+.text
+#include <linux/threads.h>
+#include <linux/linkage.h>
+#include <asm/segment.h>
+#include <asm/thread_info.h>
+#include <asm/asm_offsets.h>
+#include <xen-public/arch-x86_32.h>
+
+/*
+ * References to members of the new_cpu_data structure.
+ */
+
+#define X86            new_cpu_data+CPUINFO_x86
+#define X86_VENDOR     new_cpu_data+CPUINFO_x86_vendor
+#define X86_MODEL      new_cpu_data+CPUINFO_x86_model
+#define X86_MASK       new_cpu_data+CPUINFO_x86_mask
+#define X86_HARD_MATH  new_cpu_data+CPUINFO_hard_math
+#define X86_CPUID      new_cpu_data+CPUINFO_cpuid_level
+#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
+#define X86_VENDOR_ID  new_cpu_data+CPUINFO_x86_vendor_id
+
+ENTRY(startup_32)
+       cld                             /* string copy below must run forwards */
+
+       /* Copy the necessary stuff from xen_start_info structure. */
+       mov  $xen_start_info_union,%edi /* destination of the copy */
+       mov  $512,%ecx                  /* 512 dwords = 2048 bytes */
+       rep movsl                       /* source is %esi, never set here: presumably handed over by Xen (TODO confirm) */
+
+#ifdef CONFIG_SMP
+ENTRY(startup_32_smp)
+       cld
+#endif /* CONFIG_SMP */
+
+       /* Set up the stack pointer */
+       lss stack_start,%esp            /* loads %ss:%esp from the far pointer at stack_start */
+
+checkCPUtype:
+
+       /* get vendor info */
+       xorl %eax,%eax                  # call CPUID with 0 -> return vendor ID
+       cpuid
+       movl %eax,X86_CPUID             # save CPUID level
+       movl %ebx,X86_VENDOR_ID         # lo 4 chars
+       movl %edx,X86_VENDOR_ID+4       # next 4 chars
+       movl %ecx,X86_VENDOR_ID+8       # last 4 chars
+
+       movl $1,%eax            # Use the CPUID instruction to get CPU type
+       cpuid
+       movb %al,%cl            # save reg for future use
+       andb $0x0f,%ah          # mask processor family
+       movb %ah,X86
+       andb $0xf0,%al          # mask model
+       shrb $4,%al
+       movb %al,X86_MODEL
+       andb $0x0f,%cl          # mask stepping (mask revision)
+       movb %cl,X86_MASK
+       movl %edx,X86_CAPABILITY
+
+       incb ready              # count this pass; value 1 identifies the first CPU below
+
+       xorl %eax,%eax                  # Clear FS/GS (no LDT load is done here)
+       movl %eax,%fs
+       movl %eax,%gs
+       cld                     # gcc2 wants the direction flag cleared at all times
+
+#ifdef CONFIG_SMP
+       movb ready, %cl
+       cmpb $1,%cl
+       je 1f                   # the first CPU calls start_kernel
+                               # all other CPUs call initialize_secondary
+       call initialize_secondary
+       jmp L6
+1:
+#endif /* CONFIG_SMP */
+       call start_kernel
+L6:
+       jmp L6                  # main should never return here, but
+                               # just in case, we know what happens.
+
+ENTRY(lgdt_finish)
+       movl $(__KERNEL_DS),%eax        # reload all the segment registers
+       movw %ax,%ss                    # after changing gdt.
+
+       movl $(__USER_DS),%eax          # DS/ES contains default USER segment
+       movw %ax,%ds
+       movw %ax,%es
+
+       popl %eax                       # eax = word on top of the stack (the return address if entered via call); reload CS by intersegment return
+       pushl $(__KERNEL_CS)            # rebuild the frame as a far pointer: selector first ...
+       pushl %eax                      # ... then the address popped above
+       lret                            # far return: loads CS with __KERNEL_CS and resumes at that address
+
+ENTRY(stack_start)                     /* far pointer read by "lss stack_start,%esp" in startup_32 */
+       .long init_thread_union+THREAD_SIZE     /* offset part, loaded into %esp */
+       .long __BOOT_DS                 /* selector part, loaded into %ss */
+
+ready: .byte 0                         /* bumped by "incb ready" on every pass through the startup path */
+
+.globl idt_descr
+.globl cpu_gdt_descr
+
+       ALIGN
+       .word 0                         # 32-bit align idt_desc.address
+idt_descr:
+       .word IDT_ENTRIES*8-1           # idt contains 256 entries
+       .long idt_table
+
+# boot GDT descriptor (later on used by CPU#0):
+       .word 0                         # 32 bit align gdt_desc.address
+cpu_gdt_descr:
+       .word GDT_SIZE
+       .long cpu_gdt_table
+
+       .fill NR_CPUS-1,8,0             # space for the other GDT descriptors
+
+.org 0x1000                            /* advance to offset 0x1000 of the current section */
+ENTRY(empty_zero_page)                 /* label only; nothing is emitted before the next .org */
+
+.org 0x2000
+ENTRY(swapper_pg_dir)                  /* likewise a bare label, at offset 0x2000 */
+
+.org 0x3000
+ENTRY(cpu_gdt_table)
+       .quad 0x0000000000000000        /* NULL descriptor */
+       .quad 0x0000000000000000        /* 0x0b reserved */
+       .quad 0x0000000000000000        /* 0x13 reserved */
+       .quad 0x0000000000000000        /* 0x1b reserved */
+       .quad 0x0000000000000000        /* 0x20 unused */
+       .quad 0x0000000000000000        /* 0x28 unused */
+       .quad 0x0000000000000000        /* 0x33 TLS entry 1 */
+       .quad 0x0000000000000000        /* 0x3b TLS entry 2 */
+       .quad 0x0000000000000000        /* 0x43 TLS entry 3 */
+       .quad 0x0000000000000000        /* 0x4b reserved */
+       .quad 0x0000000000000000        /* 0x53 reserved */
+       .quad 0x0000000000000000        /* 0x5b reserved */
+
+#ifdef CONFIG_X86_PAE
+       .quad 0x00cfbb00000067ff        /* 0x60 kernel code, base 0, limit 0xf67ff pages, DPL 1 */
+       .quad 0x00cfb300000067ff        /* 0x68 kernel data, base 0, limit 0xf67ff pages, DPL 1 */
+       .quad 0x00cffb00000067ff        /* 0x73 user code, base 0, limit 0xf67ff pages, DPL 3 */
+       .quad 0x00cff300000067ff        /* 0x7b user data, base 0, limit 0xf67ff pages, DPL 3 */
+#else
+       .quad 0x00cfbb000000c3ff        /* 0x60 kernel code, base 0, limit 0xfc3ff pages, DPL 1 */
+       .quad 0x00cfb3000000c3ff        /* 0x68 kernel data, base 0, limit 0xfc3ff pages, DPL 1 */
+       .quad 0x00cffb000000c3ff        /* 0x73 user code, base 0, limit 0xfc3ff pages, DPL 3 */
+       .quad 0x00cff3000000c3ff        /* 0x7b user data, base 0, limit 0xfc3ff pages, DPL 3 */
+#endif
+
+       .quad 0x0000000000000000        /* 0x80 TSS descriptor */
+       .quad 0x0000000000000000        /* 0x88 LDT descriptor */
+
+       /* Segments used for calling PnP BIOS */
+       .quad 0x0000000000000000        /* 0x90 32-bit code */
+       .quad 0x0000000000000000        /* 0x98 16-bit code */
+       .quad 0x0000000000000000        /* 0xa0 16-bit data */
+       .quad 0x0000000000000000        /* 0xa8 16-bit data */
+       .quad 0x0000000000000000        /* 0xb0 16-bit data */
+       /*
+        * The APM segments have byte granularity and their bases
+        * and limits are set at run time.
+        */
+       .quad 0x0000000000000000        /* 0xb8 APM CS    code */
+       .quad 0x0000000000000000        /* 0xc0 APM CS 16 code (16 bit) */
+       .quad 0x0000000000000000        /* 0xc8 APM DS    data */
+
+       .quad 0x0000000000000000        /* 0xd0 - unused */
+       .quad 0x0000000000000000        /* 0xd8 - unused */
+       .quad 0x0000000000000000        /* 0xe0 - unused */
+       .quad 0x0000000000000000        /* 0xe8 - unused */
+       .quad 0x0000000000000000        /* 0xf0 - unused */
+       .quad 0x0000000000000000        /* 0xf8 - GDT entry 31: double-fault TSS */
+       .fill GDT_ENTRIES-32,8,0        /* zero-fill the entries after the 32 listed above */
+
+.org 0x4000                            /* advance to offset 0x4000 of the current section */
+ENTRY(default_ldt)                     /* label only; nothing is emitted before the next .org */
+
+.org 0x5000
+/*
+ * Real beginning of normal "text" segment
+ */
+ENTRY(stext)
+ENTRY(_stext)                          /* second label; nothing is emitted between it and stext */
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/i386_ksyms.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/i386_ksyms.c
@@ -0,0 +1,193 @@
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/smp.h>
+#include <linux/user.h>
+#include <linux/elfcore.h>
+#include <linux/mca.h>
+#include <linux/sched.h>
+#include <linux/in6.h>
+#include <linux/interrupt.h>
+#include <linux/smp_lock.h>
+#include <linux/pm.h>
+#include <linux/pci.h>
+#include <linux/apm_bios.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/tty.h>
+#include <linux/highmem.h>
+#include <linux/time.h>
+
+#include <asm/semaphore.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/uaccess.h>
+#include <asm/checksum.h>
+#include <asm/io.h>
+#include <asm/delay.h>
+#include <asm/irq.h>
+#include <asm/mmx.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/nmi.h>
+#include <asm/ist.h>
+#include <asm/kdebug.h>
+
+extern void dump_thread(struct pt_regs *, struct user *);
+extern spinlock_t rtc_lock;
+
+/* This is definitely a GPL-only symbol */
+EXPORT_SYMBOL_GPL(cpu_gdt_table);
+
+#if defined(CONFIG_APM_MODULE)
+extern void machine_real_restart(unsigned char *, int);
+EXPORT_SYMBOL(machine_real_restart);
+extern void default_idle(void);
+EXPORT_SYMBOL(default_idle);
+#endif
+
+#ifdef CONFIG_SMP
+extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
+extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
+#endif
+
+#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE)
+extern struct drive_info_struct drive_info;
+EXPORT_SYMBOL(drive_info);
+#endif /* IDE or HD block driver, built in or as a module */
+
+extern unsigned long cpu_khz;
+extern unsigned long get_cmos_time(void);
+
+/* platform dependent support */
+EXPORT_SYMBOL(boot_cpu_data);
+#ifdef CONFIG_DISCONTIGMEM
+EXPORT_SYMBOL(node_data);
+EXPORT_SYMBOL(physnode_map);
+#endif
+#ifdef CONFIG_X86_NUMAQ
+EXPORT_SYMBOL(xquad_portio);
+#endif
+EXPORT_SYMBOL(dump_thread);
+EXPORT_SYMBOL(dump_fpu);
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+EXPORT_SYMBOL(__ioremap);
+EXPORT_SYMBOL(ioremap_nocache);
+EXPORT_SYMBOL(iounmap);
+EXPORT_SYMBOL(kernel_thread);
+EXPORT_SYMBOL(pm_idle);
+#ifdef CONFIG_ACPI_BOOT
+EXPORT_SYMBOL(pm_power_off);
+#endif
+EXPORT_SYMBOL(get_cmos_time);
+EXPORT_SYMBOL(cpu_khz);
+EXPORT_SYMBOL(apm_info);
+
+EXPORT_SYMBOL(__down_failed);
+EXPORT_SYMBOL(__down_failed_interruptible);
+EXPORT_SYMBOL(__down_failed_trylock);
+EXPORT_SYMBOL(__up_wakeup);
+/* Networking helper routines. */
+EXPORT_SYMBOL(csum_partial_copy_generic);
+/* Delay loops */
+EXPORT_SYMBOL(__ndelay);
+EXPORT_SYMBOL(__udelay);
+EXPORT_SYMBOL(__delay);
+EXPORT_SYMBOL(__const_udelay);
+
+EXPORT_SYMBOL(__get_user_1);
+EXPORT_SYMBOL(__get_user_2);
+EXPORT_SYMBOL(__get_user_4);
+
+EXPORT_SYMBOL(__put_user_1);
+EXPORT_SYMBOL(__put_user_2);
+EXPORT_SYMBOL(__put_user_4);
+EXPORT_SYMBOL(__put_user_8);
+
+EXPORT_SYMBOL(strpbrk);
+EXPORT_SYMBOL(strstr);
+
+EXPORT_SYMBOL(strncpy_from_user);
+EXPORT_SYMBOL(__strncpy_from_user);
+EXPORT_SYMBOL(clear_user);
+EXPORT_SYMBOL(__clear_user);
+EXPORT_SYMBOL(__copy_from_user_ll);
+EXPORT_SYMBOL(__copy_to_user_ll);
+EXPORT_SYMBOL(strnlen_user);
+
+EXPORT_SYMBOL(dma_alloc_coherent);
+EXPORT_SYMBOL(dma_free_coherent);
+
+#ifdef CONFIG_PCI
+EXPORT_SYMBOL(pci_mem_start);
+#endif
+
+#ifdef CONFIG_PCI_BIOS
+EXPORT_SYMBOL(pcibios_set_irq_routing);
+EXPORT_SYMBOL(pcibios_get_irq_routing_table);
+#endif
+
+#ifdef CONFIG_X86_USE_3DNOW
+EXPORT_SYMBOL(_mmx_memcpy);
+EXPORT_SYMBOL(mmx_clear_page);
+EXPORT_SYMBOL(mmx_copy_page);
+#endif
+
+#ifdef CONFIG_X86_HT
+EXPORT_SYMBOL(smp_num_siblings);
+EXPORT_SYMBOL(cpu_sibling_map);
+#endif
+
+#ifdef CONFIG_SMP
+EXPORT_SYMBOL(cpu_data);
+EXPORT_SYMBOL(cpu_online_map);
+EXPORT_SYMBOL(cpu_callout_map);
+EXPORT_SYMBOL(__write_lock_failed);
+EXPORT_SYMBOL(__read_lock_failed);
+
+/* Global SMP stuff */
+EXPORT_SYMBOL(smp_call_function);
+
+/* TLB flushing */
+EXPORT_SYMBOL(flush_tlb_page);
+#endif
+
+#ifdef CONFIG_X86_IO_APIC
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+#endif
+
+#ifdef CONFIG_MCA
+EXPORT_SYMBOL(machine_id);
+#endif
+
+#ifdef CONFIG_VT
+EXPORT_SYMBOL(screen_info);
+#endif
+
+EXPORT_SYMBOL(get_wchan);
+
+EXPORT_SYMBOL(rtc_lock);
+
+EXPORT_SYMBOL_GPL(set_nmi_callback);
+EXPORT_SYMBOL_GPL(unset_nmi_callback);
+
+EXPORT_SYMBOL(register_die_notifier);
+#ifdef CONFIG_HAVE_DEC_LOCK
+EXPORT_SYMBOL(_atomic_dec_and_lock);
+#endif
+
+EXPORT_SYMBOL(__PAGE_KERNEL);
+
+#ifdef CONFIG_HIGHMEM
+EXPORT_SYMBOL(kmap);
+EXPORT_SYMBOL(kunmap);
+EXPORT_SYMBOL(kmap_atomic);
+EXPORT_SYMBOL(kunmap_atomic);
+EXPORT_SYMBOL(kmap_atomic_to_page);
+#endif
+
+#if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE)
+EXPORT_SYMBOL(ist_info);
+#endif /* SpeedStep SMI driver, built in or as a module */
+
+EXPORT_SYMBOL(csum_partial);
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/io_apic.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/io_apic.c
@@ -0,0 +1,2609 @@
+/*
+ *     Intel IO-APIC support for multi-Pentium hosts.
+ *
+ *     Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
+ *
+ *     Many thanks to Stig Venaas for trying out countless experimental
+ *     patches and reporting/debugging problems patiently!
+ *
+ *     (c) 1999, Multiple IO-APIC support, developed by
+ *     Ken-ichi Yaku <yaku@xxxxxxxxxxxxxxxxxxxx> and
+ *      Hidemi Kishimoto <kisimoto@xxxxxxxxxxxxxxxxxxxx>,
+ *     further tested and cleaned up by Zach Brown <zab@xxxxxxxxxx>
+ *     and Ingo Molnar <mingo@xxxxxxxxxx>
+ *
+ *     Fixes
+ *     Maciej W. Rozycki       :       Bits for genuine 82489DX APICs;
+ *                                     thanks to Eric Gilmore
+ *                                     and Rolf G. Tews
+ *                                     for testing these extensively
+ *     Paul Diefenbaugh        :       Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+#include <linux/config.h>
+#include <linux/smp_lock.h>
+#include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
+#include <linux/acpi.h>
+
+#include <linux/sysdev.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/desc.h>
+#include <asm/timer.h>
+
+#include <mach_apic.h>
+
+#include "io_ports.h"
+
+#ifdef CONFIG_X86_XEN
+
+#include <xen-public/xen.h>
+#include <xen-public/physdev.h>
+
+/* Fake i8259 */
+#define make_8259A_irq(_irq)     (io_apic_irqs &= ~(1UL<<(_irq)))
+#define disable_8259A_irq(_irq)  ((void)0)
+#define i8259A_irq_pending(_irq) (0)
+
+unsigned long io_apic_irqs;
+
+static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
+{
+       physdev_op_t op;        /* request block handed to the hypervisor */
+       int ret;
+
+       op.cmd = PHYSDEVOP_APIC_READ;
+       op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid;        /* apic is an index into mp_ioapics[] */
+       op.u.apic_op.offset = reg;
+       ret = HYPERVISOR_physdev_op(&op);
+       if (ret)
+               return ret;     /* NOTE(review): non-zero status is returned as if it were the register value */
+       return op.u.apic_op.value;      /* value field of the request after the call */
+}
+
+static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+       physdev_op_t op;        /* request block handed to the hypervisor */
+
+       op.cmd = PHYSDEVOP_APIC_WRITE;
+       op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid;        /* apic is an index into mp_ioapics[] */
+       op.u.apic_op.offset = reg;
+       op.u.apic_op.value = value;
+       HYPERVISOR_physdev_op(&op);     /* return status is ignored */
+}
+
+#define io_apic_read(a,r)    xen_io_apic_read(a,r)
+#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
+
+#endif /* CONFIG_X86_XEN */
+
+int (*ioapic_renumber_irq)(int ioapic, int irq);
+atomic_t irq_mis_count;
+
+static DEFINE_SPINLOCK(ioapic_lock);
+
+/*
+ *     Is the SiS APIC rmw bug present ?
+ *     -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
+
+/*
+ * # of IRQ routing registers
+ */
+int nr_ioapic_registers[MAX_IO_APICS];
+
+/*
+ * Rough estimation of how many shared IRQs there are, can
+ * be changed anytime.
+ */
+#define MAX_PLUS_SHARED_IRQS NR_IRQS
+#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
+
+/*
+ * This is performance-critical, we want to do it O(1)
+ *
+ * the indexing order of this array favors 1:1 mappings
+ * between pins and IRQs.
+ */
+
+static struct irq_pin_list {
+       int apic, pin, next;
+} irq_2_pin[PIN_MAP_SIZE];
+
+int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
+#ifdef CONFIG_PCI_MSI
+#define vector_to_irq(vector)  \
+       (platform_legacy_irq(vector) ? vector : vector_irq[vector])
+#else
+#define vector_to_irq(vector)  (vector)
+#endif
+
+/*
+ * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
+ * shared ISA-space IRQs, so we have to support them. We are super
+ * fast in the common case, and fast for shared ISA-space IRQs.
+ */
+static void add_pin_to_irq(unsigned int irq, int apic, int pin)
+{
+       static int first_free_entry = NR_IRQS;  /* next unused overflow slot; slots below NR_IRQS are indexed by irq */
+       struct irq_pin_list *entry = irq_2_pin + irq;
+
+       while (entry->next)     /* walk to the tail of this irq's chain; next == 0 ends it */
+               entry = irq_2_pin + entry->next;
+
+       if (entry->pin != -1) { /* tail is in use (assumes free entries hold pin == -1, set elsewhere): chain a new slot */
+               entry->next = first_free_entry;
+               entry = irq_2_pin + entry->next;
+               if (++first_free_entry >= PIN_MAP_SIZE)
+                       panic("io_apic.c: whoops");     /* overflow area of irq_2_pin[] used up */
+       }
+       entry->apic = apic;
+       entry->pin = pin;
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Reroute an IRQ to a different pin.
+ */
+static void __init replace_pin_at_irq(unsigned int irq,
+                                     int oldapic, int oldpin,
+                                     int newapic, int newpin)
+{
+       struct irq_pin_list *entry = irq_2_pin + irq;   /* head of this irq's chain */
+
+       while (1) {
+               if (entry->apic == oldapic && entry->pin == oldpin) {   /* every matching entry is rewritten, not just the first */
+                       entry->apic = newapic;
+                       entry->pin = newpin;
+               }
+               if (!entry->next)       /* next == 0 marks the end of the chain */
+                       break;
+               entry = irq_2_pin + entry->next;
+       }
+}
+
+static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
+{
+       struct irq_pin_list *entry = irq_2_pin + irq;   /* head of this irq's pin chain */
+       unsigned int pin, reg;
+
+       for (;;) {
+               pin = entry->pin;
+               if (pin == -1)          /* -1 is converted to unsigned for the compare; marks an empty entry */
+                       break;
+               reg = io_apic_read(entry->apic, 0x10 + pin*2);
+               reg &= ~disable;        /* clear the requested bits first ... */
+               reg |= enable;          /* ... then set the requested bits */
+               io_apic_modify(entry->apic, 0x10 + pin*2, reg);
+               if (!entry->next)       /* next == 0 marks the end of the chain */
+                       break;
+               entry = irq_2_pin + entry->next;
+       }
+}
+
+/* mask = 1: set bit 16 (0x00010000) in register 0x10 + pin*2 of every pin chained to irq */
+static void __mask_IO_APIC_irq (unsigned int irq)
+{
+       __modify_IO_APIC_irq(irq, 0x00010000, 0);
+}
+
+/* mask = 0: clear bit 16 (0x00010000) in register 0x10 + pin*2 of every pin chained to irq */
+static void __unmask_IO_APIC_irq (unsigned int irq)
+{
+       __modify_IO_APIC_irq(irq, 0, 0x00010000);
+}
+
+/* mask = 1, trigger = 0: set bit 16 (0x00010000) and clear bit 15 (0x00008000) */
+static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
+{
+       __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+}
+
+/* mask = 0, trigger = 1: set bit 15 (0x00008000) and clear bit 16 (0x00010000) */
+static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
+{
+       __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
+}
+
+static void mask_IO_APIC_irq (unsigned int irq) /* __mask_IO_APIC_irq() run under ioapic_lock */
+{
+       unsigned long flags;    /* interrupt state saved by spin_lock_irqsave() */
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __mask_IO_APIC_irq(irq);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void unmask_IO_APIC_irq (unsigned int irq)       /* __unmask_IO_APIC_irq() run under ioapic_lock */
+{
+       unsigned long flags;    /* interrupt state saved by spin_lock_irqsave() */
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __unmask_IO_APIC_irq(irq);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)      /* mask one routing entry unless it is an SMI pin */
+{
+       struct IO_APIC_route_entry entry;
+       unsigned long flags;
+       
+       /* Check delivery_mode to be sure we're not clearing an SMI pin */
+       spin_lock_irqsave(&ioapic_lock, flags);
+       *(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);     /* first int of the entry */
+       *(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);     /* second int of the entry */
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+       if (entry.delivery_mode == dest_SMI)
+               return; /* leave SMI pins exactly as found */
+
+       /*
+        * Disable it in the IO-APIC irq-routing table:
+        */
+       memset(&entry, 0, sizeof(entry));       /* every field zero ... */
+       entry.mask = 1;                         /* ... except the mask bit */
+       spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
+       io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void clear_IO_APIC (void)        /* apply clear_IO_APIC_pin() to every pin of every IO-APIC */
+{
+       int apic, pin;
+
+       for (apic = 0; apic < nr_ioapics; apic++)
+               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)   /* per-APIC pin count */
+                       clear_IO_APIC_pin(apic, pin);
+}
+
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)        /* write the destination for every pin of irq */
+{
+       unsigned long flags;
+       int pin;
+       struct irq_pin_list *entry = irq_2_pin + irq;   /* head of this irq's pin chain */
+       unsigned int apicid_value;
+       
+       apicid_value = cpu_mask_to_apicid(cpumask);
+       /* Prepare to do the io_apic_write */
+       apicid_value = apicid_value << 24;      /* value goes in the top byte of the register written below */
+       spin_lock_irqsave(&ioapic_lock, flags);
+       for (;;) {
+               pin = entry->pin;
+               if (pin == -1)  /* empty entry: nothing routed */
+                       break;
+               io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);     /* second register of the pin's pair */
+               if (!entry->next)       /* next == 0 marks the end of the chain */
+                       break;
+               entry = irq_2_pin + entry->next;
+       }
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+#else
+#define clear_IO_APIC() ((void)0)
+#endif
+
+#if defined(CONFIG_IRQBALANCE)
+# include <asm/processor.h>    /* kernel_thread() */
+# include <linux/kernel_stat.h>        /* kstat */
+# include <linux/slab.h>               /* kmalloc() */
+# include <linux/timer.h>      /* time_after() */
+ 
+# ifdef CONFIG_BALANCED_IRQ_DEBUG /* debug on: prefix each message with jiffies, file and line */
+#  define TDprintk(x...) do { printk("<%ld:%s:%d>: ", jiffies, __FILE__, __LINE__); printk(x); } while (0)
+#  define Dprintk(x...) do { TDprintk(x); } while (0)
+# else /* debug off: both macros expand to nothing */
+#  define TDprintk(x...) 
+#  define Dprintk(x...) 
+# endif
+
+cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS];
+
+#define IRQBALANCE_CHECK_ARCH -999
+static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
+static int physical_balance = 0;
+
+static struct irq_cpu_info {
+       unsigned long * last_irq;
+       unsigned long * irq_delta;
+       unsigned long irq;
+} irq_cpu_data[NR_CPUS];
+
+#define CPU_IRQ(cpu)           (irq_cpu_data[cpu].irq)
+#define LAST_CPU_IRQ(cpu,irq)   (irq_cpu_data[cpu].last_irq[irq])
+#define IRQ_DELTA(cpu,irq)     (irq_cpu_data[cpu].irq_delta[irq])
+
+#define IDLE_ENOUGH(cpu,now) \
+       (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
+
+#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
+
+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
+
+#define MAX_BALANCED_IRQ_INTERVAL      (5*HZ)
+#define MIN_BALANCED_IRQ_INTERVAL      (HZ/2)
+#define BALANCED_IRQ_MORE_DELTA                (HZ/10)
+#define BALANCED_IRQ_LESS_DELTA                (HZ)
+
+static long balanced_irq_interval = MAX_BALANCED_IRQ_INTERVAL;
+
+static unsigned long move(int curr_cpu, cpumask_t allowed_mask, /* pick the next acceptable CPU after/before curr_cpu */
+                       unsigned long now, int direction)
+{
+       int search_idle = 1;    /* first lap: accept only CPUs that pass IDLE_ENOUGH() */
+       int cpu = curr_cpu;
+
+       goto inside;            /* enter the loop at the stepping code, skipping the first lap check */
+
+       do {
+               if (unlikely(cpu == curr_cpu))
+                       search_idle = 0;        /* came all the way round: stop requiring an idle CPU */
+inside:
+               if (direction == 1) {   /* step upwards, wrapping to 0 */
+                       cpu++;
+                       if (cpu >= NR_CPUS)
+                               cpu = 0;
+               } else {                /* step downwards, wrapping to NR_CPUS-1 */
+                       cpu--;
+                       if (cpu == -1)
+                               cpu = NR_CPUS-1;
+               }
+       } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) ||
+                       (search_idle && !IDLE_ENOUGH(cpu,now)));
+
+       return cpu;     /* int converted to the unsigned long return type */
+}
+
+static inline void balance_irq(int cpu, int irq)        /* queue irq for a move to the next CPU chosen by move() */
+{
+       unsigned long now = jiffies;
+       cpumask_t allowed_mask;
+       unsigned int new_cpu;
+               
+       if (irqbalance_disabled)
+               return; 
+
+       cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);      /* online CPUs that the irq's affinity permits */
+       new_cpu = move(cpu, allowed_mask, now, 1);      /* direction 1 = search upwards */
+       if (cpu != new_cpu) {
+               irq_desc_t *desc = irq_desc + irq;
+               unsigned long flags;
+
+               spin_lock_irqsave(&desc->lock, flags);
+               pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu);     /* only records the target; nothing is reprogrammed here */
+               spin_unlock_irqrestore(&desc->lock, flags);
+       }
+}
+
+static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)  /* balance_irq() every sufficiently busy IRQ on every online CPU */
+{
+       int i, j;
+       Dprintk("Rotating IRQs among CPUs.\n");
+       for (i = 0; i < NR_CPUS; i++) {
+               for (j = 0; cpu_online(i) && (j < NR_IRQS); j++) {      /* offline CPUs fall out on the first test */
+                       if (!irq_desc[j].action)        /* no handler registered for this IRQ */
+                               continue;
+                       /* Is it a significant load ?  */
+                       if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) <
+                                               useful_load_threshold)
+                               continue;
+                       balance_irq(i, j);
+               }
+       }
+       balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+               balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       /* shorten the interval, bounded below */
+       return;
+}
+
+static void do_irq_balance(void)        /* one pass: measure load, then retarget one IRQ, rotate, or lengthen the interval */
+{
+       int i, j;
+       unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);      /* (~0) converts to all-ones as unsigned long */
+       unsigned long move_this_load = 0;
+       int max_loaded = 0, min_loaded = 0;
+       int load;
+       unsigned long useful_load_threshold = balanced_irq_interval + 10;
+       int selected_irq;
+       int tmp_loaded, first_attempt = 1;
+       unsigned long tmp_cpu_irq;
+       unsigned long imbalance = 0;
+       cpumask_t allowed_mask, target_cpu_mask, tmp;
+
+       for (i = 0; i < NR_CPUS; i++) { /* phase 1: per-CPU and per-package interrupt deltas since the last pass */
+               int package_index;
+               CPU_IRQ(i) = 0;
+               if (!cpu_online(i))
+                       continue;
+               package_index = CPU_TO_PACKAGEINDEX(i);
+               for (j = 0; j < NR_IRQS; j++) {
+                       unsigned long value_now, delta;
+                       /* Is this an active IRQ? */
+                       if (!irq_desc[j].action)
+                               continue;
+                       if ( package_index == i )       /* reset the package total when visiting its first sibling */
+                               IRQ_DELTA(package_index,j) = 0;
+                       /* Determine the total count per processor per IRQ */
+                       value_now = (unsigned long) kstat_cpu(i).irqs[j];
+
+                       /* Determine the activity per processor per IRQ */
+                       delta = value_now - LAST_CPU_IRQ(i,j);
+
+                       /* Update last_cpu_irq[][] for the next time */
+                       LAST_CPU_IRQ(i,j) = value_now;
+
+                       /* Ignore IRQs whose rate is less than the clock */
+                       if (delta < useful_load_threshold)
+                               continue;
+                       /* update the load for the processor or package total */
+                       IRQ_DELTA(package_index,j) += delta;
+
+                       /* Keep track of the higher numbered sibling as well */
+                       if (i != package_index)
+                               CPU_IRQ(i) += delta;
+                       /*
+                        * We have sibling A and sibling B in the package
+                        *
+                        * cpu_irq[A] = load for cpu A + load for cpu B
+                        * cpu_irq[B] = load for cpu B
+                        */
+                       CPU_IRQ(package_index) += delta;
+               }
+       }
+       /* Find the least loaded processor package */
+       for (i = 0; i < NR_CPUS; i++) {
+               if (!cpu_online(i))
+                       continue;
+               if (i != CPU_TO_PACKAGEINDEX(i))        /* only the first sibling represents a package */
+                       continue;
+               if (min_cpu_irq > CPU_IRQ(i)) {
+                       min_cpu_irq = CPU_IRQ(i);
+                       min_loaded = i;
+               }
+       }
+       max_cpu_irq = ULONG_MAX;        /* no upper bound for the first "heaviest" search */
+
+tryanothercpu:
+       /* Look for heaviest loaded processor.
+        * We may come back to get the next heaviest loaded processor.
+        * Skip processors with trivial loads.
+        */
+       tmp_cpu_irq = 0;
+       tmp_loaded = -1;
+       for (i = 0; i < NR_CPUS; i++) {
+               if (!cpu_online(i))
+                       continue;
+               if (i != CPU_TO_PACKAGEINDEX(i))
+                       continue;
+               if (max_cpu_irq <= CPU_IRQ(i))  /* skip packages at or above the previous candidate's load */
+                       continue;
+               if (tmp_cpu_irq < CPU_IRQ(i)) {
+                       tmp_cpu_irq = CPU_IRQ(i);
+                       tmp_loaded = i;
+               }
+       }
+
+       if (tmp_loaded == -1) { /* no package with non-zero load below the bound */
+        /* In the case of small number of heavy interrupt sources, 
+         * loading some of the cpus too much. We use Ingo's original 
+         * approach to rotate them around.
+         */
+               if (!first_attempt && imbalance >= useful_load_threshold) {
+                       rotate_irqs_among_cpus(useful_load_threshold);
+                       return;
+               }
+               goto not_worth_the_effort;
+       }
+       
+       first_attempt = 0;              /* heaviest search */
+       max_cpu_irq = tmp_cpu_irq;      /* load */
+       max_loaded = tmp_loaded;        /* processor */
+       imbalance = (max_cpu_irq - min_cpu_irq) / 2;
+       
+       Dprintk("max_loaded cpu = %d\n", max_loaded);
+       Dprintk("min_loaded cpu = %d\n", min_loaded);
+       Dprintk("max_cpu_irq load = %ld\n", max_cpu_irq);
+       Dprintk("min_cpu_irq load = %ld\n", min_cpu_irq);
+       Dprintk("load imbalance = %lu\n", imbalance);
+
+       /* if imbalance is less than approx 10% of max load, then
+        * observe diminishing returns action. - quit
+        */
+       if (imbalance < (max_cpu_irq >> 3)) {   /* the shift compares against one eighth of the max load */
+               Dprintk("Imbalance too trivial\n");
+               goto not_worth_the_effort;
+       }
+
+tryanotherirq:
+       /* if we select an IRQ to move that can't go where we want, then
+        * see if there is another one to try.
+        */
+       move_this_load = 0;
+       selected_irq = -1;
+       for (j = 0; j < NR_IRQS; j++) {
+               /* Is this an active IRQ? */
+               if (!irq_desc[j].action)
+                       continue;
+               if (imbalance <= IRQ_DELTA(max_loaded,j))       /* moving this IRQ would move too much */
+                       continue;
+               /* Try to find the IRQ that is closest to the imbalance
+                * without going over.
+                */
+               if (move_this_load < IRQ_DELTA(max_loaded,j)) {
+                       move_this_load = IRQ_DELTA(max_loaded,j);
+                       selected_irq = j;
+               }
+       }
+       if (selected_irq == -1) {
+               goto tryanothercpu;
+       }
+
+       imbalance = move_this_load;     /* a retry via tryanotherirq must now pick a strictly smaller load */
+       
+       /* For physical_balance case, we accumlated both load
+        * values in the one of the siblings cpu_irq[],
+        * to use the same code for physical and logical processors
+        * as much as possible. 
+        *
+        * NOTE: the cpu_irq[] array holds the sum of the load for
+        * sibling A and sibling B in the slot for the lowest numbered
+        * sibling (A), _AND_ the load for sibling B in the slot for
+        * the higher numbered sibling.
+        *
+        * We seek the least loaded sibling by making the comparison
+        * (A+B)/2 vs B
+        */
+       load = CPU_IRQ(min_loaded) >> 1;
+       for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) {
+               if (load > CPU_IRQ(j)) {
+                       /* This won't change cpu_sibling_map[min_loaded] */
+                       load = CPU_IRQ(j);
+                       min_loaded = j;
+               }
+       }
+
+       cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
+       target_cpu_mask = cpumask_of_cpu(min_loaded);
+       cpus_and(tmp, target_cpu_mask, allowed_mask);   /* non-empty only if the target CPU is online and permitted */
+
+       if (!cpus_empty(tmp)) {
+               irq_desc_t *desc = irq_desc + selected_irq;
+               unsigned long flags;
+
+               Dprintk("irq = %d moved to cpu = %d\n",
+                               selected_irq, min_loaded);
+               /* mark for change destination */
+               spin_lock_irqsave(&desc->lock, flags);
+               pending_irq_balance_cpumask[selected_irq] =
+                                       cpumask_of_cpu(min_loaded);
+               spin_unlock_irqrestore(&desc->lock, flags);
+               /* Since we made a change, come back sooner to 
+                * check for more variation.
+                */
+               balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
+                       balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);       
+               return;
+       }
+       goto tryanotherirq;
+
+not_worth_the_effort:
+       /*
+        * if we did not find an IRQ to move, then adjust the time interval
+        * upward
+        */
+       balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
+               balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);       
+       Dprintk("IRQ worth rotating not found\n");
+       return;
+}
+
+/*
+ * Main loop of the "kirqd" kernel thread.
+ *
+ * Marks every IRQ as pending a move to CPU 0, then sleeps interruptibly
+ * and calls do_irq_balance() each time more than balanced_irq_interval
+ * jiffies have passed since the previous balancing run.  The loop never
+ * terminates; @unused is ignored.
+ */
+static int balanced_irq(void *unused)
+{
+       unsigned long last_run = jiffies;
+       long sleep_left = balanced_irq_interval;
+       int irq;
+
+       daemonize("kirqd");
+
+       /* Known starting point: everything is queued to move to CPU 0. */
+       for (irq = 0; irq < NR_IRQS; irq++)
+               pending_irq_balance_cpumask[irq] = cpumask_of_cpu(0);
+
+       while (1) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               sleep_left = schedule_timeout(sleep_left);
+               try_to_freeze(PF_FREEZE);
+
+               /* Interval not over yet: sleep again for what is left. */
+               if (!time_after(jiffies, last_run + balanced_irq_interval))
+                       continue;
+
+               do_irq_balance();
+               last_run = jiffies;
+               sleep_left = balanced_irq_interval;
+       }
+       return 0;
+}
+
+/*
+ * Late initcall that sets up and starts the IRQ balancing thread.
+ *
+ * Does nothing when balancing is disabled (command line or
+ * NO_BALANCE_IRQ) or when fewer than two CPUs are online.  Otherwise it
+ * allocates zeroed per-CPU irq_delta/last_irq arrays (NR_IRQS entries
+ * each) for every online CPU and spawns balanced_irq().  On allocation
+ * or thread-creation failure every array is released again.
+ *
+ * Always returns 0.
+ */
+static int __init balanced_irq_init(void)
+{
+       int i;
+       cpumask_t tmp;
+
+       cpus_shift_right(tmp, cpu_online_map, 2);
+       /* When not overwritten by the command line ask subarchitecture. */
+       if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
+               irqbalance_disabled = NO_BALANCE_IRQ;
+       if (irqbalance_disabled)
+               return 0;
+
+       /* disable irqbalance completely if there is only one processor online */
+       if (num_online_cpus() < 2) {
+               irqbalance_disabled = 1;
+               return 0;
+       }
+       /*
+        * Enable physical balance only if more than 1 physical processor
+        * is present
+        */
+       if (smp_num_siblings > 1 && !cpus_empty(tmp))
+               physical_balance = 1;
+
+       for (i = 0; i < NR_CPUS; i++) {
+               if (!cpu_online(i))
+                       continue;
+               irq_cpu_data[i].irq_delta =
+                       kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+               irq_cpu_data[i].last_irq =
+                       kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
+               if (irq_cpu_data[i].irq_delta == NULL ||
+                   irq_cpu_data[i].last_irq == NULL) {
+                       printk(KERN_ERR "balanced_irq_init: out of memory\n");
+                       goto failed;
+               }
+               memset(irq_cpu_data[i].irq_delta, 0,
+                      sizeof(unsigned long) * NR_IRQS);
+               memset(irq_cpu_data[i].last_irq, 0,
+                      sizeof(unsigned long) * NR_IRQS);
+       }
+
+       printk(KERN_INFO "Starting balanced_irq\n");
+       if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0)
+               return 0;
+       else
+               printk(KERN_ERR
+                      "balanced_irq_init: failed to spawn balanced_irq\n");
+failed:
+       /*
+        * kfree(NULL) is a no-op, so slots that were never allocated need
+        * no guard.  Clear the pointers so the static irq_cpu_data[] does
+        * not keep references to freed memory.
+        */
+       for (i = 0; i < NR_CPUS; i++) {
+               kfree(irq_cpu_data[i].irq_delta);
+               irq_cpu_data[i].irq_delta = NULL;
+               kfree(irq_cpu_data[i].last_irq);
+               irq_cpu_data[i].last_irq = NULL;
+       }
+       return 0;
+}
+
+/*
+ * "noirqbalance" boot option handler: turns the in-kernel IRQ balancer
+ * off.  @str is unused.
+ *
+ * Returns 1 to tell the boot-parameter parser the option was consumed;
+ * returning 0 would make it treat "noirqbalance" as unhandled and pass
+ * it on to init.
+ */
+int __init irqbalance_disable(char *str)
+{
+       irqbalance_disabled = 1;
+       return 1;
+}
+
+__setup("noirqbalance", irqbalance_disable);
+
+/*
+ * Apply a pending affinity change for @irq, if one has been queued in
+ * pending_irq_balance_cpumask[], and then clear the pending mask.
+ */
+static inline void move_irq(int irq)
+{
+       /* note - we hold the desc->lock */
+       if (likely(cpus_empty(pending_irq_balance_cpumask[irq])))
+               return;
+
+       set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]);
+       cpus_clear(pending_irq_balance_cpumask[irq]);
+}
+
+late_initcall(balanced_irq_init);
+
+#else /* !CONFIG_IRQBALANCE */
+/* Built without CONFIG_IRQBALANCE: nothing is ever queued, so this is a no-op. */
+static inline void move_irq(int irq) { }
+#endif /* CONFIG_IRQBALANCE */
+
+#ifndef CONFIG_SMP
+/*
+ * Send an IPI carrying @vector to the current CPU through the local APIC.
+ * Compiles to an empty function when CONFIG_XEN is set.
+ */
+void fastcall send_IPI_self(int vector)
+{
+#ifndef CONFIG_XEN
+       unsigned int icr_cmd = APIC_DM_FIXED | APIC_DEST_SELF | vector |
+                              APIC_DEST_LOGICAL;
+
+       /* Wait for idle. */
+       apic_wait_icr_idle();
+
+       /* Send the IPI. The write to APIC_ICR fires this off. */
+       apic_write_around(APIC_ICR, icr_cmd);
+#endif
+}
+#endif /* !CONFIG_SMP */
+
+
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries [MAX_PIRQS];
+static int pirqs_enabled;
+int skip_ioapic_setup;
+
+/*
+ * Boot option handler (registered for "noapic" just below): requests that
+ * IO-APIC setup be skipped by setting skip_ioapic_setup.  @str is unused.
+ * Returns 1.
+ */
+static int __init ioapic_setup(char *str)
+{
+       skip_ioapic_setup = 1;
+       return 1;
+}
+
+__setup("noapic", ioapic_setup);
+
+/*
+ * "pirq=" boot option handler.
+ *
+ * Parses up to MAX_PIRQS integers from @str, resets every pirq_entries[]
+ * slot to -1, and stores the parsed values in reverse order (the first
+ * value lands in the last slot).  Sets pirqs_enabled.  Returns 1.
+ */
+static int __init ioapic_pirq_setup(char *str)
+{
+       int parsed[MAX_PIRQS+1];        /* [0] = count, [1..] = values */
+       int n, count;
+
+       get_options(str, ARRAY_SIZE(parsed), parsed);
+
+       for (n = 0; n < MAX_PIRQS; n++)
+               pirq_entries[n] = -1;
+
+       pirqs_enabled = 1;
+       apic_printk(APIC_VERBOSE, KERN_INFO
+                       "PIRQ redirection, working around broken MP-BIOS.\n");
+
+       count = (parsed[0] < MAX_PIRQS) ? parsed[0] : MAX_PIRQS;
+
+       for (n = 1; n <= count; n++) {
+               apic_printk(APIC_VERBOSE, KERN_DEBUG
+                               "... PIRQ%d -> IRQ %d\n", n - 1, parsed[n]);
+               /*
+                * PIRQs are mapped upside down, usually.
+                */
+               pirq_entries[MAX_PIRQS - n] = parsed[n];
+       }
+       return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+
+/*
+ * Find the IRQ entry number of a certain pin.
+ */
+/*
+ * Return the index of the first mp_irqs[] entry of interrupt type @type
+ * whose destination pin is @pin and whose destination APIC is either
+ * IO-APIC @apic or MP_APIC_ALL.  Returns -1 when there is no such entry.
+ */
+static int find_irq_entry(int apic, int pin, int type)
+{
+       int n;
+
+       for (n = 0; n < mp_irq_entries; n++) {
+               if (mp_irqs[n].mpc_irqtype != type)
+                       continue;
+               if (mp_irqs[n].mpc_dstirq != pin)
+                       continue;
+               if (mp_irqs[n].mpc_dstapic == mp_ioapics[apic].mpc_apicid ||
+                   mp_irqs[n].mpc_dstapic == MP_APIC_ALL)
+                       return n;
+       }
+
+       return -1;
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Find the pin to which IRQ[irq] (ISA) is connected
+ */
+/*
+ * Return the destination pin of the first mp_irqs[] entry that sits on an
+ * ISA, EISA, MCA or NEC98 bus, has interrupt type @type and source bus
+ * IRQ @irq.  Returns -1 when there is no such entry.
+ */
+static int find_isa_irq_pin(int irq, int type)
+{
+       int n;
+
+       for (n = 0; n < mp_irq_entries; n++) {
+               int lbus = mp_irqs[n].mpc_srcbus;
+
+               switch (mp_bus_id_to_type[lbus]) {
+               case MP_BUS_ISA:
+               case MP_BUS_EISA:
+               case MP_BUS_MCA:
+               case MP_BUS_NEC98:
+                       break;
+               default:
+                       continue;
+               }
+
+               if (mp_irqs[n].mpc_irqtype != type)
+                       continue;
+               if (mp_irqs[n].mpc_srcbusirq != irq)
+                       continue;
+
+               return mp_irqs[n].mpc_dstirq;
+       }
+       return -1;
+}
+#endif
+
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+static int pin_2_irq(int idx, int apic, int pin);
+
+/*
+ * Look up the IRQ for PCI device (@bus, @slot, @pin) in the MP table.
+ *
+ * Returns the IRQ of the entry matching bus, slot and pin exactly.  If
+ * only bus and slot match, the IRQ of the first such entry is returned
+ * as a best guess.  Returns -1 when @bus is unknown to the MP table or
+ * nothing matches.
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
+{
+       int apic, i, best_guess = -1;
+
+       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
+               "slot:%d, pin:%d.\n", bus, slot, pin);
+       if (mp_bus_id_to_pci_bus[bus] == -1) {
+               printk(KERN_WARNING
+                      "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+               return -1;
+       }
+       for (i = 0; i < mp_irq_entries; i++) {
+               int lbus = mp_irqs[i].mpc_srcbus;
+
+               /*
+                * Find the IO-APIC this entry is routed to.
+                * NOTE(review): apic is left equal to nr_ioapics when no
+                * IO-APIC matches, and is still passed to pin_2_irq() below.
+                */
+               for (apic = 0; apic < nr_ioapics; apic++)
+                       if (mp_ioapics[apic].mpc_apicid ==
+                                               mp_irqs[i].mpc_dstapic ||
+                           mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
+                               break;
+
+               /* srcbusirq packs the slot in bits 2..6 and the pin in bits 0..1 */
+               if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
+                   !mp_irqs[i].mpc_irqtype &&
+                   (bus == lbus) &&
+                   (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
+                       int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq);
+
+                       if (!(apic || IO_APIC_IRQ(irq)))
+                               continue;
+
+                       if (pin == (mp_irqs[i].mpc_srcbusirq & 3))
+                               return irq;
+                       /*
+                        * Use the first all-but-pin matching entry as a
+                        * best-guess fuzzy result for broken mptables.
+                        */
+                       if (best_guess < 0)
+                               best_guess = irq;
+               }
+       }
+       return best_guess;
+}
+
+#ifndef CONFIG_XEN
+/*
+ * This function currently is only a helper for the i386 smp boot process
+ * where we need to reprogram the ioredtbls to cater for the cpus which have
+ * come online, so mask in all cases should simply be TARGET_CPUS
+ */
+void __init setup_ioapic_dest(void)
+{
+       int ioapic, pin;
+
+       if (skip_ioapic_setup == 1)
+               return;
+
+       /* Set the affinity of every pin that has an mp_INT entry. */
+       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
+               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+                       int irq;
+                       int irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+
+                       if (irq_entry != -1) {
+                               irq = pin_2_irq(irq_entry, ioapic, pin);
+                               set_ioapic_affinity_irq(irq, TARGET_CPUS);
+                       }
+               }
+       }
+}
+#endif /* !CONFIG_XEN */
+
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+/*
+ * Read the ELCR bit for @irq: I/O port 0x4d0 covers IRQs 0-7 and 0x4d1
+ * covers IRQs 8-15, one bit per IRQ.  For irq >= 16 a verbose message is
+ * logged and 0 is returned.
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+       unsigned int port;
+
+       if (irq >= 16) {
+               apic_printk(APIC_VERBOSE, KERN_INFO
+                               "Broken MPtable reports ISA irq %d\n", irq);
+               return 0;
+       }
+
+       port = 0x4d0 + (irq >> 3);
+       return (inb(port) >> (irq & 7)) & 1;
+}
+
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value.  If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx)      (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
+#define default_EISA_polarity(idx)     (0)
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx)       (0)
+#define default_ISA_polarity(idx)      (0)
+
+/* PCI interrupts are always polarity one level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_PCI_trigger(idx)       (1)
+#define default_PCI_polarity(idx)      (1)
+
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx)       (1)
+#define default_MCA_polarity(idx)      (0)
+
+/* NEC98 interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_NEC98_trigger(idx)     (0)
+#define default_NEC98_polarity(idx)    (0)
+
+/*
+ * Determine IRQ line polarity (high active or low active) for MP table
+ * entry @idx from the low two bits of its mpc_irqflag.
+ *
+ * Returns 0 for high active and 1 for low active.  "Conforming" entries
+ * take the default of their source bus type.  A reserved flag value or
+ * an unknown bus type logs "broken BIOS!!" and yields 1.
+ */
+static int __init MPBIOS_polarity(int idx)
+{
+       int bus = mp_irqs[idx].mpc_srcbus;
+
+       switch (mp_irqs[idx].mpc_irqflag & 3) {
+       case 0: /* conforms, ie. bus-type dependent polarity */
+               switch (mp_bus_id_to_type[bus]) {
+               case MP_BUS_ISA: /* ISA pin */
+                       return default_ISA_polarity(idx);
+               case MP_BUS_EISA: /* EISA pin */
+                       return default_EISA_polarity(idx);
+               case MP_BUS_PCI: /* PCI pin */
+                       return default_PCI_polarity(idx);
+               case MP_BUS_MCA: /* MCA pin */
+                       return default_MCA_polarity(idx);
+               case MP_BUS_NEC98: /* NEC 98 pin */
+                       return default_NEC98_polarity(idx);
+               default:
+                       break;
+               }
+               break;
+       case 1: /* high active */
+               return 0;
+       case 3: /* low active */
+               return 1;
+       default: /* 2 is reserved; anything else is invalid */
+               break;
+       }
+
+       /* Every remaining path is a BIOS error and falls back to low active. */
+       printk(KERN_WARNING "broken BIOS!!\n");
+       return 1;
+}
+
+/*
+ * Determine IRQ trigger mode (edge or level sensitive) for MP table
+ * entry @idx from bits 2-3 of its mpc_irqflag.
+ *
+ * Returns 0 for edge and 1 for level.  "Conforming" entries take the
+ * default of their source bus type (for EISA this reads the ELCR).  A
+ * reserved flag value or an unknown bus type logs "broken BIOS!!" and
+ * yields 1.
+ */
+static int MPBIOS_trigger(int idx)
+{
+       int bus = mp_irqs[idx].mpc_srcbus;
+
+       switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) {
+       case 0: /* conforms, ie. bus-type dependent */
+               switch (mp_bus_id_to_type[bus]) {
+               case MP_BUS_ISA: /* ISA pin */
+                       return default_ISA_trigger(idx);
+               case MP_BUS_EISA: /* EISA pin */
+                       return default_EISA_trigger(idx);
+               case MP_BUS_PCI: /* PCI pin */
+                       return default_PCI_trigger(idx);
+               case MP_BUS_MCA: /* MCA pin */
+                       return default_MCA_trigger(idx);
+               case MP_BUS_NEC98: /* NEC 98 pin */
+                       return default_NEC98_trigger(idx);
+               default:
+                       break;
+               }
+               break;
+       case 1: /* edge */
+               return 0;
+       case 2: /* reserved */
+               break;
+       case 3: /* level */
+               return 1;
+       default: /* invalid */
+               printk(KERN_WARNING "broken BIOS!!\n");
+               return 0;
+       }
+
+       /* Unknown bus type or reserved flag value: complain, assume level. */
+       printk(KERN_WARNING "broken BIOS!!\n");
+       return 1;
+}
+
+/* Polarity of MP table entry @idx; thin wrapper around MPBIOS_polarity(). */
+static inline int irq_polarity(int idx)
+{
+       return MPBIOS_polarity(idx);
+}
+
+/* Trigger mode of MP table entry @idx; thin wrapper around MPBIOS_trigger(). */
+static inline int irq_trigger(int idx)
+{
+       return MPBIOS_trigger(idx);
+}
+
+/*
+ * Translate (@apic, @pin), described by MP table entry @idx, to an IRQ
+ * number.
+ *
+ * ISA/EISA/MCA/NEC98 sources use the source bus IRQ directly.  PCI
+ * sources are numbered in order: the pin count of all lower-numbered
+ * IO-APICs plus @pin, optionally remapped by ioapic_renumber_irq.  An
+ * unknown bus type logs an error and yields 0.  Finally, pins 16-23 may
+ * be overridden by a non-zero pirq_entries[] value.
+ */
+static int pin_2_irq(int idx, int apic, int pin)
+{
+       int irq, n;
+       int bus = mp_irqs[idx].mpc_srcbus;
+
+       /*
+        * Debugging check, we are in big trouble if this message pops up!
+        */
+       if (mp_irqs[idx].mpc_dstirq != pin)
+               printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
+
+       switch (mp_bus_id_to_type[bus]) {
+       case MP_BUS_ISA: /* ISA pin */
+       case MP_BUS_EISA:
+       case MP_BUS_MCA:
+       case MP_BUS_NEC98:
+               irq = mp_irqs[idx].mpc_srcbusirq;
+               break;
+
+       case MP_BUS_PCI: /* PCI pin */
+               /*
+                * PCI IRQs are mapped in order
+                */
+               irq = pin;
+               for (n = 0; n < apic; n++)
+                       irq += nr_ioapic_registers[n];
+
+               /*
+                * For MPS mode, so far only needed by ES7000 platform
+                */
+               if (ioapic_renumber_irq)
+                       irq = ioapic_renumber_irq(apic, irq);
+               break;
+
+       default:
+               printk(KERN_ERR "unknown bus type %d.\n",bus); 
+               irq = 0;
+               break;
+       }
+
+       /*
+        * PCI IRQ command line redirection. Yes, limits are hardcoded.
+        */
+       if ((pin >= 16) && (pin <= 23)) {
+               int redirect = pirq_entries[pin-16];
+
+               if (redirect == 0) {
+                       apic_printk(APIC_VERBOSE, KERN_DEBUG
+                                       "disabling PIRQ%d\n", pin-16);
+               } else if (redirect != -1) {
+                       irq = redirect;
+                       apic_printk(APIC_VERBOSE, KERN_DEBUG
+                                       "using PIRQ%d -> IRQ %d\n",
+                                       pin-16, irq);
+               }
+       }
+       return irq;
+}
+
+/*
+ * Return the trigger mode of the first IO-APIC pin that has an mp_INT
+ * entry and maps to @irq, or 0 (edge) when no pin maps to it.
+ */
+static inline int IO_APIC_irq_trigger(int irq)
+{
+       int apic, pin;
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+               for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+                       int idx = find_irq_entry(apic,pin,mp_INT);
+
+                       if (idx == -1)
+                               continue;
+                       if (pin_2_irq(idx,apic,pin) != irq)
+                               continue;
+                       return irq_trigger(idx);
+               }
+       }
+       /*
+        * nonexistent IRQs are edge default
+        */
+       return 0;
+}
+
+/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
+u8 irq_vector[NR_IRQ_VECTORS]; /* = { FIRST_DEVICE_VECTOR , 0 }; */
+
+/*
+ * Return the vector for @irq, asking the hypervisor for one via
+ * PHYSDEVOP_ASSIGN_VECTOR if none is recorded yet.
+ *
+ * A vector already recorded in IO_APIC_VECTOR(irq) is returned as is.
+ * Otherwise the vector handed back by the hypervisor is stored in
+ * vector_irq[] and, unless @irq is AUTO_ASSIGN, in IO_APIC_VECTOR(irq).
+ * Returns -ENOSPC when the hypercall fails.
+ */
+int assign_irq_vector(int irq)
+{
+       static int current_vector = FIRST_DEVICE_VECTOR;
+       physdev_op_t request;
+       int fixed_irq = (irq != AUTO_ASSIGN);
+
+       BUG_ON(irq >= NR_IRQ_VECTORS);
+
+       /* Reuse an existing assignment. */
+       if (fixed_irq && IO_APIC_VECTOR(irq) > 0)
+               return IO_APIC_VECTOR(irq);
+
+       request.cmd = PHYSDEVOP_ASSIGN_VECTOR;
+       request.u.irq_op.irq = irq;
+       if (HYPERVISOR_physdev_op(&request))
+               return -ENOSPC;
+
+       current_vector = request.u.irq_op.vector;
+       vector_irq[current_vector] = irq;
+       if (fixed_irq)
+               IO_APIC_VECTOR(irq) = current_vector;
+
+       return current_vector;
+}
+
+#ifndef CONFIG_XEN
+static struct hw_interrupt_type ioapic_level_type;
+static struct hw_interrupt_type ioapic_edge_type;
+
+#define IOAPIC_AUTO    -1
+#define IOAPIC_EDGE    0
+#define IOAPIC_LEVEL   1
+
+/*
+ * Install the IO-APIC interrupt handler type and the interrupt gate for
+ * @irq / @vector.
+ *
+ * @trigger is IOAPIC_EDGE, IOAPIC_LEVEL, or IOAPIC_AUTO to look the mode
+ * up with IO_APIC_irq_trigger().  With PCI vector indexing in use and a
+ * non-legacy IRQ, irq_desc[] and interrupt[] are indexed by @vector;
+ * otherwise by @irq.
+ */
+static inline void ioapic_register_intr(int irq, int vector,
+                                       unsigned long trigger)
+{
+       int by_vector = use_pci_vector() && !platform_legacy_irq(irq);
+       int level = (trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+                       trigger == IOAPIC_LEVEL;
+       int slot = by_vector ? vector : irq;
+
+       if (level)
+               irq_desc[slot].handler = &ioapic_level_type;
+       else
+               irq_desc[slot].handler = &ioapic_edge_type;
+       set_intr_gate(vector, interrupt[slot]);
+}
+#else
+#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
+#endif
+
+/*
+ * Program a redirection-table entry for every pin of every IO-APIC that
+ * has an mp_INT entry in the MP table.
+ *
+ * For each such pin the trigger mode and polarity come from the MP table,
+ * the pin is recorded with add_pin_to_irq(), a vector is obtained from
+ * assign_irq_vector(), and the entry is written to the IO-APIC.  Pins
+ * with no MP table entry are only listed in a verbose "not connected"
+ * message.
+ */
+static void __init setup_IO_APIC_irqs(void)
+{
+       struct IO_APIC_route_entry entry;
+       int apic, pin, idx, irq, first_notcon = 1, vector;
+       unsigned long flags;
+
+       apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+       for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+
+               /*
+                * add it to the IO-APIC irq-routing table:
+                */
+               memset(&entry,0,sizeof(entry));
+
+               entry.delivery_mode = INT_DELIVERY_MODE;
+               entry.dest_mode = INT_DEST_MODE;
+               entry.mask = 0;                         /* enable IRQ */
+               entry.dest.logical.logical_dest = 
+                                       cpu_mask_to_apicid(TARGET_CPUS);
+
+               idx = find_irq_entry(apic,pin,mp_INT);
+               if (idx == -1) {
+                       /* No MP table entry: add the pin to the "not connected" list. */
+                       if (first_notcon) {
+                               apic_printk(APIC_VERBOSE, KERN_DEBUG
+                                               " IO-APIC (apicid-pin) %d-%d",
+                                               mp_ioapics[apic].mpc_apicid,
+                                               pin);
+                               first_notcon = 0;
+                       } else
+                               apic_printk(APIC_VERBOSE, ", %d-%d",
+                                       mp_ioapics[apic].mpc_apicid, pin);
+                       continue;
+               }
+
+               entry.trigger = irq_trigger(idx);
+               entry.polarity = irq_polarity(idx);
+
+               /* Non-zero trigger (level): the entry starts out masked. */
+               if (irq_trigger(idx)) {
+                       entry.trigger = 1;
+                       entry.mask = 1;
+               }
+
+               irq = pin_2_irq(idx, apic, pin);
+               /*
+                * skip adding the timer int on secondary nodes, which causes
+                * a small but painful rift in the time-space continuum
+                */
+               if (multi_timer_check(apic, irq))
+                       continue;
+               else
+                       add_pin_to_irq(irq, apic, pin);
+
+               if (/*!apic &&*/ !IO_APIC_IRQ(irq))
+                       continue;
+
+               /* Always true here, given the continue just above. */
+               if (IO_APIC_IRQ(irq)) {
+                       vector = assign_irq_vector(irq);
+                       entry.vector = vector;
+                       ioapic_register_intr(irq, vector, IOAPIC_AUTO);
+               
+                       if (!apic && (irq < 16))
+                               disable_8259A_irq(irq);
+               }
+               /* Second 32-bit word (register 0x11+2*pin) is written first. */
+               spin_lock_irqsave(&ioapic_lock, flags);
+               io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
+               io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+       }
+       }
+
+       if (!first_notcon)
+               apic_printk(APIC_VERBOSE, " not connected.\n");
+}
+
+/*
+ * Set up the 8259A-master output pin:
+ */
+#ifndef CONFIG_XEN
+/*
+ * Route @pin of IO-APIC 0 to @vector with trigger 0 and polarity 0, and
+ * make IRQ 0 use the edge handler type.
+ *
+ * 8259A IRQ 0 is disabled and LVT0 is masked before the entry is written;
+ * 8259A IRQ 0 is enabled again afterwards.
+ */
+static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
+{
+       struct IO_APIC_route_entry entry;
+       unsigned long flags;
+
+       memset(&entry,0,sizeof(entry));
+
+       disable_8259A_irq(0);
+
+       /* mask LVT0 */
+       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+
+       /*
+        * We use logical delivery to get the timer IRQ
+        * to the first CPU.
+        */
+       entry.dest_mode = INT_DEST_MODE;
+       entry.mask = 0;                                 /* unmask IRQ now */
+       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+       entry.delivery_mode = INT_DELIVERY_MODE;
+       entry.polarity = 0;
+       entry.trigger = 0;
+       entry.vector = vector;
+
+       /*
+        * The timer IRQ doesn't have to know that behind the
+        * scene we have a 8259A-master in AEOI mode ...
+        */
+       irq_desc[0].handler = &ioapic_edge_type;
+
+       /*
+        * Add it to the IO-APIC irq-routing table:
+        */
+       spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+       io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       enable_8259A_irq(0);
+}
+
+/*
+ * Called by print_IO_APIC() when a register holds a value it does not
+ * expect.  The body is intentionally empty, so the call is a no-op.
+ */
+static inline void UNEXPECTED_IO_APIC(void)
+{
+}
+
+/*
+ * Dump the state of every IO-APIC to the kernel log.
+ *
+ * Prints nothing when apic_verbosity is APIC_QUIET.  For each IO-APIC it
+ * prints registers 0 and 1, register 2 when the version is >= 0x10,
+ * register 3 when the version is >= 0x20, and then the full redirection
+ * table.  It finishes with the IRQ -> (apic:pin) mappings held in
+ * irq_2_pin[].
+ */
+void __init print_IO_APIC(void)
+{
+       int apic, i;
+       union IO_APIC_reg_00 reg_00;
+       union IO_APIC_reg_01 reg_01;
+       union IO_APIC_reg_02 reg_02;
+       union IO_APIC_reg_03 reg_03;
+       unsigned long flags;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
+       for (i = 0; i < nr_ioapics; i++)
+               printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
+                      mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+
+       /*
+        * We are a bit conservative about what we expect.  We have to
+        * know about every hardware change ASAP.
+        */
+       printk(KERN_INFO "testing the IO APIC.......................\n");
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+
+       /* reg_02/reg_03 are only read, and only used below, on new enough versions */
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(apic, 0);
+       reg_01.raw = io_apic_read(apic, 1);
+       if (reg_01.bits.version >= 0x10)
+               reg_02.raw = io_apic_read(apic, 2);
+       if (reg_01.bits.version >= 0x20)
+               reg_03.raw = io_apic_read(apic, 3);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
+       printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
+       printk(KERN_DEBUG ".......    : physical APIC id: %02X\n",
+               reg_00.bits.ID);
+       printk(KERN_DEBUG ".......    : Delivery Type: %X\n",
+               reg_00.bits.delivery_type);
+       printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
+       if (reg_00.bits.ID >= get_physical_broadcast())
+               UNEXPECTED_IO_APIC();
+       if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2)
+               UNEXPECTED_IO_APIC();
+
+       printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
+       printk(KERN_DEBUG ".......     : max redirection entries: %04X\n",
+               reg_01.bits.entries);
+       if (    (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */
+               (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */
+               (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */
+               (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */
+               (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */
+               (reg_01.bits.entries != 0x2E) &&
+               (reg_01.bits.entries != 0x3F)
+       )
+               UNEXPECTED_IO_APIC();
+
+       printk(KERN_DEBUG ".......     : PRQ implemented: %X\n",
+               reg_01.bits.PRQ);
+       printk(KERN_DEBUG ".......     : IO APIC version: %04X\n",
+               reg_01.bits.version);
+       if (    (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */
+               (reg_01.bits.version != 0x10) && /* oldest IO-APICs */
+               (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */
+               (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */
+               (reg_01.bits.version != 0x20)    /* Intel P64H (82806 AA) */
+       )
+               UNEXPECTED_IO_APIC();
+       if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2)
+               UNEXPECTED_IO_APIC();
+
+       /*
+        * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+        * but the value of reg_02 is read as the previous read register
+        * value, so ignore it if reg_02 == reg_01.
+        */
+       if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
+               printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
+               printk(KERN_DEBUG ".......     : arbitration: %02X\n",
+                       reg_02.bits.arbitration);
+               if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2)
+                       UNEXPECTED_IO_APIC();
+       }
+
+       /*
+        * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+        * or reg_03, but the value of reg_0[23] is read as the previous read
+        * register value, so ignore it if reg_03 == reg_0[12].
+        */
+       if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+           reg_03.raw != reg_01.raw) {
+               printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+               printk(KERN_DEBUG ".......     : Boot DT    : %X\n",
+                       reg_03.bits.boot_DT);
+               if (reg_03.bits.__reserved_1)
+                       UNEXPECTED_IO_APIC();
+       }
+
+       printk(KERN_DEBUG ".... IRQ redirection table:\n");
+
+       printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
+                         " Stat Dest Deli Vect:   \n");
+
+       /* reg_01.bits.entries is the highest entry index, hence <= */
+       for (i = 0; i <= reg_01.bits.entries; i++) {
+               struct IO_APIC_route_entry entry;
+
+               spin_lock_irqsave(&ioapic_lock, flags);
+               *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
+               *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+
+               printk(KERN_DEBUG " %02x %03X %02X  ",
+                       i,
+                       entry.dest.logical.logical_dest,
+                       entry.dest.physical.physical_dest
+               );
+
+               printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
+                       entry.mask,
+                       entry.trigger,
+                       entry.irr,
+                       entry.polarity,
+                       entry.delivery_status,
+                       entry.dest_mode,
+                       entry.delivery_mode,
+                       entry.vector
+               );
+       }
+       }
+       if (use_pci_vector())
+               printk(KERN_INFO "Using vector-based indexing\n");
+       printk(KERN_DEBUG "IRQ to pin mappings:\n");
+       for (i = 0; i < NR_IRQS; i++) {
+               struct irq_pin_list *entry = irq_2_pin + i;
+               if (entry->pin < 0)
+                       continue;
+               if (use_pci_vector() && !platform_legacy_irq(i))
+                       printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i));
+               else
+                       printk(KERN_DEBUG "IRQ%d ", i);
+               /* follow the chain of pins; a next index of 0 ends it */
+               for (;;) {
+                       printk("-> %d:%d", entry->apic, entry->pin);
+                       if (!entry->next)
+                               break;
+                       entry = irq_2_pin + entry->next;
+               }
+               printk("\n");
+       }
+
+       printk(KERN_INFO ".................................... done.\n");
+
+       return;
+}
+
+/*
+ * Dump eight consecutive 32-bit local APIC registers, starting at 'base'
+ * and spaced 0x10 apart, as rows of '0'/'1' characters (bit 0 printed
+ * first).  Does nothing when apic_verbosity is APIC_QUIET.
+ */
+static void print_APIC_bitfield (int base)
+{
+       unsigned int v;
+       int i, j;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       /* Column ruler; the trailing KERN_DEBUG tags the first data row. */
+       printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
+       for (i = 0; i < 8; i++) {
+               v = apic_read(base + i*0x10);
+               for (j = 0; j < 32; j++) {
+                       /* 1U: shifting a signed 1 into bit 31 is undefined */
+                       if (v & (1U<<j))
+                               printk("1");
+                       else
+                               printk("0");
+               }
+               printk("\n");
+       }
+}
+
+/*
+ * Dump the local APIC registers of the CPU this runs on via printk().
+ * The 'dummy' argument is never used by the body.  Returns immediately
+ * when apic_verbosity is APIC_QUIET.
+ */
+void /*__init*/ print_local_APIC(void * dummy)
+{
+       unsigned int v, ver, maxlvt;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
+               smp_processor_id(), hard_smp_processor_id());
+       v = apic_read(APIC_ID);
+       printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(v));
+       v = apic_read(APIC_LVR);
+       printk(KERN_INFO "... APIC VERSION: %08x\n", v);
+       /* ver and maxlvt gate which optional registers are dumped below. */
+       ver = GET_APIC_VERSION(v);
+       maxlvt = get_maxlvt();
+
+       v = apic_read(APIC_TASKPRI);
+       printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+       if (APIC_INTEGRATED(ver)) {                     /* !82489DX */
+               v = apic_read(APIC_ARBPRI);
+               printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+                       v & APIC_ARBPRI_MASK);
+               v = apic_read(APIC_PROCPRI);
+               printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+       }
+
+       v = apic_read(APIC_EOI);
+       printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+       v = apic_read(APIC_RRR);
+       printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+       v = apic_read(APIC_LDR);
+       printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
+       v = apic_read(APIC_DFR);
+       printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+       v = apic_read(APIC_SPIV);
+       printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
+
+       /* ISR/TMR/IRR are each printed as eight rows of 32 bits. */
+       printk(KERN_DEBUG "... APIC ISR field:\n");
+       print_APIC_bitfield(APIC_ISR);
+       printk(KERN_DEBUG "... APIC TMR field:\n");
+       print_APIC_bitfield(APIC_TMR);
+       printk(KERN_DEBUG "... APIC IRR field:\n");
+       print_APIC_bitfield(APIC_IRR);
+
+       if (APIC_INTEGRATED(ver)) {             /* !82489DX */
+               if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
+                       apic_write(APIC_ESR, 0);
+               v = apic_read(APIC_ESR);
+               printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+       }
+
+       v = apic_read(APIC_ICR);
+       printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
+       v = apic_read(APIC_ICR2);
+       printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+
+       v = apic_read(APIC_LVTT);
+       printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
+
+       if (maxlvt > 3) {                       /* PC is LVT#4. */
+               v = apic_read(APIC_LVTPC);
+               printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
+       }
+       v = apic_read(APIC_LVT0);
+       printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
+       v = apic_read(APIC_LVT1);
+       printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
+
+       if (maxlvt > 2) {                       /* ERR is LVT#3. */
+               v = apic_read(APIC_LVTERR);
+               printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
+       }
+
+       v = apic_read(APIC_TMICT);
+       printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
+       v = apic_read(APIC_TMCCT);
+       printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
+       v = apic_read(APIC_TDCR);
+       printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+       printk("\n");
+}
+
+/*
+ * Invoke print_local_APIC() through on_each_cpu() with a NULL argument.
+ * NOTE(review): the two trailing '1' arguments are presumably the
+ * retry/wait flags of on_each_cpu() - confirm against its prototype.
+ */
+void print_all_local_APICs (void)
+{
+       on_each_cpu(print_local_APIC, NULL, 1, 1);
+}
+
+/*
+ * Dump the legacy PIC state via printk(): the values labelled IMR, IRR,
+ * ISR and ELCR below.  Each 16-bit value combines a read from the 0xa0
+ * range (high byte) with one from the 0x20 range (low byte).  Returns
+ * immediately when apic_verbosity is APIC_QUIET.
+ */
+void /*__init*/ print_PIC(void)
+{
+       extern spinlock_t i8259A_lock;
+       unsigned int v;
+       unsigned long flags;
+
+       if (apic_verbosity == APIC_QUIET)
+               return;
+
+       printk(KERN_DEBUG "\nprinting PIC contents\n");
+
+       /* Port accesses up to the ISR read are done under i8259A_lock. */
+       spin_lock_irqsave(&i8259A_lock, flags);
+
+       v = inb(0xa1) << 8 | inb(0x21);
+       printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
+
+       v = inb(0xa0) << 8 | inb(0x20);
+       printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
+
+       /*
+        * Write 0x0b to both command ports, read the value reported as
+        * ISR, then write 0x0a to both command ports again.
+        */
+       outb(0x0b,0xa0);
+       outb(0x0b,0x20);
+       v = inb(0xa0) << 8 | inb(0x20);
+       outb(0x0a,0xa0);
+       outb(0x0a,0x20);
+
+       spin_unlock_irqrestore(&i8259A_lock, flags);
+
+       printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
+
+       /* ELCR is read from ports 0x4d1/0x4d0 after the lock is dropped. */
+       v = inb(0x4d1) << 8 | inb(0x4d0);
+       printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
+}
+#else
+/* Empty stub used when the dumping code above is compiled out. */
+void __init print_IO_APIC(void) { }
+#endif /* !CONFIG_XEN */
+
+/*
+ * Reset the IRQ-to-pin bookkeeping, record how many redirection
+ * registers each IO-APIC reports, and finally clear the IO-APICs via
+ * clear_IO_APIC().
+ */
+static void __init enable_IO_APIC(void)
+{
+       union IO_APIC_reg_01 reg_01;
+       int i;
+       unsigned long flags;
+
+       /* Start with every irq_2_pin slot at pin -1 and no 'next' link. */
+       for (i = 0; i < PIN_MAP_SIZE; i++) {
+               irq_2_pin[i].pin = -1;
+               irq_2_pin[i].next = 0;
+       }
+       /* Reset pirq_entries to -1 unless pirqs_enabled is already set. */
+       if (!pirqs_enabled)
+               for (i = 0; i < MAX_PIRQS; i++)
+                       pirq_entries[i] = -1;
+
+       /*
+        * The number of IO-APIC IRQ registers (== #pins):
+        */
+       for (i = 0; i < nr_ioapics; i++) {
+               spin_lock_irqsave(&ioapic_lock, flags);
+               reg_01.raw = io_apic_read(i, 1);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+               /* 'entries' is stored as count-1, hence the +1. */
+               nr_ioapic_registers[i] = reg_01.bits.entries+1;
+       }
+
+       /*
+        * Do not trust the IO-APIC being empty at bootup
+        */
+       clear_IO_APIC();
+}
+
+/*
+ * Clear the IO-APIC and, on non-Xen builds, call disconnect_bsp_APIC().
+ *
+ * Not an __init, needed by the reboot code
+ */
+void disable_IO_APIC(void)
+{
+       /*
+        * Clear the IO-APIC before rebooting:
+        */
+       clear_IO_APIC();
+
+       /* disconnect_bsp_APIC() is compiled out when CONFIG_XEN is set. */
+#ifndef CONFIG_XEN
+       disconnect_bsp_APIC();
+#endif
+}
+
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@xxxxxxxx>  Tue Dec 21 12:25:05 CST 1999
+ */
+
+#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
+/*
+ * For every IO-APIC: validate the APIC ID recorded in mp_ioapics[],
+ * pick a replacement if it is out of range or already in use, patch
+ * mp_irqs[] when the ID changed, then write the ID into register 0 and
+ * read it back to verify.
+ */
+static void __init setup_ioapic_ids_from_mpc(void)
+{
+       union IO_APIC_reg_00 reg_00;
+       physid_mask_t phys_id_present_map;
+       int apic;
+       int i;
+       unsigned char old_id;
+       unsigned long flags;
+
+       /*
+        * This is broken; anything with a real cpu count has to
+        * circumvent this idiocy regardless.
+        */
+       phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+       /*
+        * Set the IOAPIC ID to the value stored in the MPC table.
+        */
+       for (apic = 0; apic < nr_ioapics; apic++) {
+
+               /* Read the register 0 value */
+               spin_lock_irqsave(&ioapic_lock, flags);
+               reg_00.raw = io_apic_read(apic, 0);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+
+               /* Remember the table's ID so mp_irqs[] can be patched later. */
+               old_id = mp_ioapics[apic].mpc_apicid;
+
+               /* Out-of-range table ID: fall back to the hardware's ID. */
+               if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) {
+                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+                               apic, mp_ioapics[apic].mpc_apicid);
+                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+                               reg_00.bits.ID);
+                       mp_ioapics[apic].mpc_apicid = reg_00.bits.ID;
+               }
+
+               /* Don't check I/O APIC IDs for some xAPIC systems.  They have
+                * no meaning without the serial APIC bus. */
+               if (NO_IOAPIC_CHECK)
+                       continue;
+               /*
+                * Sanity check, is the ID really free? Every APIC in a
+                * system must have a unique ID or we get lots of nice
+                * 'stuck on smp_invalidate_needed IPI wait' messages.
+                */
+               if (check_apicid_used(phys_id_present_map,
+                                       mp_ioapics[apic].mpc_apicid)) {
+                       printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+                               apic, mp_ioapics[apic].mpc_apicid);
+                       /* Take the lowest ID not yet in the present map. */
+                       for (i = 0; i < get_physical_broadcast(); i++)
+                               if (!physid_isset(i, phys_id_present_map))
+                                       break;
+                       if (i >= get_physical_broadcast())
+                               panic("Max APIC ID exceeded!\n");
+                       printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+                               i);
+                       physid_set(i, phys_id_present_map);
+                       mp_ioapics[apic].mpc_apicid = i;
+               } else {
+                       physid_mask_t tmp;
+                       tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid);
+                       apic_printk(APIC_VERBOSE, "Setting %d in the "
+                                       "phys_id_present_map\n",
+                                       mp_ioapics[apic].mpc_apicid);
+                       physids_or(phys_id_present_map, phys_id_present_map, tmp);
+               }
+
+
+               /*
+                * We need to adjust the IRQ routing table
+                * if the ID changed.
+                */
+               if (old_id != mp_ioapics[apic].mpc_apicid)
+                       for (i = 0; i < mp_irq_entries; i++)
+                               if (mp_irqs[i].mpc_dstapic == old_id)
+                                       mp_irqs[i].mpc_dstapic
+                                               = mp_ioapics[apic].mpc_apicid;
+
+               /*
+                * Read the right value from the MPC table and
+                * write it into the ID register.
+                */
+               apic_printk(APIC_VERBOSE, KERN_INFO
+                       "...changing IO-APIC physical APIC ID to %d ...",
+                       mp_ioapics[apic].mpc_apicid);
+
+               reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
+               spin_lock_irqsave(&ioapic_lock, flags);
+               io_apic_write(apic, 0, reg_00.raw);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+
+               /*
+                * Sanity check
+                */
+               spin_lock_irqsave(&ioapic_lock, flags);
+               reg_00.raw = io_apic_read(apic, 0);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+               if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
+                       printk("could not set ID!\n");
+               else
+                       apic_printk(APIC_VERBOSE, " ok.\n");
+       }
+}
+#else
+/* No-op variant used when CONFIG_XEN or CONFIG_X86_NUMAQ is defined. */
+static void __init setup_ioapic_ids_from_mpc(void) { }
+#endif
+
+#ifndef CONFIG_XEN
+/*
+ * There is a nasty bug in some older SMP boards, their mptable lies
+ * about the timer IRQ. We do the following to work around the situation:
+ *
+ *     - timer IRQ defaults to IO-APIC IRQ
+ *     - if this function detects that timer IRQs are defunct, then we fall
+ *       back to ISA timer IRQs
+ */
+static int __init timer_irq_works(void)
+{
+       const unsigned long start = jiffies;
+
+       local_irq_enable();
+
+       /* Busy-wait for the length of ten timer ticks. */
+       mdelay((10 * 1000) / HZ);
+
+       /*
+        * Demand more than four counted ticks: glue logic may lock up
+        * after the first one or two ticks in a non-ExtINT mode, the
+        * local APIC may have cached one ExtINT interrupt, and at
+        * least one tick may have been lost to delays.
+        */
+       return (jiffies - start > 4) ? 1 : 0;
+}
+
+/*
+ * In the SMP+IOAPIC case it might happen that there are an unspecified
+ * number of pending IRQ events unhandled. These cases are very rare,
+ * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
+ * better to do it this way as thus we do not have to be aware of
+ * 'pending' interrupts in the IRQ path, except at this point.
+ */
+/*
+ * Edge triggered needs to resend any interrupt
+ * that was delayed but this is now handled in the device
+ * independent code.
+ */
+
+/*
+ * Starting up an edge-triggered IO-APIC interrupt is
+ * nasty - we need to make sure that we get the edge.
+ * If it is already asserted for some reason, we need
+ * to return 1 to indicate that it was pending.
+ *
+ * This is not complete - we should be able to fake
+ * an edge even if it isn't on the 8259A...
+ */
+/* Returns 1 if the IRQ was found pending on the 8259A, otherwise 0. */
+static unsigned int startup_edge_ioapic_irq(unsigned int irq)
+{
+       int was_pending = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       /* Only IRQs below 16 are checked against the 8259A. */
+       if (irq < 16) {
+               disable_8259A_irq(irq);
+               if (i8259A_irq_pending(irq))
+                       was_pending = 1;
+       }
+       __unmask_IO_APIC_irq(irq);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return was_pending;
+}
+
+/*
+ * Once we have recorded IRQ_PENDING already, we can mask the
+ * interrupt for real. This prevents IRQ storms from unhandled
+ * devices.
+ */
+static void ack_edge_ioapic_irq(unsigned int irq)
+{
+       move_irq(irq);
+       /* Mask only when both IRQ_PENDING and IRQ_DISABLED are set. */
+       if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
+                                       == (IRQ_PENDING | IRQ_DISABLED))
+               mask_IO_APIC_irq(irq);
+       /* The local APIC is acknowledged in every case. */
+       ack_APIC_irq();
+}
+
+/*
+ * Level triggered interrupts can just be masked,
+ * and shutting down and starting up the interrupt
+ * is the same as enabling and disabling them -- except
+ * with a startup need to return a "was pending" value.
+ *
+ * Level triggered interrupts are special because we
+ * do not touch any IO-APIC register while handling
+ * them. We ack the APIC in the end-IRQ handler, not
+ * in the start-IRQ-handler. Protection against reentrance
+ * from the same interrupt is still provided, both by the
+ * generic IRQ layer and by the fact that an unacked local
+ * APIC does not accept IRQs.
+ */
+/* Unmask the IRQ in the IO-APIC; always reports "not pending" (0). */
+static unsigned int startup_level_ioapic_irq (unsigned int irq)
+{
+       unmask_IO_APIC_irq(irq);
+
+       return 0; /* don't check for pending */
+}
+
+/*
+ * End-of-interrupt handling for a level-triggered IO-APIC IRQ: sample
+ * the TMR bit for the IRQ's vector, acknowledge the local APIC, and
+ * apply the edge/level toggle workaround described below when the TMR
+ * bit was found clear.
+ */
+static void end_level_ioapic_irq (unsigned int irq)
+{
+       unsigned long v;
+       int i;
+
+       move_irq(irq);
+/*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets).  Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless.  As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source.  The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually.  We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt.  We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul.  --macro
+ */
+       i = IO_APIC_VECTOR(irq);
+
+       /* (i & ~0x1f) >> 1 == (i / 32) * 0x10: the TMR word for vector i. */
+       v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+
+       ack_APIC_irq();
+
+       /* 1UL: avoid shifting a signed 1 into bit 31 before widening. */
+       if (!(v & (1UL << (i & 0x1f)))) {
+               atomic_inc(&irq_mis_count);
+               spin_lock(&ioapic_lock);
+               __mask_and_edge_IO_APIC_irq(irq);
+               __unmask_and_level_IO_APIC_irq(irq);
+               spin_unlock(&ioapic_lock);
+       }
+}
+
+#ifdef CONFIG_PCI_MSI
+/* Vector-indexed front end: map the vector to its IRQ and start it up. */
+static unsigned int startup_edge_ioapic_vector(unsigned int vector)
+{
+       return startup_edge_ioapic_irq(vector_to_irq(vector));
+}
+
+/* Vector-indexed front end: map the vector to its IRQ and ack it. */
+static void ack_edge_ioapic_vector(unsigned int vector)
+{
+       ack_edge_ioapic_irq(vector_to_irq(vector));
+}
+
+/* Vector-indexed front end for startup_level_ioapic_irq(). */
+static unsigned int startup_level_ioapic_vector (unsigned int vector)
+{
+       return startup_level_ioapic_irq(vector_to_irq(vector));
+}
+
+/* Vector-indexed front end for end_level_ioapic_irq(). */
+static void end_level_ioapic_vector (unsigned int vector)
+{
+       end_level_ioapic_irq(vector_to_irq(vector));
+}
+
+/* Vector-indexed front end for mask_IO_APIC_irq(). */
+static void mask_IO_APIC_vector (unsigned int vector)
+{
+       mask_IO_APIC_irq(vector_to_irq(vector));
+}
+
+/* Vector-indexed front end for unmask_IO_APIC_irq(). */
+static void unmask_IO_APIC_vector (unsigned int vector)
+{
+       unmask_IO_APIC_irq(vector_to_irq(vector));
+}
+
+/* Vector-indexed front end for set_ioapic_affinity_irq(). */
+static void set_ioapic_affinity_vector (unsigned int vector,
+                                       cpumask_t cpu_mask)
+{
+       set_ioapic_affinity_irq(vector_to_irq(vector), cpu_mask);
+}
+#endif
+
+/*
+ * Level and edge triggered IO-APIC interrupts need different handling,
+ * so we use two separate IRQ descriptors. Edge triggered IRQs can be
+ * handled with the level-triggered descriptor, but that one has slightly
+ * more overhead. Level-triggered interrupts cannot be handled with the
+ * edge-triggered handler, without risking IRQ storms and other ugly
+ * races.
+ */
+/*
+ * Handler table for edge-triggered IO-APIC interrupts.
+ * NOTE(review): the *_edge_ioapic names are not defined in this part of
+ * the file; presumably they resolve to the _irq or _vector variants via
+ * a header - confirm.
+ */
+static struct hw_interrupt_type ioapic_edge_type = {
+       .typename       = "IO-APIC-edge",
+       .startup        = startup_edge_ioapic,
+       .shutdown       = shutdown_edge_ioapic,
+       .enable         = enable_edge_ioapic,
+       .disable        = disable_edge_ioapic,
+       .ack            = ack_edge_ioapic,
+       .end            = end_edge_ioapic,
+       .set_affinity   = set_ioapic_affinity,
+};
+
+/*
+ * Handler table for level-triggered IO-APIC interrupts.
+ * NOTE(review): the *_level_ioapic names are not defined in this part
+ * of the file; presumably they resolve to the _irq or _vector variants
+ * via a header - confirm.
+ */
+static struct hw_interrupt_type ioapic_level_type = {
+       .typename       = "IO-APIC-level",
+       .startup        = startup_level_ioapic,
+       .shutdown       = shutdown_level_ioapic,
+       .enable         = enable_level_ioapic,
+       .disable        = disable_level_ioapic,
+       .ack            = mask_and_ack_level_ioapic,
+       .end            = end_level_ioapic,
+       .set_affinity   = set_ioapic_affinity,
+};
+#endif /* !CONFIG_XEN */
+
+/*
+ * Walk all NR_IRQS interrupt numbers and, for each one that is flagged
+ * as an IO-APIC IRQ but has no vector assigned, fall back to the 8259A
+ * (numbers below 16) or, on non-Xen builds, to no_irq_type.
+ */
+static inline void init_IO_APIC_traps(void)
+{
+       int irq;
+
+       /*
+        * NOTE! The local APIC isn't very good at handling
+        * multiple interrupts at the same interrupt level.
+        * As the interrupt level is determined by taking the
+        * vector number and shifting that right by 4, we
+        * want to spread these out a bit so that they don't
+        * all fall in the same interrupt level.
+        *
+        * Also, we've got to be careful not to trash gate
+        * 0x80, because int 0x80 is hm, kind of importantish. ;)
+        */
+       for (irq = 0; irq < NR_IRQS ; irq++) {
+               int tmp = irq;
+               /*
+                * With PCI vectors in use, non-legacy numbers are
+                * translated through vector_to_irq(); -1 means skip.
+                */
+               if (use_pci_vector()) {
+                       if (!platform_legacy_irq(tmp))
+                               if ((tmp = vector_to_irq(tmp)) == -1)
+                                       continue;
+               }
+               if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) {
+                       /*
+                        * Hmm.. We don't have an entry for this,
+                        * so default to an old-fashioned 8259
+                        * interrupt if we can..
+                        */
+                       if (irq < 16)
+                               make_8259A_irq(irq);
+#ifndef CONFIG_XEN
+                       else
+                               /* Strange. Oh, well.. */
+                               irq_desc[irq].handler = &no_irq_type;
+#endif
+               }
+       }
+}
+
+#ifndef CONFIG_XEN
+/* Clear APIC_LVT_MASKED in LVT0; the 'irq' argument is not used. */
+static void enable_lapic_irq (unsigned int irq)
+{
+       const unsigned long lvt0 = apic_read(APIC_LVT0);
+
+       apic_write_around(APIC_LVT0, lvt0 & ~APIC_LVT_MASKED);
+}
+
+/* Set APIC_LVT_MASKED in LVT0; the 'irq' argument is not used. */
+static void disable_lapic_irq (unsigned int irq)
+{
+       const unsigned long lvt0 = apic_read(APIC_LVT0);
+
+       apic_write_around(APIC_LVT0, lvt0 | APIC_LVT_MASKED);
+}
+
+/* Acknowledge the local APIC; the 'irq' argument is not used. */
+static void ack_lapic_irq (unsigned int irq)
+{
+       ack_APIC_irq();
+}
+
+/* Intentionally empty .end hook for lapic_irq_type. */
+static void end_lapic_irq (unsigned int i) { /* nothing */ }
+
+/*
+ * Handler table built from the four lapic helpers above; startup and
+ * shutdown are left NULL on purpose (see the per-field notes).
+ */
+static struct hw_interrupt_type lapic_irq_type = {
+       .typename       = "local-APIC-edge",
+       .startup        = NULL, /* startup_irq() not used for IRQ0 */
+       .shutdown       = NULL, /* shutdown_irq() not used for IRQ0 */
+       .enable         = enable_lapic_irq,
+       .disable        = disable_lapic_irq,
+       .ack            = ack_lapic_irq,
+       .end            = end_lapic_irq
+};
+
+/*
+ * Run enable_NMI_through_LVT0 via on_each_cpu(), bracketed by verbose
+ * progress messages.
+ */
+static void setup_nmi (void)
+{
+       /*
+        * Dirty trick to enable the NMI watchdog ...
+        * We put the 8259A master into AEOI mode and
+        * unmask on all local APICs LVT0 as NMI.
+        *
+        * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
+        * is from Maciej W. Rozycki - so we do not have to EOI from
+        * the NMI handler or the timer interrupt.
+        */ 
+       apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
+
+       on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1);
+
+       apic_printk(APIC_VERBOSE, " done.\n");
+}
+
+/*
+ * This looks a bit hackish but it's about the only one way of sending
+ * a few INTA cycles to 8259As and any associated glue logic.  ICR does
+ * not support the ExtINT mode, unfortunately.  We need to send these
+ * cycles as some i82489DX-based boards have glue logic that keeps the
+ * 8259A interrupt line asserted until INTA.  --macro
+ */
+/*
+ * Temporarily reprogram the IO-APIC #0 pin of ISA IRQ 8 as an ExtINT
+ * entry aimed at this CPU, run the RTC periodic interrupt for a while,
+ * then restore both the RTC registers and the original pin entry.
+ * Returns early if no such pin is found.
+ */
+static inline void unlock_ExtINT_logic(void)
+{
+       int pin, i;
+       struct IO_APIC_route_entry entry0, entry1;
+       unsigned char save_control, save_freq_select;
+       unsigned long flags;
+
+       pin = find_isa_irq_pin(8, mp_INT);
+       if (pin == -1)
+               return;
+
+       /* Save the current redirection entry (both 32-bit halves). */
+       spin_lock_irqsave(&ioapic_lock, flags);
+       *(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
+       *(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+       clear_IO_APIC_pin(0, pin);
+
+       /* Build the temporary ExtINT entry; only polarity is inherited. */
+       memset(&entry1, 0, sizeof(entry1));
+
+       entry1.dest_mode = 0;                   /* physical delivery */
+       entry1.mask = 0;                        /* unmask IRQ now */
+       entry1.dest.physical.physical_dest = hard_smp_processor_id();
+       entry1.delivery_mode = dest_ExtINT;
+       entry1.polarity = entry0.polarity;
+       entry1.trigger = 0;
+       entry1.vector = 0;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
+       io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       /* Save the RTC settings, then select rate 0x6 and set RTC_PIE. */
+       save_control = CMOS_READ(RTC_CONTROL);
+       save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
+       CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
+                  RTC_FREQ_SELECT);
+       CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
+
+       /*
+        * Poll every 10ms, at most 100 times; each poll that sees
+        * RTC_PF set takes an extra 10 off the counter.
+        */
+       i = 100;
+       while (i-- > 0) {
+               mdelay(10);
+               if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
+                       i -= 10;
+       }
+
+       /* Restore the RTC, then the original redirection entry. */
+       CMOS_WRITE(save_control, RTC_CONTROL);
+       CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
+       clear_IO_APIC_pin(0, pin);
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
+       io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+/*
+ * This code may look a bit paranoid, but it's supposed to cooperate with
+ * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
+ * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
+ * fanatically on his truly buggy board.
+ */
+/*
+ * Find a working route for the timer interrupt (IRQ0).  Tries, in
+ * order: the IO-APIC pin (pin1), the 8259A through an ExtINT pin
+ * (pin2), a local-APIC "Virtual Wire" setup, and finally plain ExtINT
+ * mode.  Returns as soon as timer_irq_works() succeeds; panics if none
+ * of the four attempts works.
+ */
+static inline void check_timer(void)
+{
+       int pin1, pin2;
+       int vector;
+
+       /*
+        * get/set the timer IRQ vector:
+        */
+       disable_8259A_irq(0);
+       vector = assign_irq_vector(0);
+       set_intr_gate(vector, interrupt[0]);
+
+       /*
+        * Subtle, code in do_timer_interrupt() expects an AEOI
+        * mode for the 8259A whenever interrupts are routed
+        * through I/O APICs.  Also IRQ0 has to be enabled in
+        * the 8259A which implies the virtual wire has to be
+        * disabled in the local APIC.
+        */
+       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
+       init_8259A(1);
+       timer_ack = 1;
+       enable_8259A_irq(0);
+
+       pin1 = find_isa_irq_pin(0, mp_INT);
+       pin2 = find_isa_irq_pin(0, mp_ExtINT);
+
+       printk(KERN_INFO "..TIMER: vector=0x%02X pin1=%d pin2=%d\n", vector, pin1, pin2);
+
+       /* Attempt 1: IRQ0 delivered through the IO-APIC pin. */
+       if (pin1 != -1) {
+               /*
+                * Ok, does IRQ0 through the IOAPIC work?
+                */
+               unmask_IO_APIC_irq(0);
+               if (timer_irq_works()) {
+                       if (nmi_watchdog == NMI_IO_APIC) {
+                               disable_8259A_irq(0);
+                               setup_nmi();
+                               enable_8259A_irq(0);
+                       }
+                       return;
+               }
+               clear_IO_APIC_pin(0, pin1);
+               printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
+       }
+
+       /* Attempt 2: IRQ0 through the 8259A via the ExtINT pin. */
+       printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... ");
+       if (pin2 != -1) {
+               printk("\n..... (found pin %d) ...", pin2);
+               /*
+                * legacy devices should be connected to IO APIC #0
+                */
+               setup_ExtINT_IRQ0_pin(pin2, vector);
+               if (timer_irq_works()) {
+                       printk("works.\n");
+                       if (pin1 != -1)
+                               replace_pin_at_irq(0, 0, pin1, 0, pin2);
+                       else
+                               add_pin_to_irq(0, 0, pin2);
+                       if (nmi_watchdog == NMI_IO_APIC) {
+                               setup_nmi();
+                       }
+                       return;
+               }
+               /*
+                * Cleanup, just in case ...
+                */
+               clear_IO_APIC_pin(0, pin2);
+       }
+       printk(" failed.\n");
+
+       if (nmi_watchdog == NMI_IO_APIC) {
+               printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+               nmi_watchdog = 0;
+       }
+
+       /* Attempt 3: local APIC LVT0 in fixed mode ("Virtual Wire"). */
+       printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+
+       disable_8259A_irq(0);
+       irq_desc[0].handler = &lapic_irq_type;
+       apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);   /* Fixed mode */
+       enable_8259A_irq(0);
+
+       if (timer_irq_works()) {
+               printk(" works.\n");
+               return;
+       }
+       apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
+       printk(" failed.\n");
+
+       /* Attempt 4: plain ExtINT mode with the 8259A re-initialised. */
+       printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+
+       timer_ack = 0;
+       init_8259A(0);
+       make_8259A_irq(0);
+       apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+
+       unlock_ExtINT_logic();
+
+       if (timer_irq_works()) {
+               printk(" works.\n");
+               return;
+       }
+       printk(" failed :(.\n");
+       panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
+               "report.  Then try booting with the 'noapic' option");
+}
+#else
+#define check_timer() ((void)0)
+#endif
+
+/*
+ *
+ * IRQ's that are handled by the PIC in the MPS IOAPIC case.
+ * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ.
+ *   Linux doesn't really care, as it's not actually used
+ *   for any interrupt handling anyway.
+ */
+#define PIC_IRQS       (1 << PIC_CASCADE_IR)
+
+/*
+ * Top-level IO-APIC bring-up: initialise bookkeeping, choose which IRQs
+ * go through the IO-APIC, fix up IO-APIC IDs (non-ACPI only), program
+ * the IRQ routing, set up traps, check the timer and, for the non-ACPI
+ * case, dump the resulting state.
+ */
+void __init setup_IO_APIC(void)
+{
+       enable_IO_APIC();
+
+       if (acpi_ioapic)
+               io_apic_irqs = ~0;      /* all IRQs go through IOAPIC */
+       else
+               io_apic_irqs = ~PIC_IRQS;
+
+       printk("ENABLING IO-APIC IRQs\n");
+
+       /*
+        * Set up IO-APIC IRQ routing.
+        */
+       if (!acpi_ioapic)
+               setup_ioapic_ids_from_mpc();
+       /* sync_Arb_IDs() is compiled out when CONFIG_XEN is set. */
+#ifndef CONFIG_XEN
+       sync_Arb_IDs();
+#endif
+       setup_IO_APIC_irqs();
+       init_IO_APIC_traps();
+       check_timer();
+       if (!acpi_ioapic)
+               print_IO_APIC();
+}
+
+/*
+ *     Called after all the initialization is done. If we didn't find any
+ *     APIC bugs then we can allow the modify fast path
+ */
+ 
+static int __init io_apic_bug_finalize(void)
+{
+       /*
+        * If sis_apic_bug is still -1 at this point, settle it to 0.
+        * NOTE(review): -1 presumably means "not yet determined" - confirm.
+        */
+       if(sis_apic_bug == -1)
+               sis_apic_bug = 0;
+       return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
+
+/*
+ * Per-IO-APIC suspend/resume state: the sys_device followed by a
+ * variable-length array of saved redirection entries.
+ */
+struct sysfs_ioapic_data {
+       struct sys_device dev;
+       struct IO_APIC_route_entry entry[0];
+};
+/* One pointer per IO-APIC, filled in by ioapic_init_sysfs(). */
+static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
+
+/*
+ * Save every redirection entry of IO-APIC dev->id into the entry[]
+ * array that follows 'dev' in its sysfs_ioapic_data.  'state' is not
+ * used.  Always returns 0.
+ */
+static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
+{
+       struct IO_APIC_route_entry *entry;
+       struct sysfs_ioapic_data *data;
+       unsigned long flags;
+       int i;
+       
+       data = container_of(dev, struct sysfs_ioapic_data, dev);
+       entry = data->entry;
+       spin_lock_irqsave(&ioapic_lock, flags);
+       /* Each entry is two 32-bit registers at 0x10+2*i and 0x11+2*i. */
+       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
+               *(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
+               *(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
+       }
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return 0;
+}
+
+/*
+ * Restore IO-APIC dev->id: rewrite the ID register if it no longer
+ * matches mp_ioapics[], then write back every saved redirection entry.
+ * Always returns 0.
+ */
+static int ioapic_resume(struct sys_device *dev)
+{
+       struct IO_APIC_route_entry *entry;
+       struct sysfs_ioapic_data *data;
+       unsigned long flags;
+       union IO_APIC_reg_00 reg_00;
+       int i;
+       
+       data = container_of(dev, struct sysfs_ioapic_data, dev);
+       entry = data->entry;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(dev->id, 0);
+       /* Only touch the ID register when the ID actually differs. */
+       if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
+               reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
+               io_apic_write(dev->id, 0, reg_00.raw);
+       }
+       /* High word (0x11+2*i) is written before the low word (0x10+2*i). */
+       for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
+               io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
+               io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
+       }
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return 0;
+}
+
+/* sysdev class "ioapic", wiring in the suspend/resume handlers above. */
+static struct sysdev_class ioapic_sysdev_class = {
+       set_kset_name("ioapic"),
+       .suspend = ioapic_suspend,
+       .resume = ioapic_resume,
+};
+
+/*
+ * Register the "ioapic" sysdev class and one sys_device per IO-APIC,
+ * each with room for that IO-APIC's saved redirection entries.
+ * Returns the class registration error if that fails; otherwise 0,
+ * even if an individual IO-APIC could not be allocated or registered
+ * (such failures are only logged).
+ */
+static int __init ioapic_init_sysfs(void)
+{
+       struct sys_device * dev;
+       int i, size, error = 0;
+
+       error = sysdev_class_register(&ioapic_sysdev_class);
+       if (error)
+               return error;
+
+       for (i = 0; i < nr_ioapics; i++ ) {
+               /* Header plus one route entry per redirection register. */
+               size = sizeof(struct sys_device) + nr_ioapic_registers[i] 
+                       * sizeof(struct IO_APIC_route_entry);
+               mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
+               if (!mp_ioapic_data[i]) {
+                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+                       continue;
+               }
+               memset(mp_ioapic_data[i], 0, size);
+               dev = &mp_ioapic_data[i]->dev;
+               dev->id = i; 
+               dev->cls = &ioapic_sysdev_class;
+               error = sysdev_register(dev);
+               if (error) {
+                       /* Free and clear the slot so no stale pointer remains. */
+                       kfree(mp_ioapic_data[i]);
+                       mp_ioapic_data[i] = NULL;
+                       printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
+                       continue;
+               }
+       }
+
+       return 0;
+}
+
+device_initcall(ioapic_init_sysfs);
+
+/* --------------------------------------------------------------------------
+                          ACPI-based IOAPIC Configuration
+   -------------------------------------------------------------------------- */
+
+#ifdef CONFIG_ACPI_BOOT
+
+/*
+ * Choose the APIC ID for IO-APIC @ioapic, starting from the requested
+ * @apic_id, and return the ID that was settled on.
+ *
+ * With CONFIG_XEN defined the whole body is compiled out and @apic_id is
+ * returned unchanged.  Otherwise the ID is checked against
+ * get_physical_broadcast(), made unique against the IDs already recorded
+ * in the static apic_id_map, written to register 0 when it differs from
+ * the current value, and read back.  Running out of free IDs or a failed
+ * read-back panics.
+ */
+int __init io_apic_get_unique_id (int ioapic, int apic_id)
+{
+#ifndef CONFIG_XEN
+       union IO_APIC_reg_00 reg_00;
+       /* IDs claimed so far; static, so it persists across calls. */
+       static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+       physid_mask_t tmp;
+       unsigned long flags;
+       int i = 0;
+
+       /*
+        * The P4 platform supports up to 256 APIC IDs on two separate APIC 
+        * buses (one for LAPICs, one for IOAPICs), where predecessors only 
+        * supports up to 16 on one shared APIC bus.
+        * 
+        * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+        *      advantage of new APIC bus architecture.
+        */
+
+       /* First call: seed the map from the present-CPU map. */
+       if (physids_empty(apic_id_map))
+               apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       reg_00.raw = io_apic_read(ioapic, 0);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       /* Out-of-range request: fall back to the ID currently in register 0. */
+       if (apic_id >= get_physical_broadcast()) {
+               printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+                       "%d\n", ioapic, apic_id, reg_00.bits.ID);
+               apic_id = reg_00.bits.ID;
+       }
+
+       /*
+        * Every APIC in a system must have a unique ID or we get lots of nice 
+        * 'stuck on smp_invalidate_needed IPI wait' messages.
+        */
+       if (check_apicid_used(apic_id_map, apic_id)) {
+
+               /* Take the lowest ID not yet marked in the map. */
+               for (i = 0; i < get_physical_broadcast(); i++) {
+                       if (!check_apicid_used(apic_id_map, i))
+                               break;
+               }
+
+               if (i == get_physical_broadcast())
+                       panic("Max apic_id exceeded!\n");
+
+               printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+                       "trying %d\n", ioapic, apic_id, i);
+
+               apic_id = i;
+       } 
+
+       /* Record the chosen ID as used. */
+       tmp = apicid_to_cpu_present(apic_id);
+       physids_or(apic_id_map, apic_id_map, tmp);
+
+       if (reg_00.bits.ID != apic_id) {
+               reg_00.bits.ID = apic_id;
+
+               /* Write the new ID and read it back under the same lock. */
+               spin_lock_irqsave(&ioapic_lock, flags);
+               io_apic_write(ioapic, 0, reg_00.raw);
+               reg_00.raw = io_apic_read(ioapic, 0);
+               spin_unlock_irqrestore(&ioapic_lock, flags);
+
+               /* Sanity check */
+               if (reg_00.bits.ID != apic_id)
+                       panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
+       }
+
+       apic_printk(APIC_VERBOSE, KERN_INFO
+                       "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+#endif /* !CONFIG_XEN */
+
+       return apic_id;
+}
+
+
+/* Read register 1 of IO-APIC @ioapic under ioapic_lock; return its version field. */
+int __init io_apic_get_version (int ioapic)
+{
+       unsigned long irq_state;
+       union IO_APIC_reg_01 ver_reg;
+
+       spin_lock_irqsave(&ioapic_lock, irq_state);
+       ver_reg.raw = io_apic_read(ioapic, 1);
+       spin_unlock_irqrestore(&ioapic_lock, irq_state);
+       return ver_reg.bits.version;
+}
+
+
+/* Read register 1 of IO-APIC @ioapic under ioapic_lock; return its entries field. */
+int __init io_apic_get_redir_entries (int ioapic)
+{
+       unsigned long irq_state;
+       union IO_APIC_reg_01 ver_reg;
+
+       spin_lock_irqsave(&ioapic_lock, irq_state);
+       ver_reg.raw = io_apic_read(ioapic, 1);
+       spin_unlock_irqrestore(&ioapic_lock, irq_state);
+       return ver_reg.bits.entries;
+}
+
+
+/*
+ * Program pin @pin of IO-APIC @ioapic with a masked routing entry for
+ * @irq, using @edge_level as the trigger and @active_high_low as the
+ * polarity field.
+ *
+ * Returns -EINVAL when IO_APIC_IRQ(@irq) is false, otherwise 0.
+ */
+int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+{
+       struct IO_APIC_route_entry entry;
+       unsigned long flags;
+
+       if (!IO_APIC_IRQ(irq)) {
+               printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+                       ioapic);
+               return -EINVAL;
+       }
+
+       /*
+        * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
+        * Note that we mask (disable) IRQs now -- these get enabled when the
+        * corresponding device driver registers for this IRQ.
+        */
+
+       memset(&entry,0,sizeof(entry));
+
+       entry.delivery_mode = INT_DELIVERY_MODE;
+       entry.dest_mode = INT_DEST_MODE;
+       entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
+       entry.trigger = edge_level;
+       entry.polarity = active_high_low;
+       entry.mask  = 1;
+
+       /*
+        * IRQs < 16 are already in the irq_2_pin[] map
+        */
+       if (irq >= 16)
+               add_pin_to_irq(irq, ioapic, pin);
+
+       entry.vector = assign_irq_vector(irq);
+
+       apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
+               "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
+               mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+               edge_level, active_high_low);
+
+       ioapic_register_intr(irq, entry.vector, edge_level);
+
+       /* An IRQ below 16 on the first IO-APIC is turned off at the 8259A. */
+       if (!ioapic && (irq < 16))
+               disable_8259A_irq(irq);
+
+       /* Write the entry as two ints, upper one first, under ioapic_lock. */
+       spin_lock_irqsave(&ioapic_lock, flags);
+       io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
+       io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return 0;
+}
+
+#endif /*CONFIG_ACPI_BOOT*/
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/ioport.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/ioport.c
@@ -0,0 +1,129 @@
+/*
+ *     linux/arch/i386/kernel/ioport.c
+ *
+ * This contains the io-permission bitmap code - written by obz, with changes
+ * by Linus.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/ioport.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <asm-xen/xen-public/physdev.h>
+
+/*
+ * Set EXTENT bits starting at bit BASE in BITMAP to NEW_VALUE
+ * (non-zero sets the bits, zero clears them).
+ *
+ * The range is handled in up to three parts: a partial first word when
+ * BASE is not word-aligned, a run of whole words, and a partial last word.
+ */
+static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
+{
+       unsigned long mask;
+       unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG);
+       unsigned int low_index = base & (BITS_PER_LONG-1);
+       /* Bits still to process, counted from bit 0 of the current word. */
+       int length = low_index + extent;
+
+       /* Partial first word: touch bits low_index..min(length, BITS_PER_LONG)-1. */
+       if (low_index != 0) {
+               mask = (~0UL << low_index);
+               if (length < BITS_PER_LONG)
+                       mask &= ~(~0UL << length);
+               if (new_value)
+                       *bitmap_base++ |= mask;
+               else
+                       *bitmap_base++ &= ~mask;
+               length -= BITS_PER_LONG;
+       }
+
+       /* Whole words are overwritten outright. */
+       mask = (new_value ? ~0UL : 0UL);
+       while (length >= BITS_PER_LONG) {
+               *bitmap_base++ = mask;
+               length -= BITS_PER_LONG;
+       }
+
+       /* Partial last word: the low `length' bits. */
+       if (length > 0) {
+               mask = ~(~0UL << length);
+               if (new_value)
+                       *bitmap_base++ |= mask;
+               else
+                       *bitmap_base++ &= ~mask;
+       }
+}
+
+
+/*
+ * this changes the io permissions bitmap in the current task.
+ *
+ * @from/@num give the port range, @turn_on says whether access is being
+ * granted (non-zero) or revoked (zero).  The bitmap starts out with every
+ * bit set and granting access clears bits.
+ *
+ * Returns -EINVAL if the range is empty, wraps around or extends past
+ * IO_BITMAP_BITS; -EPERM if access is being granted without
+ * CAP_SYS_RAWIO; -ENOMEM if the bitmap cannot be allocated; otherwise 0.
+ */
+asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
+{
+       struct thread_struct * t = &current->thread;
+       unsigned long *bitmap;
+       physdev_op_t op;
+
+       if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
+               return -EINVAL;
+       if (turn_on && !capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       /*
+        * If it's the first ioperm() call in this thread's lifetime, set the
+        * IO bitmap up. ioperm() is much less timing critical than clone(),
+        * this is why we delay this operation until now:
+        */
+       if (!t->io_bitmap_ptr) {
+               bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+               if (!bitmap)
+                       return -ENOMEM;
+
+               memset(bitmap, 0xff, IO_BITMAP_BYTES);
+               t->io_bitmap_ptr = bitmap;
+
+               /*
+                * Hand the bitmap to the hypervisor.  A failing hypercall
+                * used to go unnoticed; report it so that a bitmap which
+                * was never registered does not fail silently.
+                */
+               op.cmd = PHYSDEVOP_SET_IOBITMAP;
+               op.u.set_iobitmap.bitmap   = (unsigned long)bitmap;
+               op.u.set_iobitmap.nr_ports = IO_BITMAP_BITS;
+               WARN_ON(HYPERVISOR_physdev_op(&op));
+       }
+
+       /* Granting access clears bits, hence the inverted value. */
+       set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
+
+       return 0;
+}
+
+/*
+ * sys_iopl has to be used when you want to access the IO ports
+ * beyond the 0x3ff range: to get the full 65536 ports bitmapped
+ * you'd need 8kB of bitmaps/process, which is a bit excessive.
+ *
+ * This version does not touch eflags on the system-call stack: the
+ * new level is recorded in current->thread.io_pl and passed to the
+ * hypervisor with PHYSDEVOP_SET_IOPL.  A request for level 0 is
+ * stored and issued as level 1.
+ *
+ * Returns -EINVAL for levels above 3, -EPERM when the level is being
+ * raised without CAP_SYS_RAWIO, otherwise 0.
+ */
+
+asmlinkage long sys_iopl(unsigned int new_io_pl)
+{
+       unsigned int old_io_pl = current->thread.io_pl;
+       physdev_op_t op;
+
+       if (new_io_pl > 3)
+               return -EINVAL;
+
+       /* Need "raw I/O" privileges for direct port access. */
+       if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       /* Maintain OS privileges even if user attempts to relinquish them. */
+       if (new_io_pl == 0)
+               new_io_pl = 1;
+
+       /* Change our version of the privilege levels. */
+       current->thread.io_pl = new_io_pl;
+
+       /*
+        * Force the change at ring 0.  A failing hypercall used to be
+        * ignored; report it so the mismatch between thread.io_pl and
+        * the level actually in force is visible.
+        */
+       op.cmd             = PHYSDEVOP_SET_IOPL;
+       op.u.set_iopl.iopl = new_io_pl;
+       WARN_ON(HYPERVISOR_physdev_op(&op));
+
+       return 0;
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/irq.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/irq.c
@@ -0,0 +1,299 @@
+/*
+ *     linux/arch/i386/kernel/irq.c
+ *
+ *     Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar
+ *
+ * This file contains the lowest level x86-specific interrupt
+ * entry, irq-stacks and irq statistics code. All the remaining
+ * irq logic is done by the generic kernel/irq/ code and
+ * by the x86-specific irq controller code. (e.g. i8259.c and
+ * io_apic.c.)
+ */
+
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
+
+DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp;
+EXPORT_PER_CPU_SYMBOL(irq_stat);
+
+#ifndef CONFIG_X86_LOCAL_APIC
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ *
+ * Here the answer is to log @irq as a two-digit hex vector and do
+ * nothing else.  Only built when CONFIG_X86_LOCAL_APIC is not set.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+       printk("unexpected IRQ trap at vector %02x\n", irq);
+}
+#endif
+
+#ifdef CONFIG_4KSTACKS
+/*
+ * per-CPU IRQ handling contexts (thread information and stack)
+ *
+ * The thread_info overlays the start of a THREAD_SIZE-byte stack area.
+ */
+union irq_ctx {
+       struct thread_info      tinfo;
+       u32                     stack[THREAD_SIZE/sizeof(u32)];
+};
+
+/* Per-CPU context pointers; filled in by irq_ctx_init(). */
+static union irq_ctx *hardirq_ctx[NR_CPUS];
+static union irq_ctx *softirq_ctx[NR_CPUS];
+#endif
+
+/*
+ * do_IRQ handles all normal device IRQ's (the special
+ * SMP cross-CPU interrupts have their own specific
+ * handlers).
+ *
+ * The IRQ number is taken from the low HARDIRQ_BITS bits of
+ * regs->orig_eax.  With CONFIG_4KSTACKS, __do_IRQ() is run on the
+ * per-CPU hardirq stack unless that stack is already in use.
+ * Always returns 1.
+ */
+fastcall unsigned int do_IRQ(struct pt_regs *regs)
+{      
+       /* high bits used in ret_from_ code */
+       int irq = regs->orig_eax & __IRQ_MASK(HARDIRQ_BITS);
+#ifdef CONFIG_4KSTACKS
+       union irq_ctx *curctx, *irqctx;
+       u32 *isp;
+#endif
+
+       irq_enter();
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+       /* Debugging check for stack overflow: is there less than 1KB free? */
+       {
+               long esp;
+
+               /* esp = offset of the stack pointer within its THREAD_SIZE area */
+               __asm__ __volatile__("andl %%esp,%0" :
+                                       "=r" (esp) : "0" (THREAD_SIZE - 1));
+               if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
+                       printk("do_IRQ: stack overflow: %ld\n",
+                               esp - sizeof(struct thread_info));
+                       dump_stack();
+               }
+       }
+#endif
+
+#ifdef CONFIG_4KSTACKS
+
+       curctx = (union irq_ctx *) current_thread_info();
+       irqctx = hardirq_ctx[smp_processor_id()];
+
+       /*
+        * this is where we switch to the IRQ stack. However, if we are
+        * already using the IRQ stack (because we interrupted a hardirq
+        * handler) we can't do that and just have to keep using the
+        * current stack (which is the irq stack already after all)
+        */
+       if (curctx != irqctx) {
+               int arg1, arg2, ebx;
+
+               /* build the stack frame on the IRQ stack */
+               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+               irqctx->tinfo.task = curctx->tinfo.task;
+               irqctx->tinfo.previous_esp = current_stack_pointer;
+
+               /*
+                * %ebx enters holding isp (top of the IRQ stack).  The
+                * xchgl makes it the stack pointer and parks the old one
+                * in %ebx; the movl puts it back after the call, so this
+                * depends on %ebx surviving __do_IRQ.  irq goes in %eax
+                * and regs in %edx.
+                */
+               asm volatile(
+                       "       xchgl   %%ebx,%%esp      \n"
+                       "       call    __do_IRQ         \n"
+                       "       movl   %%ebx,%%esp      \n"
+                       : "=a" (arg1), "=d" (arg2), "=b" (ebx)
+                       :  "0" (irq),   "1" (regs),  "2" (isp)
+                       : "memory", "cc", "ecx"
+               );
+       } else
+#endif
+               __do_IRQ(irq, regs);
+
+       irq_exit();
+
+       return 1;
+}
+
+#ifdef CONFIG_4KSTACKS
+
+/*
+ * These should really be __section__(".bss.page_aligned") as well, but
+ * gcc's 3.0 and earlier don't handle that correctly.
+ *
+ * Backing storage for the IRQ contexts: one THREAD_SIZE-byte slot per
+ * CPU in each array, with the array start aligned to THREAD_SIZE.
+ */
+static char softirq_stack[NR_CPUS * THREAD_SIZE]
+               __attribute__((__aligned__(THREAD_SIZE)));
+
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]
+               __attribute__((__aligned__(THREAD_SIZE)));
+
+/*
+ * Set up the hardirq and softirq contexts for @cpu inside the static
+ * stack arrays and publish them in hardirq_ctx[] / softirq_ctx[].
+ * Does nothing if the hardirq context for @cpu already exists.
+ */
+void irq_ctx_init(int cpu)
+{
+       union irq_ctx *hard, *soft;
+
+       if (hardirq_ctx[cpu])
+               return;
+
+       /* hardirq context: slot @cpu of hardirq_stack */
+       hard = (union irq_ctx *) (hardirq_stack + cpu * THREAD_SIZE);
+       hard->tinfo.task                = NULL;
+       hard->tinfo.exec_domain         = NULL;
+       hard->tinfo.cpu                 = cpu;
+       hard->tinfo.preempt_count       = HARDIRQ_OFFSET;
+       hard->tinfo.addr_limit          = MAKE_MM_SEG(0);
+       hardirq_ctx[cpu] = hard;
+
+       /* softirq context: slot @cpu of softirq_stack */
+       soft = (union irq_ctx *) (softirq_stack + cpu * THREAD_SIZE);
+       soft->tinfo.task                = NULL;
+       soft->tinfo.exec_domain         = NULL;
+       soft->tinfo.cpu                 = cpu;
+       soft->tinfo.preempt_count       = SOFTIRQ_OFFSET;
+       soft->tinfo.addr_limit          = MAKE_MM_SEG(0);
+       softirq_ctx[cpu] = soft;
+
+       printk("CPU %u irqstacks, hard=%p soft=%p\n",
+               cpu, hard, soft);
+}
+
+extern asmlinkage void __do_softirq(void);
+
+/*
+ * Run pending softirqs on this CPU's softirq stack.
+ *
+ * Returns at once when called in interrupt context.  Otherwise local
+ * interrupts are disabled and, if any softirq is pending,
+ * __do_softirq() is called on the per-CPU softirq stack; the saved
+ * interrupt state is then restored.
+ */
+asmlinkage void do_softirq(void)
+{
+       unsigned long flags;
+       struct thread_info *curctx;
+       union irq_ctx *irqctx;
+       u32 *isp;
+
+       if (in_interrupt())
+               return;
+
+       local_irq_save(flags);
+
+       if (local_softirq_pending()) {
+               curctx = current_thread_info();
+               irqctx = softirq_ctx[smp_processor_id()];
+               irqctx->tinfo.task = curctx->task;
+               irqctx->tinfo.previous_esp = current_stack_pointer;
+
+               /* build the stack frame on the softirq stack */
+               isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+
+               /*
+                * %ebx enters holding isp; the xchgl makes it the stack
+                * pointer and parks the old one in %ebx, and the movl
+                * puts it back after the call, so this depends on %ebx
+                * surviving __do_softirq.
+                */
+               asm volatile(
+                       "       xchgl   %%ebx,%%esp     \n"
+                       "       call    __do_softirq    \n"
+                       "       movl    %%ebx,%%esp     \n"
+                       : "=b"(isp)
+                       : "0"(isp)
+                       : "memory", "cc", "edx", "ecx", "eax"
+               );
+       }
+
+       local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(do_softirq);
+#endif
+
+/*
+ * Interrupt statistics:
+ */
+
+/* Printed as the "ERR" line by show_interrupts(). */
+atomic_t irq_err_count;
+
+/*
+ * /proc/interrupts printing:
+ *
+ * @v points to the loff_t position being shown.  Position 0 also emits
+ * the CPU header line; positions below NR_IRQS emit one line for that
+ * IRQ if it has an action; position NR_IRQS emits the NMI/LOC/ERR/MIS
+ * summary lines.  Always returns 0.
+ */
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+       int i = *(loff_t *) v, j;
+       struct irqaction * action;
+       unsigned long flags;
+
+       if (i == 0) {
+               seq_printf(p, "           ");
+               for_each_cpu(j)
+                       seq_printf(p, "CPU%d       ",j);
+               seq_putc(p, '\n');
+       }
+
+       if (i < NR_IRQS) {
+               /* The descriptor lock is held while the action list is walked. */
+               spin_lock_irqsave(&irq_desc[i].lock, flags);
+               action = irq_desc[i].action;
+               if (!action)
+                       goto skip;
+               seq_printf(p, "%3d: ",i);
+#ifndef CONFIG_SMP
+               seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+               for_each_cpu(j)
+                       seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+#endif
+               seq_printf(p, " %14s", irq_desc[i].handler->typename);
+               seq_printf(p, "  %s", action->name);
+
+               /* Remaining actions on the same IRQ, comma-separated. */
+               for (action=action->next; action; action = action->next)
+                       seq_printf(p, ", %s", action->name);
+
+               seq_putc(p, '\n');
+skip:
+               spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+       } else if (i == NR_IRQS) {
+               seq_printf(p, "NMI: ");
+               for_each_cpu(j)
+                       seq_printf(p, "%10u ", nmi_count(j));
+               seq_putc(p, '\n');
+#ifdef CONFIG_X86_LOCAL_APIC
+               seq_printf(p, "LOC: ");
+               for_each_cpu(j)
+                       seq_printf(p, "%10u ", per_cpu(irq_stat, j).apic_timer_irqs);
+               seq_putc(p, '\n');
+#endif
+               seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+               seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+       }
+       return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Re-target every IRQ except IRQ 2 so that its affinity lies within @map.
+ *
+ * If an IRQ's current affinity shares no online CPU with @map, the
+ * affinity is replaced by @map and a message is printed.  IRQs whose
+ * handler has no set_affinity hook are only reported, and only if they
+ * have an action.  Interrupts are then enabled for 1ms and disabled
+ * again.
+ */
+void fixup_irqs(cpumask_t map)
+{
+       unsigned int irq;
+
+       for (irq = 0; irq < NR_IRQS; irq++) {
+               cpumask_t mask;
+               if (irq == 2)
+                       continue;
+
+               cpus_and(mask, irq_affinity[irq], map);
+               if (any_online_cpu(mask) == NR_CPUS) {
+                       printk("Breaking affinity for irq %i\n", irq);
+                       mask = map;
+               }
+               if (irq_desc[irq].handler->set_affinity)
+                       irq_desc[irq].handler->set_affinity(irq, mask);
+               else if (irq_desc[irq].action)
+                       printk("Cannot set affinity for irq %i\n", irq);
+       }
+
+#if 0
+       barrier();
+       /* Ingo Molnar says: "after the IO-APIC masks have been redirected
+          [note the nop - the interrupt-enable boundary on x86 is two
+          instructions from sti] - to flush out pending hardirqs and
+          IPIs. After this point nothing is supposed to reach this CPU." */
+       __asm__ __volatile__("sti; nop; cli");
+       barrier();
+#else
+       /* That doesn't seem sufficient.  Give it 1ms. */
+       local_irq_enable();
+       mdelay(1);
+       local_irq_disable();
+#endif
+}
+#endif
+
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/ldt.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/ldt.c
@@ -0,0 +1,275 @@
+/*
+ * linux/kernel/ldt.c
+ *
+ * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds
+ * Copyright (C) 1999 Ingo Molnar <mingo@xxxxxxxxxx>
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <asm/ldt.h>
+#include <asm/desc.h>
+#include <asm/mmu_context.h>
+
+#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
+/*
+ * Reload the LDT of the mm active on the executing CPU, if there is one.
+ * Run on other CPUs via smp_call_function() from alloc_ldt(); the
+ * argument is unused.
+ */
+static void flush_ldt(void *null)
+{
+       if (current->active_mm)
+               load_LDT(&current->active_mm->context);
+}
+#endif
+
+/*
+ * Grow the LDT of @pc so that it holds at least @mincount entries.
+ *
+ * Nothing is done if the table is already large enough.  Otherwise
+ * @mincount is rounded up to a multiple of 512, a new table is
+ * allocated (vmalloc when larger than a page, else kmalloc), the old
+ * entries are copied, the remainder is zeroed and the new table is
+ * published in @pc.  With @reload set, the new table is made read-only
+ * and loaded, and on SMP other CPUs are asked to reload as well.  The
+ * old table, if any, is made writable again and freed.
+ *
+ * Returns 0 on success or -ENOMEM.
+ */
+static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
+{
+       void *oldldt;
+       void *newldt;
+       int oldsize;
+
+       if (mincount <= pc->size)
+               return 0;
+       oldsize = pc->size;
+       mincount = (mincount+511)&(~511);
+       if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE)
+               newldt = vmalloc(mincount*LDT_ENTRY_SIZE);
+       else
+               newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL);
+
+       if (!newldt)
+               return -ENOMEM;
+
+       if (oldsize)
+               memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE);
+       oldldt = pc->ldt;
+       memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE);
+       /* Publish the pointer before the size, with a write barrier between. */
+       pc->ldt = newldt;
+       wmb();
+       pc->size = mincount;
+       wmb();
+
+       if (reload) {
+#ifdef CONFIG_SMP
+               cpumask_t mask;
+               preempt_disable();
+#endif
+               make_pages_readonly(pc->ldt, (pc->size * LDT_ENTRY_SIZE) /
+                                   PAGE_SIZE);
+               load_LDT(pc);
+#ifdef CONFIG_SMP
+               /* Other CPUs are only poked if the mm's CPU mask is not just us. */
+               mask = cpumask_of_cpu(smp_processor_id());
+               if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+                       smp_call_function(flush_ldt, NULL, 1, 1);
+               preempt_enable();
+#endif
+       }
+       if (oldsize) {
+               /* Undo the read-only mapping before the old table is freed. */
+               make_pages_writable(oldldt, (oldsize * LDT_ENTRY_SIZE) /
+                       PAGE_SIZE);
+               if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
+                       vfree(oldldt);
+               else
+                       kfree(oldldt);
+       }
+       return 0;
+}
+
+/*
+ * Give @new an LDT of @old's size, copy @old's entries into it and make
+ * the new table's pages read-only.  Returns 0, or the negative error
+ * from alloc_ldt().
+ */
+static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
+{
+       int rc;
+
+       rc = alloc_ldt(new, old->size, 0);
+       if (rc >= 0) {
+               memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+               make_pages_readonly(new->ldt,
+                                   (new->size * LDT_ENTRY_SIZE) / PAGE_SIZE);
+               rc = 0;
+       }
+       return rc;
+}
+
+/*
+ * we do not have to muck with descriptors here, that is
+ * done in switch_mm() as needed.
+ *
+ * Clears @mm's context, copies the LDT of current's mm if it has one,
+ * and on success puts the context on the mm_unpinned list.  Returns 0
+ * or the error from copy_ldt().
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+{
+       struct mm_struct *parent;
+       int rc = 0;
+
+       memset(&mm->context, 0, sizeof(mm->context));
+       init_MUTEX(&mm->context.sem);
+
+       parent = current->mm;
+       if (parent && parent->context.size > 0) {
+               /* The parent's LDT is copied with its semaphore held. */
+               down(&parent->context.sem);
+               rc = copy_ldt(&mm->context, &parent->context);
+               up(&parent->context.sem);
+       }
+       if (rc)
+               return rc;
+
+       spin_lock(&mm_unpinned_lock);
+       list_add(&mm->context.unpinned, &mm_unpinned);
+       spin_unlock(&mm_unpinned_lock);
+       return 0;
+}
+
+/*
+ * No need to lock the MM as we are the last user
+ *
+ * Frees @mm's LDT, if it has one, and removes the context from the
+ * mm_unpinned list unless it is marked pinned.
+ */
+void destroy_context(struct mm_struct *mm)
+{
+       if (mm->context.size) {
+               if (mm == current->active_mm)
+                       clear_LDT();
+               /* Pages are made writable again before being freed. */
+               make_pages_writable(mm->context.ldt, 
+                                   (mm->context.size * LDT_ENTRY_SIZE) /
+                                   PAGE_SIZE);
+               /* Same size test as in alloc_ldt() selects vfree vs kfree. */
+               if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
+                       vfree(mm->context.ldt);
+               else
+                       kfree(mm->context.ldt);
+               mm->context.size = 0;
+       }
+       if (!mm->context.pinned) {
+               spin_lock(&mm_unpinned_lock);
+               list_del(&mm->context.unpinned);
+               spin_unlock(&mm_unpinned_lock);
+       }
+}
+
+/*
+ * Copy the current mm's LDT to user buffer @ptr.
+ *
+ * @bytecount is capped at LDT_ENTRY_SIZE*LDT_ENTRIES.  Bytes beyond the
+ * end of the table are zero-filled.  Returns 0 when there is no LDT,
+ * -EFAULT on a failed user access, otherwise the (capped) byte count.
+ */
+static int read_ldt(void __user * ptr, unsigned long bytecount)
+{
+       struct mm_struct *mm = current->mm;
+       unsigned long copied, left;
+
+       if (!mm->context.size)
+               return 0;
+       if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES)
+               bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
+
+       /* The table is read with the context semaphore held. */
+       down(&mm->context.sem);
+       copied = mm->context.size*LDT_ENTRY_SIZE;
+       if (copied > bytecount)
+               copied = bytecount;
+       left = copy_to_user(ptr, mm->context.ldt, copied);
+       up(&mm->context.sem);
+
+       if (left)
+               return -EFAULT;
+
+       /* zero-fill the rest */
+       if (copied != bytecount &&
+           clear_user(ptr+copied, bytecount-copied) != 0)
+               return -EFAULT;
+
+       return bytecount;
+}
+
+/*
+ * Copy up to five descriptors' worth of default_ldt to user buffer
+ * @ptr, limited by @bytecount.  Returns the number of bytes copied or
+ * -EFAULT.
+ */
+static int read_default_ldt(void __user * ptr, unsigned long bytecount)
+{
+       unsigned long len = 5*sizeof(struct desc_struct);
+
+       if (len > bytecount)
+               len = bytecount;
+
+       if (copy_to_user(ptr, &default_ldt[0], len))
+               return -EFAULT;
+
+       return len;
+}
+
+/*
+ * Install one LDT entry described by the struct user_desc at @ptr.
+ *
+ * @bytecount must equal sizeof(struct user_desc).  @oldmode is set by
+ * sys_modify_ldt() for func 1 and clear for func 0x11.  The table is
+ * grown with alloc_ldt() when the entry number lies beyond its current
+ * size, and the entry is written through HYPERVISOR_update_descriptor().
+ *
+ * Returns -EINVAL or -EFAULT on bad input, the error from alloc_ldt(),
+ * or otherwise the result of the hypercall.
+ */
+static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
+{
+       struct mm_struct * mm = current->mm;
+       __u32 entry_1, entry_2, *lp;
+       unsigned long mach_lp;
+       int error;
+       struct user_desc ldt_info;
+
+       error = -EINVAL;
+       if (bytecount != sizeof(ldt_info))
+               goto out;
+       error = -EFAULT;        
+       if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info)))
+               goto out;
+
+       error = -EINVAL;
+       if (ldt_info.entry_number >= LDT_ENTRIES)
+               goto out;
+       /* contents == 3 is refused in old mode, and otherwise only if not-present. */
+       if (ldt_info.contents == 3) {
+               if (oldmode)
+                       goto out;
+               if (ldt_info.seg_not_present == 0)
+                       goto out;
+       }
+
+       down(&mm->context.sem);
+       if (ldt_info.entry_number >= mm->context.size) {
+               error = alloc_ldt(&current->mm->context, ldt_info.entry_number+1, 1);
+               if (error < 0)
+                       goto out_unlock;
+       }
+
+       /* Entries are 8 bytes apart, hence the shift by 3. */
+       lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
+       mach_lp = arbitrary_virt_to_machine(lp);
+
+       /* Allow LDTs to be cleared by the user. */
+       if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
+               if (oldmode || LDT_empty(&ldt_info)) {
+                       entry_1 = 0;
+                       entry_2 = 0;
+                       goto install;
+               }
+       }
+
+       entry_1 = LDT_entry_a(&ldt_info);
+       entry_2 = LDT_entry_b(&ldt_info);
+       /* Old mode clears bit 20 of the second word. */
+       if (oldmode)
+               entry_2 &= ~(1 << 20);
+
+       /* Install the new entry ...  */
+install:
+       error = HYPERVISOR_update_descriptor(mach_lp, entry_1, entry_2);
+
+out_unlock:
+       up(&mm->context.sem);
+out:
+       return error;
+}
+
+/*
+ * modify_ldt system call: dispatch on @func.
+ *   0    - read the LDT
+ *   1    - write an entry (old mode)
+ *   2    - read the default LDT
+ *   0x11 - write an entry (new mode)
+ * Any other @func yields -ENOSYS.
+ */
+asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+{
+       if (func == 0)
+               return read_ldt(ptr, bytecount);
+       if (func == 1)
+               return write_ldt(ptr, bytecount, 1);
+       if (func == 2)
+               return read_default_ldt(ptr, bytecount);
+       if (func == 0x11)
+               return write_ldt(ptr, bytecount, 0);
+
+       return -ENOSYS;
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/Makefile
@@ -0,0 +1,99 @@
+#
+# Makefile for the linux kernel.
+#
+
+XENARCH        := $(subst ",,$(CONFIG_XENARCH))
+
+extra-y := head.o init_task.o
+
+obj-y  := process.o signal.o entry.o traps.o \
+               time.o ioport.o ldt.o setup.o \
+               pci-dma.o i386_ksyms.o irq.o quirks.o
+
+c-obj-y        := semaphore.o vm86.o \
+               ptrace.o sys_i386.o \
+               i387.o dmi_scan.o bootflag.o \
+               doublefault.o
+s-obj-y        :=
+
+obj-y                          += cpu/
+#obj-y                         += timers/
+obj-$(CONFIG_ACPI_BOOT)                += acpi/
+#c-obj-$(CONFIG_X86_BIOS_REBOOT)       += reboot.o
+c-obj-$(CONFIG_MCA)            += mca.o
+c-obj-$(CONFIG_X86_MSR)                += msr.o
+c-obj-$(CONFIG_X86_CPUID)      += cpuid.o
+obj-$(CONFIG_MICROCODE)                += microcode.o
+c-obj-$(CONFIG_APM)            += apm.o
+obj-$(CONFIG_X86_SMP)          += smp.o smpboot.o
+#obj-$(CONFIG_X86_TRAMPOLINE)  += trampoline.o
+obj-$(CONFIG_X86_MPPARSE)      += mpparse.o
+obj-$(CONFIG_X86_LOCAL_APIC)   += apic.o
+c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
+obj-$(CONFIG_X86_IO_APIC)      += io_apic.o
+c-obj-$(CONFIG_X86_REBOOTFIXUPS)+= reboot_fixups.o
+c-obj-$(CONFIG_X86_NUMAQ)      += numaq.o
+c-obj-$(CONFIG_X86_SUMMIT_NUMA)        += summit.o
+c-obj-$(CONFIG_MODULES)                += module.o
+c-obj-y                                += sysenter.o
+obj-y                          += vsyscall.o
+c-obj-$(CONFIG_ACPI_SRAT)      += srat.o
+c-obj-$(CONFIG_HPET_TIMER)     += time_hpet.o
+c-obj-$(CONFIG_EFI)            += efi.o efi_stub.o
+c-obj-$(CONFIG_EARLY_PRINTK)   += early_printk.o
+c-obj-$(CONFIG_SMP_ALTERNATIVES)+= smpalts.o
+
+EXTRA_AFLAGS   := -traditional
+
+c-obj-$(CONFIG_SCx200)         += scx200.o
+
+# vsyscall.o contains the vsyscall DSO images as __initdata.
+# We must build both images before we can assemble it.
+# Note: kbuild does not track this dependency due to usage of .incbin
+$(obj)/vsyscall.o: $(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so
+targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so)
+targets += vsyscall-note.o vsyscall.lds
+
+# The DSO images are built using a special linker script.
+quiet_cmd_syscall = SYSCALL $@
+      cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \
+                         -Wl,-T,$(filter-out FORCE,$^) -o $@
+
+export CPPFLAGS_vsyscall.lds += -P -C -U$(ARCH)
+
+vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1
+SYSCFLAGS_vsyscall-sysenter.so = $(vsyscall-flags)
+SYSCFLAGS_vsyscall-int80.so    = $(vsyscall-flags)
+
+$(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
+$(obj)/vsyscall-%.so: $(src)/vsyscall.lds \
+                     $(obj)/vsyscall-%.o FORCE
+       $(call if_changed,syscall)
+
+# We also create a special relocatable object that should mirror the symbol
+# table and layout of the linked DSO.  With ld -R we can then refer to
+# these symbols in the kernel code rather than hand-coded addresses.
+extra-y += vsyscall-syms.o
+$(obj)/built-in.o: $(obj)/vsyscall-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o
+
+SYSCFLAGS_vsyscall-syms.o = -r
+$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
+                       $(obj)/vsyscall-sysenter.o FORCE
+       $(call if_changed,syscall)
+
+# Extra objects whose sources are symlinked by the rule below in addition
+# to those listed in c-obj-y/c-obj-m (.c files) and s-obj-y (.S files).
+c-link := init_task.o
+s-link := vsyscall-int80.o vsyscall-sysenter.o vsyscall-sigreturn.o vsyscall.lds.o syscall_table.o
+
+# Each listed .c/.S file in $(obj) is created as a symlink to the file of
+# the same name under $(srctree)/arch/i386/kernel.
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-obj-m) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)):
+       @ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $@
+
+$(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S
+
+$(obj)/entry.o: $(src)/entry.S $(src)/syscall_table.S
+
+obj-y  += $(c-obj-y) $(s-obj-y)
+obj-m  += $(c-obj-m)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-m) $(c-obj-) $(c-link))
+clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link))
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/microcode.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/microcode.c
@@ -0,0 +1,163 @@
+/*
+ *     Intel CPU Microcode Update Driver for Linux
+ *
+ *     Copyright (C) 2000-2004 Tigran Aivazian
+ *
+ *     This driver allows to upgrade microcode on Intel processors
+ *     belonging to IA-32 family - PentiumPro, Pentium II, 
+ *     Pentium III, Xeon, Pentium 4, etc.
+ *
+ *     Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, 
+ *     Order Number 245472 or free download from:
+ *             
+ *     http://developer.intel.com/design/pentium4/manuals/245472.htm
+ *
+ *     For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *     This program is free software; you can redistribute it and/or
+ *     modify it under the terms of the GNU General Public License
+ *     as published by the Free Software Foundation; either version
+ *     2 of the License, or (at your option) any later version.
+ */
+
+//#define DEBUG /* pr_debug */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/syscalls.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+
+MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@xxxxxxxxxxx>");
+MODULE_LICENSE("GPL");
+
+#define MICROCODE_VERSION      "1.14-xen"
+
+#define DEFAULT_UCODE_DATASIZE         (2000)    /* 2000 bytes */
+#define MC_HEADER_SIZE         (sizeof (microcode_header_t))     /* 48 bytes */
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
+
+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+static DECLARE_MUTEX(microcode_sem);
+
+static void __user *user_buffer;       /* user area microcode data buffer */
+static unsigned int user_buffer_size;  /* its size */
+                               
+/*
+ * open() handler for the microcode device.  Only callers holding
+ * CAP_SYS_RAWIO are let in; everyone else gets -EPERM.
+ */
+static int microcode_open (struct inode *unused1, struct file *unused2)
+{
+       if (!capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       return 0;
+}
+
+
+/*
+ * Hand the buffer described by user_buffer/user_buffer_size to the
+ * hypervisor with a DOM0_MICROCODE dom0_op.  The buffer is mlock()ed
+ * around the hypercall and munlock()ed afterwards.
+ *
+ * Returns the sys_mlock() error if locking fails, otherwise the result
+ * of HYPERVISOR_dom0_op().
+ */
+static int do_microcode_update (void)
+{
+       dom0_op_t op;
+       unsigned long start = (unsigned long)user_buffer;
+       int ret;
+
+       ret = sys_mlock(start, user_buffer_size);
+       if (ret != 0)
+               return ret;
+
+       op.cmd = DOM0_MICROCODE;
+       op.u.microcode.data = user_buffer;
+       op.u.microcode.length = user_buffer_size;
+       ret = HYPERVISOR_dom0_op(&op);
+
+       /* The munlock result is deliberately ignored. */
+       (void)sys_munlock(start, user_buffer_size);
+
+       return ret;
+}
+
+/*
+ * write() handler: pass the caller's buffer on as a microcode update.
+ *
+ * Buffers shorter than DEFAULT_UCODE_TOTALSIZE, or spanning more pages
+ * than num_physpages, are rejected with -EINVAL.  Updates are serialised
+ * by microcode_sem.  Returns len on success, otherwise the error from
+ * do_microcode_update().
+ */
+static ssize_t microcode_write (struct file *file, const char __user *buf,
+                               size_t len, loff_t *ppos)
+{
+       ssize_t ret;
+
+       if (len < DEFAULT_UCODE_TOTALSIZE) {
+               printk(KERN_ERR "microcode: not enough data\n");
+               return -EINVAL;
+       }
+
+       if ((len >> PAGE_SHIFT) > num_physpages) {
+               printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
+                       num_physpages);
+               return -EINVAL;
+       }
+
+       down(&microcode_sem);
+
+       /* do_microcode_update() takes its input from these file-scope variables */
+       user_buffer = (void __user *) buf;
+       user_buffer_size = (int) len;
+
+       ret = do_microcode_update();
+       if (!ret)
+               ret = (ssize_t)len;
+
+       up(&microcode_sem);
+
+       return ret;
+}
+
+/*
+ * ioctl() handler.  MICROCODE_IOCFREE is accepted and does nothing; any
+ * other command is rejected with -EINVAL.
+ */
+static int microcode_ioctl (struct inode *inode, struct file *file,
+               unsigned int cmd, unsigned long arg)
+{
+       /*
+        *  XXX: will be removed after microcode_ctl
+        *  is updated to ignore failure of this ioctl()
+        */
+       if (cmd == MICROCODE_IOCFREE)
+               return 0;
+
+       return -EINVAL;
+}
+
+/*
+ * File operations of the microcode device: open, write and ioctl only
+ * (no read handler is installed).
+ */
+static struct file_operations microcode_fops = {
+       .owner          = THIS_MODULE,
+       .write          = microcode_write,
+       .ioctl          = microcode_ioctl,
+       .open           = microcode_open,
+};
+
+/*
+ * Misc device descriptor registered by microcode_init() and removed by
+ * microcode_exit(); uses the fixed minor MICROCODE_MINOR.
+ */
+static struct miscdevice microcode_dev = {
+       .minor          = MICROCODE_MINOR,
+       .name           = "microcode",
+       .devfs_name     = "cpu/microcode",
+       .fops           = &microcode_fops,
+};
+
+/*
+ * Module init: register the misc device and announce the driver.
+ * Returns 0 on success or the misc_register() error.
+ */
+static int __init microcode_init (void)
+{
+       int error;
+
+       error = misc_register(&microcode_dev);
+       if (error) {
+               printk(KERN_ERR
+                       "microcode: can't misc_register on minor=%d\n",
+                       MICROCODE_MINOR);
+               return error;
+       }
+
+       printk(KERN_INFO
+               "IA-32 Microcode Update Driver: v" MICROCODE_VERSION
+               " <tigran@xxxxxxxxxxx>\n");
+       return 0;
+}
+
+/* Module exit: unregister the misc device and log that the driver is gone. */
+static void __exit microcode_exit (void)
+{
+       misc_deregister(&microcode_dev);
+       printk(KERN_INFO "IA-32 Microcode Update Driver v" MICROCODE_VERSION
+               " unregistered\n");
+}
+
+module_init(microcode_init)
+module_exit(microcode_exit)
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/mpparse.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/mpparse.c
@@ -0,0 +1,1124 @@
+/*
+ *     Intel Multiprocessor Specification 1.1 and 1.4
+ *     compliant MP-table parsing routines.
+ *
+ *     (c) 1995 Alan Cox, Building #3 <alan@xxxxxxxxxx>
+ *     (c) 1998, 1999, 2000 Ingo Molnar <mingo@xxxxxxxxxx>
+ *
+ *     Fixes
+ *             Erich Boleyn    :       MP v1.4 and additional changes.
+ *             Alan Cox        :       Added EBDA scanning
+ *             Ingo Molnar     :       various cleanups and rewrites
+ *             Maciej W. Rozycki:      Bits for default MP configurations
+ *             Paul Diefenbaugh:       Added full ACPI support
+ */
+
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <linux/delay.h>
+#include <linux/config.h>
+#include <linux/bootmem.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/bitops.h>
+
+#include <asm/smp.h>
+#include <asm/acpi.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/io_apic.h>
+
+#include <mach_apic.h>
+#include <mach_mpparse.h>
+#include <bios_ebda.h>
+
+/* Have we found an MP table */
+int smp_found_config;
+unsigned int __initdata maxcpus = NR_CPUS;
+
+/*
+ * Various Linux-internal data structures created from the
+ * MP-table.
+ */
+int apic_version [MAX_APICS];
+int mp_bus_id_to_type [MAX_MP_BUSSES];
+int mp_bus_id_to_node [MAX_MP_BUSSES];
+int mp_bus_id_to_local [MAX_MP_BUSSES];
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
+static int mp_current_pci_id;
+
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+
+/* MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
+int nr_ioapics;
+
+int pic_mode;
+unsigned long mp_lapic_addr;
+
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+unsigned int boot_cpu_logical_apicid = -1U;
+/* Internal processor count */
+static unsigned int __initdata num_processors;
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+
+u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+
+/*
+ * Intel MP BIOS table parsing routines:
+ */
+
+
+/*
+ * Add up the len bytes starting at mp and return the low 8 bits of the
+ * sum.  Callers in this file treat a zero result as a good checksum.
+ */
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+       int total = 0;
+
+       for (; len--; mp++)
+               total += *mp;
+
+       return total & 0xFF;
+}
+
+/*
+ * Have to match translation table entries to main table entries by counter
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+
+static int mpc_record;
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __initdata;
+
+/*
+ * MP_valid_apicid() - sanity-check an APIC ID before a processor is
+ * accepted.  Returns non-zero when the ID is acceptable.  No version is
+ * built when CONFIG_XEN is set without CONFIG_X86_NUMAQ.
+ */
+#ifdef CONFIG_X86_NUMAQ
+static int MP_valid_apicid(int apicid, int version)
+{
+       /* upper part must not be 0xf ... */
+       if ((apicid >> 4) == 0xf)
+               return 0;
+
+       /* ... and exactly one bit may be set in the low nibble */
+       return hweight_long(apicid & 0xf) == 1;
+}
+#elif !defined(CONFIG_XEN)
+static int MP_valid_apicid(int apicid, int version)
+{
+       /* APIC versions from 0x14 on get the wider ID limit */
+       int limit = (version >= 0x14) ? 0xff : 0xf;
+
+       return apicid < limit;
+}
+#endif
+
+#ifndef CONFIG_XEN
+/*
+ * Record one processor entry.
+ *
+ * Entries without CPU_ENABLED are skipped.  For an enabled CPU this notes
+ * the boot processor's APIC IDs, enforces the NR_CPUS and maxcpus limits,
+ * validates the APIC ID and then updates phys_cpu_present_map,
+ * apic_version[] and bios_cpu_apicid[].
+ */
+static void __init MP_processor_info (struct mpc_config_processor *m)
+{
+       int ver, apicid;
+       physid_mask_t tmp;
+
+       if (!(m->mpc_cpuflag & CPU_ENABLED))
+               return;
+
+       apicid = mpc_apic_id(m, translation_table[mpc_record]);
+
+       if (m->mpc_featureflag&(1<<0))
+               Dprintk("    Floating point unit present.\n");
+       if (m->mpc_featureflag&(1<<7))
+               Dprintk("    Machine Exception supported.\n");
+       if (m->mpc_featureflag&(1<<8))
+               Dprintk("    64 bit compare & exchange supported.\n");
+       if (m->mpc_featureflag&(1<<9))
+               Dprintk("    Internal APIC present.\n");
+       if (m->mpc_featureflag&(1<<11))
+               Dprintk("    SEP present.\n");
+       if (m->mpc_featureflag&(1<<12))
+               Dprintk("    MTRR  present.\n");
+       if (m->mpc_featureflag&(1<<13))
+               Dprintk("    PGE  present.\n");
+       if (m->mpc_featureflag&(1<<14))
+               Dprintk("    MCA  present.\n");
+       if (m->mpc_featureflag&(1<<15))
+               Dprintk("    CMOV  present.\n");
+       if (m->mpc_featureflag&(1<<16))
+               Dprintk("    PAT  present.\n");
+       if (m->mpc_featureflag&(1<<17))
+               Dprintk("    PSE  present.\n");
+       if (m->mpc_featureflag&(1<<18))
+               Dprintk("    PSN  present.\n");
+       if (m->mpc_featureflag&(1<<19))
+               Dprintk("    Cache Line Flush Instruction present.\n");
+       /* 20 Reserved */
+       if (m->mpc_featureflag&(1<<21))
+               Dprintk("    Debug Trace and EMON Store present.\n");
+       if (m->mpc_featureflag&(1<<22))
+               Dprintk("    ACPI Thermal Throttle Registers  present.\n");
+       if (m->mpc_featureflag&(1<<23))
+               Dprintk("    MMX  present.\n");
+       if (m->mpc_featureflag&(1<<24))
+               Dprintk("    FXSR  present.\n");
+       if (m->mpc_featureflag&(1<<25))
+               Dprintk("    XMM  present.\n");
+       if (m->mpc_featureflag&(1<<26))
+               Dprintk("    Willamette New Instructions  present.\n");
+       if (m->mpc_featureflag&(1<<27))
+               Dprintk("    Self Snoop  present.\n");
+       if (m->mpc_featureflag&(1<<28))
+               Dprintk("    HT  present.\n");
+       if (m->mpc_featureflag&(1<<29))
+               Dprintk("    Thermal Monitor present.\n");
+       /* 30, 31 Reserved */
+
+
+       if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
+               Dprintk("    Bootup CPU\n");
+               boot_cpu_physical_apicid = m->mpc_apicid;
+               boot_cpu_logical_apicid = apicid;
+       }
+
+       if (num_processors >= NR_CPUS) {
+               printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+                       "  Processor ignored.\n", NR_CPUS);
+               return;
+       }
+
+       if (num_processors >= maxcpus) {
+               printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
+                       " Processor ignored.\n", maxcpus);
+               return;
+       }
+       num_processors++;
+       ver = m->mpc_apicver;
+
+       if (!MP_valid_apicid(apicid, ver)) {
+               printk(KERN_WARNING "Processor #%d INVALID. (Max ID: %d).\n",
+                       m->mpc_apicid, MAX_APICS);
+               /* undo the count taken above: this CPU is not registered */
+               --num_processors;
+               return;
+       }
+
+       tmp = apicid_to_cpu_present(apicid);
+       physids_or(phys_cpu_present_map, phys_cpu_present_map, tmp);
+
+       /*
+        * Validate version
+        */
+       if (ver == 0x0) {
+               printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
+                       "fixing up to 0x10. (tell your hw vendor)\n",
+                       m->mpc_apicid);
+               ver = 0x10;
+       }
+       apic_version[m->mpc_apicid] = ver;
+       bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
+}
+#else
+/* CONFIG_XEN build: only the processor count is maintained. */
+void __init MP_processor_info (struct mpc_config_processor *m)
+{
+       num_processors++;
+}
+#endif /* CONFIG_XEN */
+
+/*
+ * Record one bus entry: classify the 6-character bus type string and
+ * store the result in the mp_bus_id_to_* tables, indexed by the entry's
+ * bus ID.  PCI buses additionally get the next sequential PCI bus number.
+ * Unknown bus types are reported and ignored.
+ */
+static void __init MP_bus_info (struct mpc_config_bus *m)
+{
+       char str[7];
+
+       memcpy(str, m->mpc_bustype, 6);
+       str[6] = 0;
+
+       /*
+        * The bus ID comes straight from the table and indexes arrays of
+        * MAX_MP_BUSSES entries below; reject out-of-range values before
+        * anything is written.
+        */
+       if (m->mpc_busid >= MAX_MP_BUSSES) {
+               printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
+                       "is too large, max. supported is %d\n",
+                       m->mpc_busid, str, MAX_MP_BUSSES - 1);
+               return;
+       }
+
+       mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+
+       if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
+               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
+       } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
+               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+       } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
+               mpc_oem_pci_bus(m, translation_table[mpc_record]);
+               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+               mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
+               mp_current_pci_id++;
+       } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
+               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
+       } else if (strncmp(str, BUSTYPE_NEC98, sizeof(BUSTYPE_NEC98)-1) == 0) {
+               mp_bus_id_to_type[m->mpc_busid] = MP_BUS_NEC98;
+       } else {
+               printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+       }
+}
+
+/*
+ * Record one I/O APIC entry in mp_ioapics[].  Entries without
+ * MPC_APIC_USABLE or with a zero address are skipped; running out of
+ * slots (MAX_IO_APICS) is fatal.
+ */
+static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
+{
+       if (!(m->mpc_flags & MPC_APIC_USABLE))
+               return;
+
+       printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n",
+               m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+       if (nr_ioapics >= MAX_IO_APICS) {
+               printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
+                       MAX_IO_APICS, nr_ioapics);
+               panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
+       }
+       if (!m->mpc_apicaddr) {
+               printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
+                       " found in MP table, skipping!\n");
+               return;
+       }
+       mp_ioapics[nr_ioapics] = *m;
+       nr_ioapics++;
+}
+
+/*
+ * Append one interrupt source entry to mp_irqs[] and bump
+ * mp_irq_entries.  Panics once the count reaches MAX_IRQ_SOURCES.
+ */
+static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
+{
+       int slot = mp_irq_entries;
+
+       mp_irqs[slot] = *m;
+       Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
+               " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+                       m->mpc_irqtype, m->mpc_irqflag & 3,
+                       (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
+                       m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+
+       mp_irq_entries = slot + 1;
+       if (mp_irq_entries == MAX_IRQ_SOURCES)
+               panic("Max # of irq sources exceeded!!\n");
+}
+
+/*
+ * Check one local interrupt source entry.  Nothing is stored; the entry
+ * is only verified against the wiring this code assumes.
+ */
+static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
+{
+       int type = m->mpc_irqtype;
+       int lint = m->mpc_destapiclint;
+
+       Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
+               " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
+                       m->mpc_irqtype, m->mpc_irqflag & 3,
+                       (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
+                       m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+       /*
+        * All SMP boards seen so far route ExtINT to LINT0 and NMI to
+        * LINT1.  The checks below trip if a table ever says otherwise,
+        * so no extra handling is carried until that happens.
+        */
+       if (type == mp_ExtINT && lint != 0)
+               BUG();
+       if (type == mp_NMI && lint != 1)
+               BUG();
+}
+
+#ifdef CONFIG_X86_NUMAQ
+/*
+ * Record one OEM translation entry: log it, stash the pointer in
+ * translation_table[] at index mpc_record (if it fits), and mark the
+ * entry's quad online as a node when it is a valid node number.
+ */
+static void __init MP_translation_info (struct mpc_config_translation *m)
+{
+       printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n",
+               mpc_record, m->trans_type, m->trans_quad, m->trans_global,
+               m->trans_local);
+
+       if (mpc_record >= MAX_MPC_ENTRY)
+               printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+       else
+               translation_table[mpc_record] = m; /* stash this for later */
+       if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+               node_set_online(m->trans_quad);
+}
+
+/*
+ * Read/parse the MPC oem tables
+ *
+ * Verifies the OEM table signature and checksum, then walks the entries
+ * that follow the header, passing each MP_TRANSLATION record to
+ * MP_translation_info().  Parsing stops at the first entry of any other
+ * type.  The oemsize argument is not used; the walk is bounded by the
+ * table's own oem_length field.
+ */
+
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
+       unsigned short oemsize)
+{
+       int count = sizeof (*oemtable); /* the header size */
+       unsigned char *oemptr = ((unsigned char *)oemtable)+count;
+
+       mpc_record = 0;
+       printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
+               oemtable);
+       if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
+       {
+               printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+                       oemtable->oem_signature[0],
+                       oemtable->oem_signature[1],
+                       oemtable->oem_signature[2],
+                       oemtable->oem_signature[3]);
+               return;
+       }
+       if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
+       {
+               printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+               return;
+       }
+       while (count < oemtable->oem_length) {
+               switch (*oemptr) {
+                       case MP_TRANSLATION:
+                       {
+                               struct mpc_config_translation *m=
+                                       (struct mpc_config_translation *)oemptr;
+                               MP_translation_info(m);
+                               oemptr += sizeof(*m);
+                               count += sizeof(*m);
+                               ++mpc_record;
+                               break;
+                       }
+                       default:
+                       {
+                               printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n",
+                                       (int) *oemptr);
+                               return;
+                       }
+               }
+       }
+}
+
+/*
+ * OEM hook called while parsing the MP configuration table: warn when
+ * the OEM ID does not start with "IBM NUMA", then parse the OEM table
+ * if the header points at one.  productid is not examined.
+ */
+static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
+               char *productid)
+{
+       if (strncmp(oem, "IBM NUMA", 8) != 0)
+               printk("Warning!  May not be a NUMA-Q system!\n");
+
+       if (!mpc->mpc_oemptr)
+               return;
+
+       smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
+                       mpc->mpc_oemsize);
+}
+#endif /* CONFIG_X86_NUMAQ */
+
+/*
+ * Read/parse the MPC
+ *
+ * Validates the table header (signature, checksum, spec revision and
+ * local APIC address), logs the OEM and product IDs, then walks the
+ * entries that follow the header and hands each one to the matching
+ * MP_*_info() routine.
+ *
+ * Returns 0 if the header is rejected, otherwise num_processors (which
+ * may itself be 0; that case is logged).
+ */
+
+static int __init smp_read_mpc(struct mp_config_table *mpc)
+{
+       char str[16];
+       char oem[10];
+       int count=sizeof(*mpc);
+       unsigned char *mpt=((unsigned char *)mpc)+count;
+
+       if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
+               printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
+                       *(u32 *)mpc->mpc_signature);
+               return 0;
+       }
+       if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
+               printk(KERN_ERR "SMP mptable: checksum error!\n");
+               return 0;
+       }
+       if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
+               printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
+                       mpc->mpc_spec);
+               return 0;
+       }
+       if (!mpc->mpc_lapic) {
+               printk(KERN_ERR "SMP mptable: null local APIC address!\n");
+               return 0;
+       }
+       /* the ID fields are copied and NUL-terminated here before printing */
+       memcpy(oem,mpc->mpc_oem,8);
+       oem[8]=0;
+       printk(KERN_INFO "OEM ID: %s ",oem);
+
+       memcpy(str,mpc->mpc_productid,12);
+       str[12]=0;
+       printk("Product ID: %s ",str);
+
+       mps_oem_check(mpc, oem, str);
+
+       printk("APIC at: 0x%lX\n",mpc->mpc_lapic);
+
+       /*
+        * Save the local APIC address (it might be non-default) -- but only
+        * if we're not using ACPI.
+        */
+       if (!acpi_lapic)
+               mp_lapic_addr = mpc->mpc_lapic;
+
+       /*
+        *      Now process the configuration blocks.
+        *
+        *      mpt points at the current entry and count is its offset
+        *      from the start of the table; both advance by the size of
+        *      the entry just handled.  mpc_record counts loop iterations.
+        */
+       mpc_record = 0;
+       while (count < mpc->mpc_length) {
+               switch(*mpt) {
+                       case MP_PROCESSOR:
+                       {
+                               struct mpc_config_processor *m=
+                                       (struct mpc_config_processor *)mpt;
+                               /* ACPI may have already provided this data */
+                               if (!acpi_lapic)
+                                       MP_processor_info(m);
+                               mpt += sizeof(*m);
+                               count += sizeof(*m);
+                               break;
+                       }
+                       case MP_BUS:
+                       {
+                               struct mpc_config_bus *m=
+                                       (struct mpc_config_bus *)mpt;
+                               MP_bus_info(m);
+                               mpt += sizeof(*m);
+                               count += sizeof(*m);
+                               break;
+                       }
+                       case MP_IOAPIC:
+                       {
+                               struct mpc_config_ioapic *m=
+                                       (struct mpc_config_ioapic *)mpt;
+                               MP_ioapic_info(m);
+                               mpt+=sizeof(*m);
+                               count+=sizeof(*m);
+                               break;
+                       }
+                       case MP_INTSRC:
+                       {
+                               struct mpc_config_intsrc *m=
+                                       (struct mpc_config_intsrc *)mpt;
+
+                               MP_intsrc_info(m);
+                               mpt+=sizeof(*m);
+                               count+=sizeof(*m);
+                               break;
+                       }
+                       case MP_LINTSRC:
+                       {
+                               struct mpc_config_lintsrc *m=
+                                       (struct mpc_config_lintsrc *)mpt;
+                               MP_lintsrc_info(m);
+                               mpt+=sizeof(*m);
+                               count+=sizeof(*m);
+                               break;
+                       }
+                       default:
+                       {
+                               /* unknown entry type: force the loop to end */
+                               count = mpc->mpc_length;
+                               break;
+                       }
+               }
+               ++mpc_record;
+       }
+       clustered_apic_check();
+       if (!num_processors)
+               printk(KERN_ERR "SMP mptable: no processors registered!\n");
+       return num_processors;
+}
+
+/*
+ * Return the ELCR bit for the given IRQ (0 or 1).  The register is read
+ * from port 0x4d0 + irq/8 and bit irq%8 is extracted; callers treat a
+ * set bit as "level triggered".
+ */
+static int __init ELCR_trigger(unsigned int irq)
+{
+       unsigned int reg = inb(0x4d0 + (irq >> 3));
+
+       return (reg >> (irq & 7)) & 1;
+}
+
+/*
+ * Synthesise interrupt source entries for the 16 legacy IRQs and feed
+ * them to MP_intsrc_info(), followed by one ExtINT entry.  All entries
+ * target the first I/O APIC (mp_ioapics[0]).
+ *
+ * mpc_default_type selects the variations: type 2 leaves out IRQ0 and
+ * IRQ13, type 5 tries to take trigger modes from the ELCR, and IRQ2 is
+ * always left out.
+ */
+static void __init construct_default_ioirq_mptable(int mpc_default_type)
+{
+       struct mpc_config_intsrc intsrc;
+       int i;
+       int ELCR_fallback = 0;
+
+       intsrc.mpc_type = MP_INTSRC;
+       intsrc.mpc_irqflag = 0;                 /* conforming */
+       intsrc.mpc_srcbus = 0;
+       intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
+
+       intsrc.mpc_irqtype = mp_INT;
+
+       /*
+        *  If true, we have an ISA/PCI system with no IRQ entries
+        *  in the MP table. To prevent the PCI interrupts from being set up
+        *  incorrectly, we try to use the ELCR. The sanity check to see if
+        *  there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
+        *  never be level sensitive, so we simply see if the ELCR agrees.
+        *  If it does, we assume it's valid.
+        */
+       if (mpc_default_type == 5) {
+               printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
+                       "falling back to ELCR\n");
+
+               if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
+                   ELCR_trigger(13))
+                       printk(KERN_WARNING "ELCR contains invalid data... not "
+                               "using ELCR\n");
+               else {
+                       printk(KERN_INFO "Using ELCR to identify PCI "
+                               "interrupts\n");
+                       ELCR_fallback = 1;
+               }
+       }
+
+       for (i = 0; i < 16; i++) {
+               switch (mpc_default_type) {
+               case 2:
+                       if (i == 0 || i == 13)
+                               continue;       /* IRQ0 & IRQ13 not connected */
+                       /* fall through */
+               default:
+                       if (i == 2)
+                               continue;       /* IRQ2 is never connected */
+               }
+
+               if (ELCR_fallback) {
+                       /*
+                        *  If the ELCR indicates a level-sensitive interrupt,
+                        *  we copy that information over to the MP table in
+                        *  the irqflag field (level sensitive, active high
+                        *  polarity).
+                        */
+                       if (ELCR_trigger(i))
+                               intsrc.mpc_irqflag = 13;
+                       else
+                               intsrc.mpc_irqflag = 0;
+               }
+
+               intsrc.mpc_srcbusirq = i;
+               intsrc.mpc_dstirq = i ? i : 2;          /* IRQ0 to INTIN2 */
+               MP_intsrc_info(&intsrc);
+       }
+
+       intsrc.mpc_irqtype = mp_ExtINT;
+       intsrc.mpc_srcbusirq = 0;
+       intsrc.mpc_dstirq = 0;                          /* 8259A to INTIN0 */
+       MP_intsrc_info(&intsrc);
+}
+
+/*
+ * Build the configuration for one of the numbered "default" MP
+ * configurations (mpc_default_type) by synthesising the entries a real
+ * table would contain and passing them to the MP_*_info() routines:
+ * two processors, one or two buses, one I/O APIC, the legacy IRQ
+ * routing, and the two local interrupt sources.
+ */
+static inline void __init construct_default_ISA_mptable(int mpc_default_type)
+{
+       struct mpc_config_processor processor;
+       struct mpc_config_bus bus;
+       struct mpc_config_ioapic ioapic;
+       struct mpc_config_lintsrc lintsrc;
+       int linttypes[2] = { mp_ExtINT, mp_NMI };
+       int i;
+
+       /*
+        * local APIC has default address
+        */
+       mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+       /*
+        * 2 CPUs, numbered 0 & 1.
+        */
+       processor.mpc_type = MP_PROCESSOR;
+       /* Either an integrated APIC or a discrete 82489DX. */
+       processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+       processor.mpc_cpuflag = CPU_ENABLED;
+       /* signature and feature flags are taken from the boot CPU */
+       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+                                  (boot_cpu_data.x86_model << 4) |
+                                  boot_cpu_data.x86_mask;
+       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+       processor.mpc_reserved[0] = 0;
+       processor.mpc_reserved[1] = 0;
+       for (i = 0; i < 2; i++) {
+               processor.mpc_apicid = i;
+               MP_processor_info(&processor);
+       }
+
+       /* bus 0: its type depends on the default configuration number */
+       bus.mpc_type = MP_BUS;
+       bus.mpc_busid = 0;
+       switch (mpc_default_type) {
+               default:
+                       printk("???\n");
+                       printk(KERN_ERR "Unknown standard configuration %d\n",
+                               mpc_default_type);
+                       /* fall through */
+               case 1:
+               case 5:
+                       memcpy(bus.mpc_bustype, "ISA   ", 6);
+                       break;
+               case 2:
+               case 6:
+               case 3:
+                       memcpy(bus.mpc_bustype, "EISA  ", 6);
+                       break;
+               case 4:
+               case 7:
+                       memcpy(bus.mpc_bustype, "MCA   ", 6);
+       }
+       MP_bus_info(&bus);
+       /* configurations above 4 also get a PCI bus as bus 1 */
+       if (mpc_default_type > 4) {
+               bus.mpc_busid = 1;
+               memcpy(bus.mpc_bustype, "PCI   ", 6);
+               MP_bus_info(&bus);
+       }
+
+       /* a single I/O APIC with ID 2 at 0xFEC00000 */
+       ioapic.mpc_type = MP_IOAPIC;
+       ioapic.mpc_apicid = 2;
+       ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
+       ioapic.mpc_flags = MPC_APIC_USABLE;
+       ioapic.mpc_apicaddr = 0xFEC00000;
+       MP_ioapic_info(&ioapic);
+
+       /*
+        * We set up most of the low 16 IO-APIC pins according to MPS rules.
+        */
+       construct_default_ioirq_mptable(mpc_default_type);
+
+       /* local interrupts: ExtINT on LINT0 and NMI on LINT1, for all APICs */
+       lintsrc.mpc_type = MP_LINTSRC;
+       lintsrc.mpc_irqflag = 0;                /* conforming */
+       lintsrc.mpc_srcbusid = 0;
+       lintsrc.mpc_srcbusirq = 0;
+       lintsrc.mpc_destapic = MP_APIC_ALL;
+       for (i = 0; i < 2; i++) {
+               lintsrc.mpc_irqtype = linttypes[i];
+               lintsrc.mpc_destapiclint = i;
+               MP_lintsrc_info(&lintsrc);
+       }
+}
+
+static struct intel_mp_floating *mpf_found;
+
+/*
+ * Build the SMP configuration from the MP floating pointer structure
+ * recorded in mpf_found.
+ *
+ * Does nothing when ACPI already supplied both the local APIC and
+ * I/O APIC information.  Otherwise it sets pic_mode and then either
+ * constructs one of the default configurations (mpf_feature1 != 0) or
+ * parses the configuration table at mpf_physptr.  A table that fails to
+ * parse clears smp_found_config.
+ */
+void __init get_smp_config (void)
+{
+       struct intel_mp_floating *mpf = mpf_found;
+
+       /*
+        * ACPI may be used to obtain the entire SMP configuration or just to
+        * enumerate/configure processors (CONFIG_ACPI_BOOT).  Note that
+        * ACPI supports both logical (e.g. Hyper-Threading) and physical
+        * processors, where MPS only supports physical.
+        */
+       if (acpi_lapic && acpi_ioapic) {
+               printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+                       "information\n");
+               return;
+       }
+       else if (acpi_lapic)
+               printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+                       "configuration information\n");
+
+       printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
+               mpf->mpf_specification);
+       if (mpf->mpf_feature2 & (1<<7)) {
+               printk(KERN_INFO "    IMCR and PIC compatibility mode.\n");
+               pic_mode = 1;
+       } else {
+               printk(KERN_INFO "    Virtual Wire compatibility mode.\n");
+               pic_mode = 0;
+       }
+
+       /*
+        * Now see if we need to read further.
+        */
+       if (mpf->mpf_feature1 != 0) {
+
+               printk(KERN_INFO "Default MP configuration #%d\n",
+                       mpf->mpf_feature1);
+               construct_default_ISA_mptable(mpf->mpf_feature1);
+
+       } else if (mpf->mpf_physptr) {
+
+               /*
+                * Read the physical hardware table.  Anything here will
+                * override the defaults.
+                */
+               if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
+                       smp_found_config = 0;
+                       printk(KERN_ERR "BIOS bug, MP table errors "
+                               "detected!...\n");
+                       printk(KERN_ERR "... disabling SMP support. (tell your "
+                               "hw vendor)\n");
+                       return;
+               }
+               /*
+                * If there are no explicit MP IRQ entries, then we are
+                * broken.  We set up most of the low 16 IO-APIC pins to
+                * ISA defaults and hope it will work.
+                */
+               if (!mp_irq_entries) {
+                       struct mpc_config_bus bus;
+
+                       printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+                               "using default mptable. (tell your hw vendor)\n");
+
+                       bus.mpc_type = MP_BUS;
+                       bus.mpc_busid = 0;
+                       memcpy(bus.mpc_bustype, "ISA   ", 6);
+                       MP_bus_info(&bus);
+
+                       construct_default_ioirq_mptable(0);
+               }
+
+       } else
+               BUG();
+
+       printk(KERN_INFO "Processors: %d\n", num_processors);
+       /*
+        * Only use the first configuration found.
+        */
+}
+
+/*
+ * Scan 'length' bytes starting at bus address 'base' for an Intel MP
+ * floating pointer structure, one 16-byte slot at a time.
+ *
+ * A slot is accepted when it starts with SMP_MAGIC_IDENT, has
+ * mpf_length == 1, checksums to zero over its 16 bytes and reports
+ * specification revision 1 or 4.  On a match smp_found_config is set,
+ * the structure is remembered in mpf_found and 1 is returned; if no slot
+ * matches, 0 is returned.
+ */
+static int __init smp_scan_config (unsigned long base, unsigned long length)
+{
+       unsigned long *bp = isa_bus_to_virt(base);
+       struct intel_mp_floating *mpf;
+
+       Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
+       if (sizeof(*mpf) != 16)
+               printk("Error: MPF size\n");
+
+       /*
+        * 'length' is unsigned, so insist on a whole 16-byte slot: with a
+        * plain "> 0" test a length that is not a multiple of 16 would wrap
+        * around instead of terminating the scan.
+        */
+       while (length >= 16) {
+               mpf = (struct intel_mp_floating *)bp;
+               if ((*bp == SMP_MAGIC_IDENT) &&
+                       (mpf->mpf_length == 1) &&
+                       !mpf_checksum((unsigned char *)bp, 16) &&
+                       ((mpf->mpf_specification == 1)
+                               || (mpf->mpf_specification == 4)) ) {
+
+                       smp_found_config = 1;
+#ifndef CONFIG_XEN
+                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
+                                               virt_to_phys(mpf));
+                       reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
+                       if (mpf->mpf_physptr) {
+                               /*
+                                * We cannot access to MPC table to compute
+                                * table size yet, as only few megabytes from
+                                * the bottom is mapped now.
+                                * PC-9800's MPC table places on the very last
+                                * of physical memory; so that simply reserving
+                                * PAGE_SIZE from mpf->mpf_physptr yields BUG()
+                                * in reserve_bootmem.
+                                */
+                               unsigned long size = PAGE_SIZE;
+                               unsigned long end = max_low_pfn * PAGE_SIZE;
+                               if (mpf->mpf_physptr + size > end)
+                                       size = end - mpf->mpf_physptr;
+                               reserve_bootmem(mpf->mpf_physptr, size);
+                       }
+#else
+                       /* Report the bus address of the slot that matched. */
+                       printk(KERN_INFO "found SMP MP-table at %08lx\n",
+                               ((unsigned long)bp -
+                                (unsigned long)isa_bus_to_virt(base)) + base);
+#endif
+
+                       mpf_found = mpf;
+                       return 1;
+               }
+               /* Advance one slot; assumes 4-byte longs, as on i386. */
+               bp += 4;
+               length -= 16;
+       }
+       return 0;
+}
+
+/*
+ * Look for the MP floating pointer structure in the conventional low
+ * memory locations, stopping at the first scan that succeeds.  The EBDA
+ * is only consulted when CONFIG_XEN is not defined.
+ */
+void __init find_smp_config (void)
+{
+#ifndef CONFIG_XEN
+       /* Only the EBDA scan below needs this; avoid an unused variable. */
+       unsigned int address;
+#endif
+
+       /*
+        * FIXME: Linux assumes you have 640K of base ram..
+        * this continues the error...
+        *
+        * 1) Scan the bottom 1K for a signature
+        * 2) Scan the top 1K of base RAM
+        * 3) Scan the 64K of bios
+        */
+       if (smp_scan_config(0x0,0x400) ||
+               smp_scan_config(639*0x400,0x400) ||
+                       smp_scan_config(0xF0000,0x10000))
+               return;
+       /*
+        * If it is an SMP machine we should know now, unless the
+        * configuration is in an EISA/MCA bus machine with an
+        * extended bios data area.
+        *
+        * there is a real-mode segmented pointer pointing to the
+        * 4K EBDA area at 0x40E, calculate and scan it here.
+        *
+        * NOTE! There are Linux loaders that will corrupt the EBDA
+        * area, and as such this kind of SMP config may be less
+        * trustworthy, simply because the SMP table may have been
+        * stomped on during early boot. These loaders are buggy and
+        * should be fixed.
+        *
+        * MP1.4 SPEC states to only scan first 1K of 4K EBDA.
+        */
+
+#ifndef CONFIG_XEN
+       address = get_bios_ebda();
+       if (address)
+               smp_scan_config(address, 0x400);
+#endif
+}
+
+/* --------------------------------------------------------------------------
+                            ACPI-based MP Configuration
+   ------------------------------------------------------------------------ */
+
+#ifdef CONFIG_ACPI_BOOT
+
+/*
+ * Remember the local APIC address handed in by the caller and, if the
+ * boot CPU's physical APIC id is still unset (-1U), read it from the
+ * APIC_ID register.  The whole body is compiled out under CONFIG_XEN,
+ * where this function does nothing.
+ */
+void __init mp_register_lapic_address (
+       u64                     address)
+{
+#ifndef CONFIG_XEN
+       /* The 64-bit value is narrowed to unsigned long here. */
+       mp_lapic_addr = (unsigned long) address;
+
+       if (boot_cpu_physical_apicid == -1U) {
+               boot_cpu_physical_apicid =
+                       GET_APIC_ID(apic_read(APIC_ID));
+       }
+
+       Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
+#endif
+}
+
+
+/*
+ * Register one local APIC (processor).
+ *
+ * id:      APIC id of the processor; rejected with a warning when it is
+ *          not below MAX_APICS.
+ * enabled: non-zero if the processor is usable.
+ *
+ * When CONFIG_XEN is not defined an MP-table style processor record is
+ * filled in from the boot CPU's data before it is handed to
+ * MP_processor_info(); under CONFIG_XEN the record is passed on without
+ * being filled in.
+ */
+void __init mp_register_lapic (
+       u8                      id, 
+       u8                      enabled)
+{
+       struct mpc_config_processor processor;
+       int                     is_bsp;
+
+       /* Kept as a subtraction, exactly as the range test was written. */
+       if (MAX_APICS - id <= 0) {
+               printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
+                       id, MAX_APICS);
+               return;
+       }
+
+       is_bsp = (id == boot_cpu_physical_apicid) ? 1 : 0;
+
+#ifndef CONFIG_XEN
+       processor.mpc_type = MP_PROCESSOR;
+       processor.mpc_apicid = id;
+       processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
+
+       processor.mpc_cpuflag = 0;
+       if (enabled)
+               processor.mpc_cpuflag |= CPU_ENABLED;
+       if (is_bsp)
+               processor.mpc_cpuflag |= CPU_BOOTPROCESSOR;
+
+       /* family / model / stepping packed as in an MP table entry */
+       processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
+               (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
+       processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
+       processor.mpc_reserved[0] = 0;
+       processor.mpc_reserved[1] = 0;
+#endif
+
+       MP_processor_info(&processor);
+}
+
+#if defined(CONFIG_X86_IO_APIC) && \
+       (defined(CONFIG_ACPI_INTERPRETER) || defined(CONFIG_ACPI_BOOT))
+
+#define MP_ISA_BUS             0
+#define MP_MAX_IOAPIC_PIN      127
+
+/*
+ * Per-IOAPIC GSI bookkeeping, indexed like mp_ioapics[].
+ *
+ * apic_id:        APIC id copied from the matching mp_ioapics[] entry
+ * gsi_base:       first GSI routed through this IOAPIC
+ * gsi_end:        last GSI routed through this IOAPIC (inclusive)
+ * pin_programmed: one bit per pin (4 * 32 bits), set once the pin has
+ *                 been programmed so it is not programmed a second time
+ */
+static struct mp_ioapic_routing {
+       int                     apic_id;
+       int                     gsi_base;
+       int                     gsi_end;
+       u32                     pin_programmed[4];
+} mp_ioapic_routing[MAX_IO_APICS];
+
+
+/*
+ * Return the index of the IOAPIC whose [gsi_base, gsi_end] range contains
+ * 'gsi'.  If no registered IOAPIC covers it, log an error and return -1.
+ */
+static int mp_find_ioapic (
+       int                     gsi)
+{
+       int                     apic;
+
+       for (apic = 0; apic < nr_ioapics; apic++) {
+               const struct mp_ioapic_routing *r = &mp_ioapic_routing[apic];
+
+               /* Skip IOAPICs whose GSI window does not include gsi. */
+               if (gsi < r->gsi_base || gsi > r->gsi_end)
+                       continue;
+
+               return apic;
+       }
+
+       printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
+
+       return -1;
+}
+       
+
+/*
+ * Register one I/O APIC.
+ *
+ * id:       APIC id proposed for the IOAPIC; the id actually stored is
+ *           whatever io_apic_get_unique_id() returns for it.
+ * address:  address of the IOAPIC; a zero address is reported and the
+ *           entry is skipped.
+ * gsi_base: first global system interrupt handled by this IOAPIC.
+ *
+ * Panics if MAX_IO_APICS entries are already registered.  On success a
+ * new mp_ioapics[] slot and the matching mp_ioapic_routing[] slot are
+ * filled in and nr_ioapics is incremented.
+ */
+void __init mp_register_ioapic (
+       u8                      id, 
+       u32                     address,
+       u32                     gsi_base)
+{
+       int                     idx = 0;
+
+       if (nr_ioapics >= MAX_IO_APICS) {
+               printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+                       "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+               panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+       }
+       if (!address) {
+               printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+                       " found in MADT table, skipping!\n");
+               return;
+       }
+
+       /* Claim the next free slot; it is used as the index from here on. */
+       idx = nr_ioapics++;
+
+       mp_ioapics[idx].mpc_type = MP_IOAPIC;
+       mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
+       mp_ioapics[idx].mpc_apicaddr = address;
+
+       /* The address is stored before the id/version helpers are called. */
+       mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
+       mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
+       
+       /* 
+        * Build basic GSI lookup table to facilitate gsi->io_apic lookups
+        * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
+        */
+       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
+       mp_ioapic_routing[idx].gsi_base = gsi_base;
+       /*
+        * NOTE(review): mp_find_ioapic() treats gsi_end as inclusive, so this
+        * presumes io_apic_get_redir_entries() returns the highest entry
+        * index rather than a count -- confirm.
+        */
+       mp_ioapic_routing[idx].gsi_end = gsi_base + 
+               io_apic_get_redir_entries(idx);
+
+       printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, "
+               "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, 
+               mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
+               mp_ioapic_routing[idx].gsi_base,
+               mp_ioapic_routing[idx].gsi_end);
+
+       return;
+}
+
+
+/*
+ * Record an interrupt source override for a legacy (ISA) IRQ.
+ *
+ * bus_irq:  ISA IRQ number being overridden
+ * polarity: polarity bits, stored in bits 0-1 of mpc_irqflag
+ * trigger:  trigger-mode bits, stored in bits 2-3 of mpc_irqflag
+ * gsi:      global system interrupt the IRQ is routed to
+ *
+ * The GSI is translated to an (IOAPIC, pin) pair and a new entry is
+ * appended to mp_irqs[].  Nothing is recorded if no IOAPIC covers the
+ * GSI.  Panics once the table reaches MAX_IRQ_SOURCES entries.
+ */
+void __init mp_override_legacy_irq (
+       u8                      bus_irq,
+       u8                      polarity, 
+       u8                      trigger, 
+       u32                     gsi)
+{
+       struct mpc_config_intsrc intsrc;
+       int                     apic_idx;
+
+       /* Convert 'gsi' to 'ioapic.pin'. */
+       apic_idx = mp_find_ioapic(gsi);
+       if (apic_idx < 0)
+               return;
+
+       /*
+        * TBD: This check is for faulty timer entries, where the override
+        *      erroneously sets the trigger to level, resulting in a HUGE
+        *      increase of timer interrupts!
+        */
+       if (bus_irq == 0 && trigger == 3)
+               trigger = 1;
+
+       intsrc.mpc_type      = MP_INTSRC;
+       intsrc.mpc_irqtype   = mp_INT;
+       intsrc.mpc_irqflag   = (trigger << 2) | polarity;
+       intsrc.mpc_srcbus    = MP_ISA_BUS;
+       /* IRQ */
+       intsrc.mpc_srcbusirq = bus_irq;
+       /* APIC ID */
+       intsrc.mpc_dstapic   = mp_ioapics[apic_idx].mpc_apicid;
+       /* INTIN#: pin number relative to this IOAPIC's first GSI */
+       intsrc.mpc_dstirq    = gsi - mp_ioapic_routing[apic_idx].gsi_base;
+
+       Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
+               intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
+               (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
+               intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
+
+       /* Append the entry, then make sure the table has not filled up. */
+       mp_irqs[mp_irq_entries++] = intsrc;
+       if (mp_irq_entries == MAX_IRQ_SOURCES)
+               panic("Max # of irq sources exceeded!\n");
+}
+
+/*
+ * Platform flag, not assigned in this part of the file.  A value of 1 makes
+ * mp_config_acpi_legacy_irqs() skip the legacy identity mappings ("older
+ * generations of ES7000").
+ */
+int es7000_plat;
+
+/*
+ * Create default, identity-mapped interrupt source entries for the legacy
+ * ISA IRQs 0-15 on the IOAPIC that handles GSI 0.
+ *
+ * An IRQ is skipped when mp_irqs[] already holds an entry for the same ISA
+ * IRQ or for the same IOAPIC pin, i.e. when an override was registered
+ * earlier.  Nothing is added when es7000_plat == 1 or when no IOAPIC
+ * handles GSI 0.  Panics once mp_irqs[] reaches MAX_IRQ_SOURCES entries.
+ */
+void __init mp_config_acpi_legacy_irqs (void)
+{
+       struct mpc_config_intsrc intsrc;
+       int                     i = 0;
+       int                     ioapic = -1;
+
+       /*
+        * Fabricate the legacy ISA bus (bus #MP_ISA_BUS).
+        */
+       mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+       Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+
+       /*
+        * Older generations of ES7000 have no legacy identity mappings
+        */
+       if (es7000_plat == 1)
+               return;
+
+       /*
+        * Locate the IOAPIC that manages the ISA IRQs (0-15).
+        */
+       ioapic = mp_find_ioapic(0);
+       if (ioapic < 0)
+               return;
+
+       /* Fields shared by every entry fabricated below. */
+       intsrc.mpc_type = MP_INTSRC;
+       intsrc.mpc_irqflag = 0;                                 /* Conforming */
+       intsrc.mpc_srcbus = MP_ISA_BUS;
+       intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
+
+       /*
+        * Use the default configuration for the IRQs 0-15.  Unless
+        * overridden by (MADT) interrupt source override entries.
+        */
+       for (i = 0; i < 16; i++) {
+               int idx;
+
+               for (idx = 0; idx < mp_irq_entries; idx++) {
+                       struct mpc_config_intsrc *irq = mp_irqs + idx;
+
+                       /* Do we already have a mapping for this ISA IRQ? */
+                       if (irq->mpc_srcbus == MP_ISA_BUS &&
+                               irq->mpc_srcbusirq == i)
+                               break;
+
+                       /* Do we already have a mapping for this IOAPIC pin */
+                       if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
+                               (irq->mpc_dstirq == i))
+                               break;
+               }
+
+               /* The search stopped early, so an existing entry matched. */
+               if (idx != mp_irq_entries) {
+                       printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
+                       continue;                       /* IRQ already used */
+               }
+
+               intsrc.mpc_irqtype = mp_INT;
+               intsrc.mpc_srcbusirq = i;                  /* Identity mapped */
+               intsrc.mpc_dstirq = i;
+
+               Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
+                       "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
+                       (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
+                       intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
+                       intsrc.mpc_dstirq);
+
+               mp_irqs[mp_irq_entries] = intsrc;
+               if (++mp_irq_entries == MAX_IRQ_SOURCES)
+                       panic("Max # of irq sources exceeded!\n");
+       }
+}
+
+/*
+ * Program the IOAPIC pin that serves 'gsi', unless it was programmed
+ * before.
+ *
+ * gsi:             global system interrupt number
+ * edge_level:      ACPI_EDGE_SENSITIVE selects edge triggering, any other
+ *                  value level triggering
+ * active_high_low: ACPI_ACTIVE_HIGH selects active-high polarity, any
+ *                  other value active-low
+ *
+ * Returns the IRQ number for the GSI: 'gsi' itself, or the value returned
+ * by ioapic_renumber_irq() when that hook is set.  The GSI is also
+ * returned, without programming anything, when it is the ACPI SCI
+ * (CONFIG_ACPI_BUS), when no IOAPIC covers it, when the pin number is out
+ * of range, or when the pin was already programmed.
+ */
+int mp_register_gsi (u32 gsi, int edge_level, int active_high_low)
+{
+       int                     ioapic = -1;
+       int                     ioapic_pin = 0;
+       int                     idx, bit = 0;
+
+#ifdef CONFIG_ACPI_BUS
+       /* Don't set up the ACPI SCI because it's already set up */
+       if (acpi_fadt.sci_int == gsi)
+               return gsi;
+#endif
+
+       ioapic = mp_find_ioapic(gsi);
+       if (ioapic < 0) {
+               printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
+               return gsi;
+       }
+
+       /* Pin number relative to this IOAPIC's first GSI. */
+       ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+
+       if (ioapic_renumber_irq)
+               gsi = ioapic_renumber_irq(ioapic, gsi);
+
+       /*
+        * Avoid pin reprogramming.  PRTs typically include entries
+        * with redundant pin->gsi mappings (but unique PCI devices);
+        * we only program the IOAPIC on the first.
+        */
+       bit = ioapic_pin % 32;
+       idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
+       if (idx > 3) {
+               printk(KERN_ERR "Invalid reference to IOAPIC pin "
+                       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+                       ioapic_pin);
+               return gsi;
+       }
+       /*
+        * Use an unsigned constant: bit can be 31, and shifting a signed 1
+        * into the sign bit is not well defined.  pin_programmed[] is u32.
+        */
+       if ((1U << bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
+               Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
+                       mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+               return gsi;
+       }
+
+       mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1U << bit);
+
+       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+                   edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1,
+                   active_high_low == ACPI_ACTIVE_HIGH ? 0 : 1);
+       return gsi;
+}
+
+#endif /*CONFIG_X86_IO_APIC && (CONFIG_ACPI_INTERPRETER || CONFIG_ACPI_BOOT)*/
+#endif /*CONFIG_ACPI_BOOT*/
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/pci-dma.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/pci-dma.c
@@ -0,0 +1,282 @@
+/*
+ * Dynamic DMA mapping support.
+ *
+ * On i386 there is no hardware dynamic DMA address translation,
+ * so consistent alloc/free are merely page allocation/freeing.
+ * The rest of the dynamic DMA mapping interface is implemented
+ * in asm/pci.h.
+ */
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/pci.h>
+#include <linux/version.h>
+#include <asm/io.h>
+#include <asm-xen/balloon.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Per-device coherent memory region set up by
+ * dma_declare_coherent_memory() and hung off dev->dma_mem.
+ */
+struct dma_coherent_mem {
+       void            *virt_base;     /* ioremap()ed base of the region */
+       u32             device_base;    /* region base as seen by the device */
+       int             size;           /* region size in pages */
+       int             flags;          /* DMA_MEMORY_* flags given at declare */
+       unsigned long   *bitmap;        /* page allocation bitmap */
+};
+
+/*
+ * Allocate 'size' bytes of zeroed, DMA-coherent memory.
+ *
+ * If the device has a declared coherent region, the allocation is tried
+ * there first; when that fails and the region is DMA_MEMORY_EXCLUSIVE,
+ * NULL is returned.  Otherwise pages come from the page allocator and are
+ * passed through xen_contig_memory().
+ *
+ * On success the CPU address is returned and the bus address is stored in
+ * *dma_handle; on failure NULL is returned and *dma_handle is untouched.
+ */
+void *dma_alloc_coherent(struct device *dev, size_t size,
+                          dma_addr_t *dma_handle, unsigned int __nocast gfp)
+{
+       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+       unsigned int order = get_order(size);
+       unsigned long vstart;
+       void *ret;
+
+       /* ignore region specifiers */
+       gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
+
+       if (mem) {
+               int page = bitmap_find_free_region(mem->bitmap, mem->size,
+                                                    order);
+               if (page >= 0) {
+                       int offset = page << PAGE_SHIFT;
+
+                       *dma_handle = mem->device_base + offset;
+                       ret = mem->virt_base + offset;
+                       memset(ret, 0, size);
+                       return ret;
+               }
+               if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+                       return NULL;
+       }
+
+       /* No device, or a coherent mask below 32 bits: use GFP_DMA. */
+       if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
+               gfp |= GFP_DMA;
+
+       vstart = __get_free_pages(gfp, order);
+       if (!vstart)
+               return NULL;
+
+       xen_contig_memory(vstart, order);
+
+       ret = (void *)vstart;
+       memset(ret, 0, size);
+       *dma_handle = virt_to_bus(ret);
+       return ret;
+}
+
+/*
+ * Free memory obtained from dma_alloc_coherent().
+ *
+ * If 'vaddr' lies inside the device's declared coherent region the pages
+ * are released in that region's bitmap; otherwise they are returned to
+ * the page allocator.  'dma_handle' is not used.
+ */
+void dma_free_coherent(struct device *dev, size_t size,
+                        void *vaddr, dma_addr_t dma_handle)
+{
+       struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+       int order = get_order(size);
+
+       if (mem && vaddr >= mem->virt_base &&
+           vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+               /* Page index of vaddr within the declared region. */
+               int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+
+               bitmap_release_region(mem->bitmap, page, order);
+       } else
+               free_pages((unsigned long)vaddr, order);
+}
+
+/*
+ * Declare a region of bus memory as the device's private pool for
+ * coherent allocations.
+ *
+ * dev:         device the region belongs to; must not already have one
+ * bus_addr:    bus address of the region, mapped with ioremap()
+ * device_addr: address of the region as seen by the device
+ * size:        size of the region in bytes
+ * flags:       DMA_MEMORY_* flags; at least one of DMA_MEMORY_MAP and
+ *              DMA_MEMORY_IO must be set
+ *
+ * Returns DMA_MEMORY_MAP or DMA_MEMORY_IO on success (MAP wins when both
+ * are requested) and 0 on failure.  On failure nothing stays mapped or
+ * allocated, and dev->dma_mem is left NULL unless the device already had
+ * a region.
+ */
+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+                               dma_addr_t device_addr, size_t size, int flags)
+{
+       void __iomem *mem_base;
+       int pages = size >> PAGE_SHIFT;
+       /* The bitmap helpers work on unsigned longs: size it in bytes. */
+       int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
+
+       if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
+               goto out;
+       if (!size)
+               goto out;
+       if (dev->dma_mem)
+               goto out;
+
+       /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
+
+       mem_base = ioremap(bus_addr, size);
+       if (!mem_base)
+               goto out;
+
+       dev->dma_mem = kmalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+       if (!dev->dma_mem)
+               goto unmap_out;
+       memset(dev->dma_mem, 0, sizeof(struct dma_coherent_mem));
+       dev->dma_mem->bitmap = kmalloc(bitmap_size, GFP_KERNEL);
+       if (!dev->dma_mem->bitmap)
+               goto free1_out;
+       memset(dev->dma_mem->bitmap, 0, bitmap_size);
+
+       dev->dma_mem->virt_base = mem_base;
+       dev->dma_mem->device_base = device_addr;
+       dev->dma_mem->size = pages;
+       dev->dma_mem->flags = flags;
+
+       if (flags & DMA_MEMORY_MAP)
+               return DMA_MEMORY_MAP;
+
+       return DMA_MEMORY_IO;
+
+ free1_out:
+       /* The bitmap allocation failed: drop the descriptor itself. */
+       kfree(dev->dma_mem);
+       dev->dma_mem = NULL;
+ unmap_out:
+       iounmap(mem_base);
+ out:
+       return 0;
+}
+EXPORT_SYMBOL(dma_declare_coherent_memory);
+
+/*
+ * Tear down the coherent region previously declared for 'dev', if any:
+ * detach it from the device, unmap it, and free its bitmap and
+ * descriptor.
+ */
+void dma_release_declared_memory(struct device *dev)
+{
+       struct dma_coherent_mem *region = dev->dma_mem;
+
+       if (region == NULL)
+               return;
+
+       /* Detach first so the device no longer refers to the region. */
+       dev->dma_mem = NULL;
+
+       iounmap(region->virt_base);
+       kfree(region->bitmap);
+       kfree(region);
+}
+EXPORT_SYMBOL(dma_release_declared_memory);
+
+/*
+ * Mark part of the device's declared coherent region as in use.
+ *
+ * dev:         device owning the declared region
+ * device_addr: device-side address of the first byte to reserve
+ * size:        number of bytes to reserve
+ *
+ * Returns the CPU address of the page holding 'device_addr' on success,
+ * ERR_PTR(-EINVAL) if the device has no declared region, or ERR_PTR(err)
+ * with the error reported by bitmap_allocate_region().
+ */
+void *dma_mark_declared_memory_occupied(struct device *dev,
+                                       dma_addr_t device_addr, size_t size)
+{
+       struct dma_coherent_mem *mem = dev->dma_mem;
+       /* Pages touched, counting the offset of device_addr in its page. */
+       int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1)
+                       >> PAGE_SHIFT;
+       int pos, err;
+
+       if (!mem)
+               return ERR_PTR(-EINVAL);
+
+       pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
+       /* get_order() takes a size in bytes, not a page count. */
+       err = bitmap_allocate_region(mem->bitmap, pos,
+                                    get_order(pages << PAGE_SHIFT));
+       if (err != 0)
+               return ERR_PTR(err);
+       return mem->virt_base + (pos << PAGE_SHIFT);
+}
+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+
+/*
+ * Bookkeeping for bounce-buffered mappings created by dma_map_single().
+ * Entries live on dma_map_head; the list is modified and walked with
+ * dma_map_lock held.
+ */
+static LIST_HEAD(dma_map_head);
+static DEFINE_SPINLOCK(dma_map_lock);
+struct dma_map_entry {
+       struct list_head list;  /* link on dma_map_head */
+       dma_addr_t dma;         /* bus address of the bounce buffer */
+       char *bounce, *host;    /* bounce buffer / caller's original buffer */
+       size_t size;            /* length of the mapping in bytes */
+};
+/* True when bus address d lies in [e->dma, e->dma + e->size). */
+#define DMA_MAP_MATCHES(e,d) (((e)->dma<=(d)) && (((e)->dma+(e)->size)>(d)))
+
+/*
+ * Map a kernel buffer for DMA and return its bus address.
+ *
+ * A buffer that stays within one page is mapped directly.  A buffer that
+ * crosses a page boundary is bounced through a buffer obtained from
+ * dma_alloc_coherent(); the bounce buffer is recorded on dma_map_head so
+ * that the unmap and sync calls can find it, and unless the direction is
+ * DMA_FROM_DEVICE the caller's data is copied into it.
+ *
+ * BUG()s on DMA_NONE or when either allocation fails.
+ */
+dma_addr_t
+dma_map_single(struct device *dev, void *ptr, size_t size,
+              enum dma_data_direction direction)
+{
+       struct dma_map_entry *ent;
+       void *bnc;
+       dma_addr_t dma;
+       unsigned long flags;
+
+       BUG_ON(direction == DMA_NONE);
+
+       /*
+        * Even if size is sub-page, the buffer may still straddle a page
+        * boundary. Take into account buffer start offset. All other calls are
+        * conservative and always search the dma_map list if it's non-empty.
+        */
+       if ((((unsigned long)ptr & ~PAGE_MASK) + size) <= PAGE_SIZE) {
+               dma = virt_to_bus(ptr);
+       } else {
+               /* Keep the side effects out of the BUG_ON() conditions. */
+               bnc = dma_alloc_coherent(dev, size, &dma, 0);
+               BUG_ON(bnc == NULL);
+               /*
+                * The bounce buffer above is requested with gfp 0, so do
+                * not let this allocation sleep either: mapping may be done
+                * from atomic context.
+                */
+               ent = kmalloc(sizeof(*ent), GFP_ATOMIC);
+               BUG_ON(ent == NULL);
+               if (direction != DMA_FROM_DEVICE)
+                       memcpy(bnc, ptr, size);
+               ent->dma    = dma;
+               ent->bounce = bnc;
+               ent->host   = ptr;
+               ent->size   = size;
+               spin_lock_irqsave(&dma_map_lock, flags);
+               list_add(&ent->list, &dma_map_head);
+               spin_unlock_irqrestore(&dma_map_lock, flags);
+       }
+
+       flush_write_buffers();
+       return dma;
+}
+EXPORT_SYMBOL(dma_map_single);
+
+/*
+ * Undo dma_map_single().
+ *
+ * If 'dma_addr' belongs to a bounce-buffered mapping, the entry is taken
+ * off dma_map_head, the bounce data is copied back to the caller's buffer
+ * (unless the direction is DMA_TO_DEVICE) and the bounce buffer and entry
+ * are freed.  Addresses with no entry need no work.  BUG()s on DMA_NONE,
+ * or when the address or size differ from what was mapped.
+ */
+void
+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+                enum dma_data_direction direction)
+{
+       struct dma_map_entry *ent, *hit = NULL;
+       unsigned long flags;
+
+       BUG_ON(direction == DMA_NONE);
+
+       /* Fast-path check: are there any multi-page DMA mappings? */
+       if (list_empty(&dma_map_head))
+               return;
+
+       spin_lock_irqsave(&dma_map_lock, flags);
+       list_for_each_entry(ent, &dma_map_head, list) {
+               if (!DMA_MAP_MATCHES(ent, dma_addr))
+                       continue;
+               list_del(&ent->list);
+               hit = ent;
+               break;
+       }
+       spin_unlock_irqrestore(&dma_map_lock, flags);
+
+       if (hit == NULL)
+               return;
+
+       BUG_ON(dma_addr != hit->dma);
+       BUG_ON(size != hit->size);
+       if (direction != DMA_TO_DEVICE)
+               memcpy(hit->host, hit->bounce, size);
+       dma_free_coherent(dev, size, hit->bounce, hit->dma);
+       kfree(hit);
+}
+EXPORT_SYMBOL(dma_unmap_single);
+
+/*
+ * Make device-written data visible to the CPU.
+ *
+ * For a bounce-buffered mapping containing 'dma_handle', 'size' bytes are
+ * copied from the bounce buffer to the caller's buffer at the matching
+ * offset.  The copy is done for every direction; the direction test is
+ * commented out.  BUG()s if the range runs past the end of the mapping.
+ */
+void
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+                       enum dma_data_direction direction)
+{
+       struct dma_map_entry *ent, *hit = NULL;
+       unsigned long flags, off;
+
+       /* Fast-path check: are there any multi-page DMA mappings? */
+       if (list_empty(&dma_map_head))
+               return;
+
+       spin_lock_irqsave(&dma_map_lock, flags);
+       list_for_each_entry(ent, &dma_map_head, list) {
+               if (DMA_MAP_MATCHES(ent, dma_handle)) {
+                       hit = ent;
+                       break;
+               }
+       }
+       spin_unlock_irqrestore(&dma_map_lock, flags);
+
+       if (hit == NULL)
+               return;
+
+       off = dma_handle - hit->dma;
+       BUG_ON((off + size) > hit->size);
+       /*if (direction != DMA_TO_DEVICE)*/
+       memcpy(hit->host+off, hit->bounce+off, size);
+}
+EXPORT_SYMBOL(dma_sync_single_for_cpu);
+
+/*
+ * Make CPU-written data visible to the device.
+ *
+ * For a bounce-buffered mapping containing 'dma_handle', 'size' bytes are
+ * copied from the caller's buffer into the bounce buffer at the matching
+ * offset.  The copy is done for every direction; the direction test is
+ * commented out.  BUG()s if the range runs past the end of the mapping.
+ * Write buffers are flushed in all cases.
+ */
+void
+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
+                           size_t size, enum dma_data_direction direction)
+{
+       struct dma_map_entry *ent;
+       unsigned long flags, off;
+
+       /* Fast-path check: are there any multi-page DMA mappings? */
+       if (!list_empty(&dma_map_head)) {
+               spin_lock_irqsave(&dma_map_lock, flags);
+               list_for_each_entry ( ent, &dma_map_head, list )
+                       if (DMA_MAP_MATCHES(ent, dma_handle))
+                               break;
+               spin_unlock_irqrestore(&dma_map_lock, flags);
+               /* The cursor equals the list head when nothing matched. */
+               if (&ent->list != &dma_map_head) {
+                       off = dma_handle - ent->dma;
+                       BUG_ON((off + size) > ent->size);
+                       /*if (direction != DMA_FROM_DEVICE)*/
+                               memcpy(ent->bounce+off, ent->host+off, size);
+               }
+       }
+
+       flush_write_buffers();
+}
+EXPORT_SYMBOL(dma_sync_single_for_device);
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/process.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/process.c
@@ -0,0 +1,793 @@
+/*
+ *  linux/arch/i386/kernel/process.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@xxxxxxxxxxx>, May 2000
+ */
+
+/*
+ * This file handles the architecture-dependent parts of process handling..
+ */
+
+#include <stdarg.h>
+
+#include <linux/cpu.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/elfcore.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/user.h>
+#include <linux/a.out.h>
+#include <linux/interrupt.h>
+#include <linux/config.h>
+#include <linux/utsname.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/mc146818rtc.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/ptrace.h>
+#include <linux/random.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/system.h>
+#include <asm/io.h>
+#include <asm/ldt.h>
+#include <asm/processor.h>
+#include <asm/i387.h>
+#include <asm/irq.h>
+#include <asm/desc.h>
+#include <asm-xen/xen-public/physdev.h>
+#ifdef CONFIG_MATH_EMULATION
+#include <asm/math_emu.h>
+#endif
+
+#include <linux/irq.h>
+#include <linux/err.h>
+
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+
+/* Declared here with an explicit assembler-level symbol name. */
+asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
+
+/* Incremented by disable_hlt() and decremented by enable_hlt(). */
+static int hlt_counter;
+
+/* Exported flag, initialised to 0; not modified in this part of the file. */
+unsigned long boot_option_idle_override = 0;
+EXPORT_SYMBOL(boot_option_idle_override);
+
+/*
+ * Return the saved PC of a blocked thread: the fourth word (index 3)
+ * above the stack pointer saved in tsk->thread.esp.
+ */
+unsigned long thread_saved_pc(struct task_struct *tsk)
+{
+       unsigned long *saved_sp = (unsigned long *)tsk->thread.esp;
+
+       return saved_sp[3];
+}
+
+/*
+ * Powermanagement idle function, if any..
+ */
+void (*pm_idle)(void);
+/*
+ * Per-CPU flag: set to 1 for every online CPU by cpu_idle_wait() and
+ * cleared again by that CPU's idle loop in cpu_idle().
+ */
+static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
+
+/* Take one reference on hlt_counter; undone by enable_hlt(). */
+void disable_hlt(void)
+{
+       ++hlt_counter;
+}
+
+EXPORT_SYMBOL(disable_hlt);
+
+/* Drop one reference on hlt_counter taken by disable_hlt(). */
+void enable_hlt(void)
+{
+       --hlt_counter;
+}
+
+EXPORT_SYMBOL(enable_hlt);
+
+/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
+extern void stop_hz_timer(void);
+extern void start_hz_timer(void);
+/*
+ * Idle once: with interrupts disabled, check whether a reschedule is
+ * pending.  If so, just re-enable interrupts and return.  Otherwise stop
+ * the HZ timer, block in the hypervisor and restart the timer afterwards.
+ */
+void xen_idle(void)
+{
+       /* Interrupts are off while need_resched() is tested. */
+       local_irq_disable();
+
+       if (need_resched()) {
+               local_irq_enable();
+       } else {
+               stop_hz_timer();
+               HYPERVISOR_block(); /* implicit local_irq_enable() */
+               start_hz_timer();
+       }
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+/*
+ * Marks this CPU CPU_DEAD, yields to the hypervisor until cpu_state
+ * becomes CPU_UP_PREPARE, then flushes the TLB and puts the CPU back
+ * into cpu_online_map with interrupts disabled around those two steps.
+ */
+static inline void play_dead(void)
+{
+       /* Ack it */
+       __get_cpu_var(cpu_state) = CPU_DEAD;
+
+       /* We shouldn't have to disable interrupts while dead, but
+        * some interrupts just don't seem to go away, and this makes
+        * it "work" for testing purposes. */
+       /* Death loop */
+       while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
+               HYPERVISOR_yield();
+
+       /* Back from the dead: rejoin the set of online CPUs. */
+       local_irq_disable();
+       __flush_tlb_all();
+       cpu_set(smp_processor_id(), cpu_online_map);
+       local_irq_enable();
+}
+#else
+/* Without CPU hotplug an offline idle CPU is a bug. */
+static inline void play_dead(void)
+{
+       BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * The idle thread. There's no useful work to be
+ * done, so just try to conserve power and have a
+ * low exit latency (ie sit in a loop waiting for
+ * somebody to say that they'd like to reschedule)
+ *
+ * Never returns: the outer loop alternates between idling through
+ * xen_idle() and calling schedule() once a reschedule is needed.
+ */
+void cpu_idle (void)
+{
+       int cpu = _smp_processor_id();
+
+       /* endless idle loop with no priority at all */
+       while (1) {
+               while (!need_resched()) {
+
+                       /* Clear the flag that cpu_idle_wait() polls for. */
+                       if (__get_cpu_var(cpu_idle_state))
+                               __get_cpu_var(cpu_idle_state) = 0;
+                       rmb();
+
+                       if (cpu_is_offline(cpu)) {
+#if defined(CONFIG_XEN) && defined(CONFIG_HOTPLUG_CPU)
+                               /* Tell hypervisor to take vcpu down. */
+                               HYPERVISOR_vcpu_down(cpu);
+#endif
+                               play_dead();
+         }
+
+                       /* Note when this CPU last started idling. */
+                       __get_cpu_var(irq_stat).idle_timestamp = jiffies;
+                       xen_idle();
+               }
+               schedule();
+       }
+}
+
+/*
+ * Wait until every online CPU has been through its idle loop.
+ *
+ * The caller is pinned to its current CPU, cpu_idle_state is set to 1 on
+ * every online CPU and each of them is recorded in 'map'.  The function
+ * then sleeps in one-second steps until every recorded CPU has either
+ * cleared its flag (done by cpu_idle()) or dropped out of
+ * cpu_online_map.  The caller's own flag is cleared up front.
+ */
+void cpu_idle_wait(void)
+{
+       unsigned int cpu, this_cpu = get_cpu();
+       cpumask_t map;
+
+       set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
+       put_cpu();
+
+       cpus_clear(map);
+       for_each_online_cpu(cpu) {
+               per_cpu(cpu_idle_state, cpu) = 1;
+               cpu_set(cpu, map);
+       }
+
+       /* The calling CPU is not idling, so clear its flag here. */
+       __get_cpu_var(cpu_idle_state) = 0;
+
+       wmb();
+       do {
+               ssleep(1);
+               for_each_online_cpu(cpu) {
+                       if (cpu_isset(cpu, map) &&
+                           !per_cpu(cpu_idle_state, cpu))
+                               cpu_clear(cpu, map);
+               }
+               /* Stop waiting for CPUs that have gone offline meanwhile. */
+               cpus_and(map, map, cpu_online_map);
+       } while (!cpus_empty(map));
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
+/* Always use xen_idle() instead. */
+/* Intentionally empty: the argument is ignored and nothing is selected. */
+void __init select_idle_routine(const struct cpuinfo_x86 *c) {}
+
+/*
+ * Dump the register state in 'regs' to the kernel log: current pid and
+ * command name, EIP with its symbol, the general purpose and segment
+ * registers, EFLAGS, and finally a stack trace starting at the saved ESP
+ * slot.
+ */
+void show_regs(struct pt_regs * regs)
+{
+       printk("\n");
+       printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
+       printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip,
+               smp_processor_id());
+       print_symbol("EIP is at %s\n", regs->eip);
+
+       /*
+        * NOTE(review): SS:ESP is printed only when bit 1 of the saved CS is
+        * set; presumably this tells user mode apart from a guest kernel
+        * that does not run in ring 0 -- confirm.
+        */
+       if (regs->xcs & 2)
+               printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
+       printk(" EFLAGS: %08lx    %s  (%s)\n",
+              regs->eflags, print_tainted(), system_utsname.release);
+       printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
+               regs->eax,regs->ebx,regs->ecx,regs->edx);
+       printk("ESI: %08lx EDI: %08lx EBP: %08lx",
+               regs->esi, regs->edi, regs->ebp);
+       printk(" DS: %04x ES: %04x\n",
+               0xffff & regs->xds,0xffff & regs->xes);
+
+       show_trace(NULL, &regs->esp);
+}
+
+/*
+ * This gets run with %ebx containing the
+ * function to call, and %edx containing
+ * the "args".
+ *
+ * The stub copies %edx into %eax, pushes %edx as the stack argument and
+ * calls the function through %ebx.  The function's return value (%eax) is
+ * then pushed and do_exit is called with it.
+ */
+extern void kernel_thread_helper(void);
+__asm__(".section .text\n"
+       ".align 4\n"
+       "kernel_thread_helper:\n\t"
+       "movl %edx,%eax\n\t"
+       "pushl %edx\n\t"
+       "call *%ebx\n\t"
+       "pushl %eax\n\t"
+       "call do_exit\n"
+       ".previous");
+
+/*
+ * Create a kernel thread
+ *
+ * fn:    function the new thread runs
+ * arg:   argument handed to fn
+ * flags: clone flags; CLONE_VM and CLONE_UNTRACED are always added
+ *
+ * A zeroed register frame is set up so that the child starts in
+ * kernel_thread_helper with fn in %ebx and arg in %edx.  Returns whatever
+ * do_fork() returns.
+ */
+int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
+{
+       struct pt_regs regs;
+
+       memset(&regs, 0, sizeof(regs));
+
+       /* Inputs picked up by kernel_thread_helper. */
+       regs.ebx = (unsigned long) fn;
+       regs.edx = (unsigned long) arg;
+
+       regs.xds = __USER_DS;
+       regs.xes = __USER_DS;
+       regs.orig_eax = -1;
+       regs.eip = (unsigned long) kernel_thread_helper;
+       regs.xcs = __KERNEL_CS;
+       regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
+
+       /* Ok, create the new process.. */
+       return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0,
+                      NULL, NULL);
+}
+
+/*
+ * Release the per-thread resources of the current task.
+ */
+void exit_thread(void)
+{
+       struct thread_struct *thread = &current->thread;
+
+       /* Only threads that allocated an I/O port bitmap need any work. */
+       if (unlikely(thread->io_bitmap_ptr != NULL)) {
+               /* Every field other than cmd is left zero. */
+               physdev_op_t clear_op = { .cmd = PHYSDEVOP_SET_IOBITMAP };
+
+               HYPERVISOR_physdev_op(&clear_op);
+               kfree(thread->io_bitmap_ptr);
+               thread->io_bitmap_ptr = NULL;
+       }
+}
+
+/*
+ * Reset per-thread state of the current task: zero the saved debug
+ * registers and the TLS descriptors, then drop any FPU state.
+ */
+void flush_thread(void)
+{
+       struct task_struct *self = current;
+       struct thread_struct *thread = &self->thread;
+
+       memset(thread->debugreg, 0, 8 * sizeof(unsigned long));
+       memset(thread->tls_array, 0, sizeof(thread->tls_array));
+
+       /* Forget coprocessor state.. */
+       clear_fpu(self);
+       clear_used_math();
+}
+
+/*
+ * Final per-thread cleanup for a dead task.
+ *
+ * If the task still has an mm whose context reports a non-zero LDT
+ * size, this logs a warning and calls BUG().  Afterwards the task's
+ * vm86 IRQs are released.
+ */
+void release_thread(struct task_struct *dead_task)
+{
+       if (dead_task->mm) {
+               // temporary debugging check
+               if (dead_task->mm->context.size) {
+                       printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
+                                       dead_task->comm,
+                                       dead_task->mm->context.ldt,
+                                       dead_task->mm->context.size);
+                       BUG();
+               }
+       }
+
+       release_vm86_irqs(dead_task);
+}
+
+/*
+ * This gets called before we allocate a new thread and copy
+ * the current task into it.
+ *
+ * The only work done here is unlazy_fpu(tsk).
+ */
+void prepare_to_copy(struct task_struct *tsk)
+{
+       unlazy_fpu(tsk);
+}
+
+/*
+ * Set up the architecture-specific thread state of a newly forked task.
+ *
+ *   nr          - not referenced in the body
+ *   clone_flags - only CLONE_SETTLS is examined
+ *   esp         - stack pointer stored into the child's register frame
+ *   unused      - not referenced in the body
+ *   p           - the new task
+ *   regs        - register frame copied into the child
+ *
+ * Returns 0 on success, -ENOMEM if the child's I/O bitmap cannot be
+ * allocated, -EFAULT if the CLONE_SETTLS descriptor cannot be read from
+ * user space, or -EINVAL if that descriptor is empty or its entry number
+ * is outside the TLS range.
+ */
+int copy_thread(int nr, unsigned long clone_flags, unsigned long esp,
+       unsigned long unused,
+       struct task_struct * p, struct pt_regs * regs)
+{
+       struct pt_regs * childregs;
+       struct task_struct *tsk;
+       int err;
+
+       childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p->thread_info)) - 1;
+       /*
+        * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+        * This is necessary to guarantee that the entire "struct pt_regs"
+        * is accessible even if the CPU hasn't stored the SS/ESP registers
+        * on the stack (interrupt gate does not save these registers
+        * when switching to the same priv ring).
+        * Therefore beware: accessing the xss/esp fields of the
+        * "struct pt_regs" is possible, but they may contain the
+        * completely wrong values.
+        */
+       childregs = (struct pt_regs *) ((unsigned long) childregs - 8);
+       *childregs = *regs;
+       childregs->eax = 0;
+       childregs->esp = esp;
+
+       p->thread.esp = (unsigned long) childregs;
+       p->thread.esp0 = (unsigned long) (childregs+1);
+
+       p->thread.eip = (unsigned long) ret_from_fork;
+
+       savesegment(fs,p->thread.fs);
+       savesegment(gs,p->thread.gs);
+
+       /* Give the child its own copy of the parent's I/O bitmap, if any. */
+       tsk = current;
+       if (unlikely(NULL != tsk->thread.io_bitmap_ptr)) {
+               p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
+               if (!p->thread.io_bitmap_ptr) {
+                       p->thread.io_bitmap_max = 0;
+                       return -ENOMEM;
+               }
+               memcpy(p->thread.io_bitmap_ptr, tsk->thread.io_bitmap_ptr,
+                       IO_BITMAP_BYTES);
+       }
+
+       /*
+        * Set a new TLS for the child thread?
+        */
+       if (clone_flags & CLONE_SETTLS) {
+               struct desc_struct *desc;
+               struct user_desc info;
+               int idx;
+
+               /* The user_desc pointer is taken from the child's esi. */
+               err = -EFAULT;
+               if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info)))
+                       goto out;
+               err = -EINVAL;
+               if (LDT_empty(&info))
+                       goto out;
+
+               idx = info.entry_number;
+               if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+                       goto out;
+
+               desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+               desc->a = LDT_entry_a(&info);
+               desc->b = LDT_entry_b(&info);
+       }
+
+       p->thread.io_pl = current->thread.io_pl;
+
+       err = 0;
+ out:
+       if (err && p->thread.io_bitmap_ptr) {
+               kfree(p->thread.io_bitmap_ptr);
+               /* Do not leave a pointer to freed memory in the child. */
+               p->thread.io_bitmap_ptr = NULL;
+               p->thread.io_bitmap_max = 0;
+       }
+       return err;
+}
+
+/*
+ * fill in the user structure for a core dump..
+ *
+ * Sizes are expressed in pages.  Register values come from *regs, except
+ * fs and gs, which are read from the live segment registers, and the
+ * debug registers, which come from the current thread.
+ */
+void dump_thread(struct pt_regs * regs, struct user * dump)
+{
+       int i;
+
+/* changed the size calculations - should hopefully work better. lbt */
+       dump->magic = CMAGIC;
+       dump->start_code = 0;
+       dump->start_stack = regs->esp & ~(PAGE_SIZE - 1);
+       dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT;
+       dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT;
+       dump->u_dsize -= dump->u_tsize;
+       dump->u_ssize = 0;
+       for (i = 0; i < 8; i++)
+               dump->u_debugreg[i] = current->thread.debugreg[i];
+
+       /* The stack size is only filled in when the stack starts below TASK_SIZE. */
+       if (dump->start_stack < TASK_SIZE)
+               dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT;
+
+       dump->regs.ebx = regs->ebx;
+       dump->regs.ecx = regs->ecx;
+       dump->regs.edx = regs->edx;
+       dump->regs.esi = regs->esi;
+       dump->regs.edi = regs->edi;
+       dump->regs.ebp = regs->ebp;
+       dump->regs.eax = regs->eax;
+       dump->regs.ds = regs->xds;
+       dump->regs.es = regs->xes;
+       savesegment(fs,dump->regs.fs);
+       savesegment(gs,dump->regs.gs);
+       dump->regs.orig_eax = regs->orig_eax;
+       dump->regs.eip = regs->eip;
+       dump->regs.cs = regs->xcs;
+       dump->regs.eflags = regs->eflags;
+       dump->regs.esp = regs->esp;
+       dump->regs.ss = regs->xss;
+
+       dump->u_fpvalid = dump_fpu (regs, &dump->i387);
+}
+
+/*
+ * Capture the user space registers if the task is not running (in user space)
+ *
+ * Copies the pt_regs found at the top of tsk's stack area, masks the
+ * segment register fields down to 16 bits, and converts the result into
+ * *regs with elf_core_copy_regs().  Always returns 1.
+ *
+ * This function only reads task state; it must not touch unrelated
+ * globals such as the idle-routine override.
+ */
+int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
+{
+       struct pt_regs ptregs;
+
+       ptregs = *(struct pt_regs *)
+               ((unsigned long)tsk->thread_info+THREAD_SIZE - sizeof(ptregs));
+       ptregs.xcs &= 0xffff;
+       ptregs.xds &= 0xffff;
+       ptregs.xes &= 0xffff;
+       ptregs.xss &= 0xffff;
+
+       elf_core_copy_regs(regs, &ptregs);
+
+       return 1;
+}
+
+
+/*
+ *     switch_to(x,y) should switch tasks from x to y.
+ *
+ * We fsave/fwait so that an exception goes off at the right time
+ * (as a call from the fsave or fwait in effect) rather than to
+ * the wrong process. Lazy FP saving no longer makes any sense
+ * with modern CPU's, and this simplifies a lot of things (SMP
+ * and UP become the same).
+ *
+ * NOTE! We used to use the x86 hardware context switching. The
+ * reason for not using it any more becomes apparent when you
+ * try to recover gracefully from saved state that is no longer
+ * valid (stale segment register values in particular). With the
+ * hardware task-switch, there is no way to fix up bad state in
+ * a reasonable manner.
+ *
+ * The fact that Intel documents the hardware task-switching to
+ * be slow is a fairly red herring - this code is not noticeably
+ * faster. However, there _is_ some room for improvement here,
+ * so the performance issues may eventually be a valid point.
+ * More important, however, is the fact that this allows us much
+ * more flexibility.
+ *
+ * The return value (in %eax) will be the "prev" task after
+ * the task-switch, and shows up in ret_from_fork in entry.S,
+ * for example.
+ */
+/*
+ * Xen variant: the privileged parts of the switch (FPU task-switch
+ * notification, kernel stack switch, TLS descriptor updates, IOPL and
+ * I/O bitmap changes) are queued as multicall entries and issued to the
+ * hypervisor in a single HYPERVISOR_multicall().  Returns prev_p.
+ */
+struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+{
+       struct thread_struct *prev = &prev_p->thread,
+                                *next = &next_p->thread;
+       int cpu = smp_processor_id();
+       struct tss_struct *tss = &per_cpu(init_tss, cpu);
+       physdev_op_t iopl_op, iobmp_op;
+       /*
+        * At most 7 entries are queued below (FPU, stack switch, 3 TLS
+        * slots, IOPL, I/O bitmap), so 8 slots are sufficient.
+        */
+       multicall_entry_t _mcl[8], *mcl = _mcl;
+
+       /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
+
+       /*
+        * This is basically '__unlazy_fpu', except that we queue a
+        * multicall to indicate FPU task switch, rather than
+        * synchronously trapping to Xen.
+        */
+       if (prev_p->thread_info->status & TS_USEDFPU) {
+               __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
+               mcl->op      = __HYPERVISOR_fpu_taskswitch;
+               mcl->args[0] = 1;
+               mcl++;
+       }
+
+       /*
+        * Reload esp0, LDT and the page table pointer:
+        * This is load_esp0(tss, next) with a multicall.
+        */
+       tss->esp0 = next->esp0;
+       mcl->op      = __HYPERVISOR_stack_switch;
+       mcl->args[0] = tss->ss0;
+       mcl->args[1] = tss->esp0;
+       mcl++;
+
+       /*
+        * Load the per-thread Thread-Local Storage descriptor.
+        * This is load_TLS(next, cpu) with multicalls.
+        * A slot is only updated when it differs between prev and next.
+        */
+#define C(i) do {                                                       \
+       if (unlikely(next->tls_array[i].a != prev->tls_array[i].a ||    \
+                    next->tls_array[i].b != prev->tls_array[i].b)) {   \
+               mcl->op      = __HYPERVISOR_update_descriptor;          \
+               mcl->args[0] = virt_to_machine(&get_cpu_gdt_table(cpu)  \
+                                        [GDT_ENTRY_TLS_MIN + i]);      \
+               mcl->args[1] = ((u32 *)&next->tls_array[i])[0];         \
+               mcl->args[2] = ((u32 *)&next->tls_array[i])[1];         \
+               mcl++;                                                  \
+       }                                                               \
+} while (0)
+       C(0); C(1); C(2);
+#undef C
+
+       if (unlikely(prev->io_pl != next->io_pl)) {
+               iopl_op.cmd             = PHYSDEVOP_SET_IOPL;
+               iopl_op.u.set_iopl.iopl = next->io_pl;
+               mcl->op      = __HYPERVISOR_physdev_op;
+               mcl->args[0] = (unsigned long)&iopl_op;
+               mcl++;
+       }
+
+       /*
+        * The bitmap is (re)set whenever either task has one; a task
+        * without a bitmap gets nr_ports == 0.
+        */
+       if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
+               iobmp_op.cmd                     =
+                       PHYSDEVOP_SET_IOBITMAP;
+               iobmp_op.u.set_iobitmap.bitmap   =
+                       (unsigned long)next->io_bitmap_ptr;
+               iobmp_op.u.set_iobitmap.nr_ports =
+                       next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
+               mcl->op      = __HYPERVISOR_physdev_op;
+               mcl->args[0] = (unsigned long)&iobmp_op;
+               mcl++;
+       }
+
+       /* Issue everything queued above; the result is deliberately ignored. */
+       (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
+
+       /*
+        * Restore %fs and %gs if needed.
+        */
+       if (unlikely(next->fs | next->gs)) {
+               loadsegment(fs, next->fs);
+               loadsegment(gs, next->gs);
+       }
+
+       /*
+        * Now maybe reload the debug registers
+        */
+       if (unlikely(next->debugreg[7])) {
+               loaddebug(next, 0);
+               loaddebug(next, 1);
+               loaddebug(next, 2);
+               loaddebug(next, 3);
+               /* no 4 and 5 */
+               loaddebug(next, 6);
+               loaddebug(next, 7);
+       }
+
+       return prev_p;
+}
+
+/*
+ * fork(): calls do_fork() with SIGCHLD as the only flag and the caller's
+ * saved stack pointer, and returns its result.
+ */
+asmlinkage int sys_fork(struct pt_regs regs)
+{
+       return do_fork(SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
+}
+
+/*
+ * clone(): the arguments are taken from the saved registers --
+ * ebx = clone flags, ecx = new stack pointer, edx = parent TID pointer,
+ * edi = child TID pointer.  A zero stack pointer means "use the
+ * caller's saved esp".  Returns do_fork()'s result.
+ */
+asmlinkage int sys_clone(struct pt_regs regs)
+{
+       unsigned long clone_flags;
+       unsigned long newsp;
+       int __user *parent_tidptr, *child_tidptr;
+
+       clone_flags = regs.ebx;
+       newsp = regs.ecx;
+       parent_tidptr = (int __user *)regs.edx;
+       child_tidptr = (int __user *)regs.edi;
+       if (!newsp)
+               newsp = regs.esp;
+       return do_fork(clone_flags, newsp, &regs, 0, parent_tidptr, child_tidptr);
+}
+
+/*
+ * This is trivial, and on the face of it looks like it
+ * could equally well be done in user mode.
+ *
+ * Not so, for quite unobvious reasons - register pressure.
+ * In user mode vfork() cannot have a stack frame, and if
+ * done by calling the "clone()" system call directly, you
+ * do not have enough call-clobbered registers to hold all
+ * the information you need.
+ *
+ * Calls do_fork() with CLONE_VFORK | CLONE_VM | SIGCHLD and the
+ * caller's saved stack pointer, and returns its result.
+ */
+asmlinkage int sys_vfork(struct pt_regs regs)
+{
+       return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs, 0, NULL, NULL);
+}
+
+/*
+ * sys_execve() executes a new program.
+ *
+ * The path comes from ebx, argv from ecx and envp from edx.  Returns 0
+ * on success or a negative error code.
+ */
+asmlinkage int sys_execve(struct pt_regs regs)
+{
+       char *path;
+       int ret;
+
+       path = getname((char __user *) regs.ebx);
+       if (IS_ERR(path))
+               return PTR_ERR(path);
+
+       ret = do_execve(path,
+                       (char __user * __user *) regs.ecx,
+                       (char __user * __user *) regs.edx,
+                       &regs);
+       if (!ret) {
+               task_lock(current);
+               current->ptrace &= ~PT_DTRACE;
+               task_unlock(current);
+               /* Make sure we don't return using sysenter.. */
+               set_thread_flag(TIF_IRET);
+       }
+
+       putname(path);
+       return ret;
+}
+
+#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
+#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
+
+/*
+ * Walk the saved frame-pointer chain of a task that is not running and
+ * return the first return address that is not inside the scheduler
+ * functions.  Returns 0 if the task is NULL, current or runnable, if a
+ * pointer falls outside the task's stack area, or if no such address is
+ * found within 17 frames.
+ */
+unsigned long get_wchan(struct task_struct *p)
+{
+       unsigned long stack_base, frame, sp, ret_addr;
+       int depth;
+
+       if (!p || p == current || p->state == TASK_RUNNING)
+               return 0;
+
+       stack_base = (unsigned long)p->thread_info;
+       sp = p->thread.esp;
+       if (!stack_base || sp < stack_base || sp > top_esp+stack_base)
+               return 0;
+
+       /* include/asm-i386/system.h:switch_to() pushes ebp last. */
+       frame = *(unsigned long *) sp;
+       for (depth = 0; depth <= 16; depth++) {
+               if (frame < stack_base || frame > top_ebp+stack_base)
+                       return 0;
+               ret_addr = *(unsigned long *) (frame+4);
+               if (!in_sched_functions(ret_addr))
+                       return ret_addr;
+               frame = *(unsigned long *) frame;
+       }
+       return 0;
+}
+
+/*
+ * get_free_idx: find a TLS descriptor slot of the current thread that is
+ * still empty.  Returns its GDT index (slot + GDT_ENTRY_TLS_MIN), or
+ * -ESRCH when every slot is in use.
+ */
+static int get_free_idx(void)
+{
+       struct desc_struct *slots = current->thread.tls_array;
+       int i = 0;
+
+       while (i < GDT_ENTRY_TLS_ENTRIES) {
+               if (desc_empty(slots + i))
+                       return i + GDT_ENTRY_TLS_MIN;
+               i++;
+       }
+       return -ESRCH;
+}
+
+/*
+ * Set a given TLS descriptor:
+ *
+ * Copies a user_desc from user space and installs it in the current
+ * thread's TLS slot named by entry_number.  Returns 0 on success,
+ * -EFAULT if user memory cannot be accessed, -EINVAL if the index is
+ * outside the TLS range, or the error from get_free_idx() when
+ * allocation was requested and no slot is free.
+ */
+asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
+{
+       struct thread_struct *t = &current->thread;
+       struct user_desc info;
+       struct desc_struct *desc;
+       int cpu, idx;
+
+       if (copy_from_user(&info, u_info, sizeof(info)))
+               return -EFAULT;
+       idx = info.entry_number;
+
+       /*
+        * index -1 means the kernel should try to find and
+        * allocate an empty descriptor:
+        * (the chosen index is written back to user space)
+        */
+       if (idx == -1) {
+               idx = get_free_idx();
+               if (idx < 0)
+                       return idx;
+               if (put_user(idx, &u_info->entry_number))
+                       return -EFAULT;
+       }
+
+       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+               return -EINVAL;
+
+       desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN;
+
+       /*
+        * We must not get preempted while modifying the TLS.
+        */
+       cpu = get_cpu();
+
+       if (LDT_empty(&info)) {
+               desc->a = 0;
+               desc->b = 0;
+       } else {
+               desc->a = LDT_entry_a(&info);
+               desc->b = LDT_entry_b(&info);
+       }
+       load_TLS(t, cpu);
+
+       put_cpu();
+
+       return 0;
+}
+
+/*
+ * Get the current Thread-Local Storage area:
+ *
+ * The GET_* helpers below decode fields from the two 32-bit words
+ * (a = low word, b = high word) of a segment descriptor.
+ */
+
+#define GET_BASE(desc) ( \
+       (((desc)->a >> 16) & 0x0000ffff) | \
+       (((desc)->b << 16) & 0x00ff0000) | \
+       ( (desc)->b        & 0xff000000)   )
+
+#define GET_LIMIT(desc) ( \
+       ((desc)->a & 0x0ffff) | \
+        ((desc)->b & 0xf0000) )
+
+#define GET_32BIT(desc)                (((desc)->b >> 22) & 1)
+#define GET_CONTENTS(desc)     (((desc)->b >> 10) & 3)
+#define GET_WRITABLE(desc)     (((desc)->b >>  9) & 1)
+#define GET_LIMIT_PAGES(desc)  (((desc)->b >> 23) & 1)
+#define GET_PRESENT(desc)      (((desc)->b >> 15) & 1)
+#define GET_USEABLE(desc)      (((desc)->b >> 20) & 1)
+
+/*
+ * Read the entry number from *u_info, decode the matching TLS
+ * descriptor of the current thread and copy the result back to
+ * *u_info.  Returns 0 on success, -EFAULT if user memory cannot be
+ * accessed, or -EINVAL if the index is outside the TLS range.
+ */
+asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
+{
+       struct user_desc info;
+       struct desc_struct *desc;
+       int idx;
+
+       if (get_user(idx, &u_info->entry_number))
+               return -EFAULT;
+       if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
+               return -EINVAL;
+
+       /*
+        * Zero the whole structure first so that any bits not explicitly
+        * assigned below (padding, unused bitfield bits) are not copied
+        * to user space as uninitialised kernel stack contents.
+        */
+       memset(&info, 0, sizeof(info));
+
+       desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN;
+
+       info.entry_number = idx;
+       info.base_addr = GET_BASE(desc);
+       info.limit = GET_LIMIT(desc);
+       info.seg_32bit = GET_32BIT(desc);
+       info.contents = GET_CONTENTS(desc);
+       info.read_exec_only = !GET_WRITABLE(desc);
+       info.limit_in_pages = GET_LIMIT_PAGES(desc);
+       info.seg_not_present = !GET_PRESENT(desc);
+       info.useable = GET_USEABLE(desc);
+
+       if (copy_to_user(u_info, &info, sizeof(info)))
+               return -EFAULT;
+       return 0;
+}
+
+/*
+ * Return sp rounded down to a 16-byte boundary.  When
+ * randomize_va_space is set, sp is first lowered by a random amount
+ * below 8192 bytes.
+ */
+unsigned long arch_align_stack(unsigned long sp)
+{
+       unsigned long offset = 0;
+
+       if (randomize_va_space)
+               offset = get_random_int() % 8192;
+
+       return (sp - offset) & ~0xf;
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/quirks.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/quirks.c
@@ -0,0 +1,49 @@
+/*
+ * This file contains work-arounds for x86 and x86_64 platform bugs.
+ */
+#include <linux/config.h>
+#include <linux/pci.h>
+#include <linux/irq.h>
+
+#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI)
+
+/*
+ * PCI fixup for the Intel E7520/E7320/E7525 memory controller hubs.
+ *
+ * For revision 0x9 and below, temporarily sets bit 1 of config byte
+ * 0xf4, reads the xTPR register, and if bit 13 of it is clear asks the
+ * hypervisor (DOM0_PLATFORM_QUIRK / QUIRK_NOIRQBALANCING) to disable
+ * IRQ balancing.  Bit 1 of 0xf4 is cleared again before returning.
+ */
+static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
+{
+       u8 config, rev;
+       u32 word;
+
+       /* BIOS may enable hardware IRQ balancing for
+        * E7520/E7320/E7525(revision ID 0x9 and below)
+        * based platforms.
+        * Disable SW irqbalance/affinity on those platforms.
+        */
+       pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
+       if (rev > 0x9)
+               return;
+
+       /* Terminate the line so a following message starts on its own line. */
+       printk(KERN_INFO "Intel E7520/7320/7525 detected.\n");
+
+       /* enable access to config space*/
+       pci_read_config_byte(dev, 0xf4, &config);
+       config |= 0x2;
+       pci_write_config_byte(dev, 0xf4, config);
+
+       /* read xTPR register */
+       raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
+
+       if (!(word & (1 << 13))) {
+               dom0_op_t op;
+               printk(KERN_INFO "Disabling irq balancing and affinity\n");
+               op.cmd = DOM0_PLATFORM_QUIRK;
+               op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
+               (void)HYPERVISOR_dom0_op(&op);
+       }
+
+       config &= ~0x2;
+       /* disable access to config space*/
+       pci_write_config_byte(dev, 0xf4, config);
+}
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7320_MCH,  quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7525_MCH,  quirk_intel_irqbalance);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,   PCI_DEVICE_ID_INTEL_E7520_MCH,  quirk_intel_irqbalance);
+#endif
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/reboot.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/reboot.c
@@ -0,0 +1,262 @@
+
+#define __KERNEL_SYSCALLS__
+static int errno;
+#include <linux/errno.h>
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/unistd.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+#include <asm/irq.h>
+#include <asm/mmu_context.h>
+
+#include <ctrl_if.h>
+#include <hypervisor.h>
+#include <xen-public/dom0_ops.h>
+#include <mach_suspend.h>
+#include <queues.h>
+
+/*
+ * Restart the machine: flush the Xen console, then call
+ * HYPERVISOR_reboot().  The string argument is ignored.
+ */
+void machine_restart(char * __unused)
+{
+       /* We really want to get pending console data out before we die. */
+       extern void xencons_force_flush(void);
+       xencons_force_flush();
+       HYPERVISOR_reboot();
+}
+
+/* Halting is handled exactly like powering off. */
+void machine_halt(void)
+{
+       machine_power_off();
+}
+
+/*
+ * Power the machine off: flush the Xen console, then call
+ * HYPERVISOR_shutdown().
+ */
+void machine_power_off(void)
+{
+       /* We really want to get pending console data out before we die. */
+       extern void xencons_force_flush(void);
+       xencons_force_flush();
+       HYPERVISOR_shutdown();
+}
+
+int reboot_thru_bios = 0;      /* for dmi_scan.c */
+EXPORT_SYMBOL(machine_restart);
+EXPORT_SYMBOL(machine_halt);
+EXPORT_SYMBOL(machine_power_off);
+
+
+/* FIXME move all the rest, doesn't belong here */
+
+/******************************************************************************
+ * Stop/pickle callback handling.
+ */
+
+/* Ignore multiple shutdown requests. */
+static int shutting_down = -1;
+
+/*
+ * Suspend this domain and bring it back up after resume.
+ *
+ * With interrupts disabled, the frontends and core services are
+ * suspended, the start info is saved into a freshly allocated suspend
+ * record and HYPERVISOR_suspend() is called.  Execution continues after
+ * that call on resume: the start info and shared-info mapping are
+ * restored, the pfn-to-mfn frame list is rebuilt, and the services are
+ * resumed in the reverse order.  If the suspend record cannot be
+ * allocated the function returns without doing anything.
+ */
+static void __do_suspend(void)
+{
+    int i, j;
+    suspend_record_t *suspend_record;
+
+    /* Hmmm... a cleaner interface to suspend/resume blkdevs would be nice. */
+    /* XXX SMH: yes it would :-( */
+#ifdef CONFIG_XEN_BLKDEV_FRONTEND
+    extern void blkdev_suspend(void);
+    extern void blkdev_resume(void);
+#else
+#define blkdev_suspend() do{}while(0)
+#define blkdev_resume()  do{}while(0)
+#endif
+
+#ifdef CONFIG_XEN_NETDEV_FRONTEND
+    extern void netif_suspend(void);
+    extern void netif_resume(void);
+#else
+#define netif_suspend() do{}while(0)
+#define netif_resume()  do{}while(0)
+#endif
+
+#ifdef CONFIG_XEN_USB_FRONTEND
+    extern void usbif_resume(void);
+#else
+#define usbif_resume() do{}while(0)
+#endif
+
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    extern int gnttab_suspend(void);
+    extern int gnttab_resume(void);
+#else
+#define gnttab_suspend() do{}while(0)
+#define gnttab_resume()  do{}while(0)
+#endif
+
+    extern void time_suspend(void);
+    extern void time_resume(void);
+    extern unsigned long max_pfn;
+    extern unsigned int *pfn_to_mfn_frame_list;
+
+    suspend_record = (suspend_record_t *)__get_free_page(GFP_KERNEL);
+    if ( suspend_record == NULL )
+        goto out;
+
+    suspend_record->nr_pfns = max_pfn; /* final number of pfns */
+
+    __cli();
+
+#ifdef __i386__
+    mm_pin_all();
+    kmem_cache_shrink(pgd_cache);
+#endif
+
+    netif_suspend();
+
+    blkdev_suspend();
+
+    time_suspend();
+
+    ctrl_if_suspend();
+
+    irq_suspend();
+
+    gnttab_suspend();
+
+    /* Point shared_info at the zero page while the real mapping is gone. */
+    HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+    clear_fixmap(FIX_SHARED_INFO);
+
+    memcpy(&suspend_record->resume_info, &xen_start_info,
+           sizeof(xen_start_info));
+
+    HYPERVISOR_suspend(virt_to_machine(suspend_record) >> PAGE_SHIFT);
+
+    /* Allow new shutdown requests to be accepted again. */
+    shutting_down = -1;
+
+    memcpy(&xen_start_info, &suspend_record->resume_info,
+           sizeof(xen_start_info));
+
+    set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info);
+
+    HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+
+    memset(empty_zero_page, 0, PAGE_SIZE);
+
+    /* One frame-list entry per page of phys_to_machine_mapping. */
+    for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ )
+    {
+        pfn_to_mfn_frame_list[j] =
+            virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT;
+    }
+    HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list =
+        virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT;
+
+    gnttab_resume();
+
+    irq_resume();
+
+    ctrl_if_resume();
+
+    time_resume();
+
+    blkdev_resume();
+
+    netif_resume();
+
+    usbif_resume();
+
+    __sti();
+
+ out:
+    if ( suspend_record != NULL )
+        free_page((unsigned long)suspend_record);
+}
+
+/*
+ * Kernel thread body that carries out a poweroff or reboot request.
+ *
+ * It first tries to exec the matching user-space tool; if that fails it
+ * falls back to calling sys_reboot() directly.  When control comes back
+ * here, shutting_down is reset so that a later request can be accepted.
+ * Always returns 0.
+ */
+static int shutdown_process(void *__unused)
+{
+    static char *envp[] = { "HOME=/", "TERM=linux",
+                            "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
+    static char *restart_argv[]  = { "/sbin/reboot", NULL };
+    static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
+
+    extern asmlinkage long sys_reboot(int magic1, int magic2,
+                                      unsigned int cmd, void *arg);
+
+    char **tool_argv = NULL;
+    unsigned int fallback_cmd = 0;
+
+    daemonize("shutdown");
+
+    /* Select the tool and the matching sys_reboot() command. */
+    switch ( shutting_down )
+    {
+    case CMSG_SHUTDOWN_POWEROFF:
+        tool_argv    = poweroff_argv;
+        fallback_cmd = LINUX_REBOOT_CMD_POWER_OFF;
+        break;
+
+    case CMSG_SHUTDOWN_REBOOT:
+        tool_argv    = restart_argv;
+        fallback_cmd = LINUX_REBOOT_CMD_RESTART;
+        break;
+    }
+
+    if ( (tool_argv != NULL) &&
+         (execve(tool_argv[0], tool_argv, envp) < 0) )
+    {
+        sys_reboot(LINUX_REBOOT_MAGIC1,
+                   LINUX_REBOOT_MAGIC2,
+                   fallback_cmd,
+                   NULL);
+    }
+
+    shutting_down = -1; /* could try again */
+
+    return 0;
+}
+
+/*
+ * Work-queue handler for an accepted shutdown request.
+ *
+ * A suspend request is handled in place by __do_suspend(); any other
+ * request is handed to a new kernel thread running shutdown_process().
+ */
+static void __shutdown_handler(void *unused)
+{
+    int err;
+
+    if ( shutting_down != CMSG_SHUTDOWN_SUSPEND )
+    {
+        err = kernel_thread(shutdown_process, NULL, CLONE_FS | CLONE_FILES);
+        if ( err < 0 )
+        {
+            printk(KERN_ALERT "Error creating shutdown process!\n");
+            /*
+             * Nothing will ever reset the flag for this request, so
+             * clear it here; otherwise every later shutdown request
+             * would be ignored as spurious.
+             */
+            shutting_down = -1;
+        }
+    }
+    else
+    {
+        __do_suspend();
+    }
+}
+
+/*
+ * Control-interface receiver for CMSG_SHUTDOWN messages.
+ *
+ * SysRq messages are forwarded to handle_sysrq() when CONFIG_MAGIC_SYSRQ
+ * is enabled.  A poweroff, reboot or suspend request is accepted only
+ * when no other request is in progress (shutting_down == -1); it is
+ * recorded and the real work is deferred to __shutdown_handler() via the
+ * work queue.  Anything else is logged and ignored.  A response is sent
+ * for every message.
+ */
+static void shutdown_handler(ctrl_msg_t *msg, unsigned long id)
+{
+    static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
+
+    if ( msg->subtype == CMSG_SHUTDOWN_SYSRQ )
+    {
+#ifdef CONFIG_MAGIC_SYSRQ
+        /* Declared here so it is not an unused variable without SysRq. */
+        int sysrq = ((shutdown_sysrq_t *)&msg->msg[0])->key;
+
+        handle_sysrq(sysrq, NULL, NULL);
+#endif
+    }
+    else if ( (shutting_down == -1) &&
+         ((msg->subtype == CMSG_SHUTDOWN_POWEROFF) ||
+          (msg->subtype == CMSG_SHUTDOWN_REBOOT) ||
+          (msg->subtype == CMSG_SHUTDOWN_SUSPEND)) )
+    {
+        shutting_down = msg->subtype;
+        schedule_work(&shutdown_work);
+    }
+    else
+    {
+        printk("Ignore spurious shutdown request\n");
+    }
+
+    ctrl_if_send_response(msg);
+}
+
+/*
+ * Initcall: register shutdown_handler() as the control-interface
+ * receiver for CMSG_SHUTDOWN messages.  Always returns 0; the result of
+ * the registration call is not checked.
+ */
+static int __init setup_shutdown_event(void)
+{
+    ctrl_if_register_receiver(CMSG_SHUTDOWN, shutdown_handler, 0);
+    return 0;
+}
+
+__initcall(setup_shutdown_event);
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/setup.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/setup.c
@@ -0,0 +1,1712 @@
+/*
+ *  linux/arch/i386/kernel/setup.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ *
+ *  Memory region support
+ *     David Parsons <orc@xxxxxxxxxxxxxx>, July-August 1999
+ *
+ *  Added E820 sanitization routine (removes overlapping memory regions);
+ *  Brian Moyle <bmoyle@xxxxxxxxxx>, February 2001
+ *
+ * Moved CPU detection code to cpu/${cpu}.c
+ *    Patrick Mochel <mochel@xxxxxxxx>, March 2002
+ *
+ *  Provisions for empty E820 memory regions (reported by certain BIOSes).
+ *  Alex Achenbach <xela@xxxxxxx>, December 2002.
+ *
+ */
+
+/*
+ * This file handles the architecture-dependent parts of initialization
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/tty.h>
+#include <linux/ioport.h>
+#include <linux/acpi.h>
+#include <linux/apm_bios.h>
+#include <linux/initrd.h>
+#include <linux/bootmem.h>
+#include <linux/seq_file.h>
+#include <linux/console.h>
+#include <linux/mca.h>
+#include <linux/root_dev.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/init.h>
+#include <linux/edd.h>
+#include <linux/nodemask.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <video/edid.h>
+#include <asm/e820.h>
+#include <asm/mpspec.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/sections.h>
+#include <asm/io_apic.h>
+#include <asm/ist.h>
+#include <asm/io.h>
+#include <xen_hypervisor.h>
+#include <xen-public/physdev.h>
+#include "setup_arch_pre.h"
+#include <bios_ebda.h>
+
+/* Allows setting of maximum possible memory size  */
+static unsigned long xen_override_max_pfn;
+
+/* Notifier block wrapping xen_panic_event(); defined later in this file. */
+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xen_panic_block = {
+       .notifier_call  = xen_panic_event,
+       .next           = NULL,
+       .priority       = 0,    /* try to go last */
+};
+
+int disable_pse __initdata = 0;
+
+/*
+ * Machine setup..
+ */
+
+#ifdef CONFIG_EFI
+int efi_enabled = 0;
+EXPORT_SYMBOL(efi_enabled);
+#endif
+
+/* cpu data as detected by the assembly code in head.S */
+struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 0, 1, 0, -1 };
+/* common cpu data for all cpus */
+struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 0, 1, 0, -1 };
+
+unsigned long mmu_cr4_features;
+
+#ifdef CONFIG_ACPI_INTERPRETER
+       int acpi_disabled = 0;
+#else
+       int acpi_disabled = 1;
+#endif
+EXPORT_SYMBOL(acpi_disabled);
+
+#ifdef CONFIG_ACPI_BOOT
+int __initdata acpi_force = 0;
+extern acpi_interrupt_flags    acpi_sci_flags;
+#endif
+
+/* for MCA, but anyone else can use it if they want */
+unsigned int machine_id;
+unsigned int machine_submodel_id;
+unsigned int BIOS_revision;
+unsigned int mca_pentium_flag;
+
+/* For PCI or other memory-mapped resources */
+unsigned long pci_mem_start = 0x10000000;
+
+/* Boot loader ID as an integer, for the benefit of proc_dointvec */
+int bootloader_type;
+
+/* user-defined highmem size */
+static unsigned int highmem_pages = -1;
+
+/*
+ * Setup options
+ */
+struct drive_info_struct { char dummy[32]; } drive_info;
+struct screen_info screen_info;
+struct apm_info apm_info;
+struct sys_desc_table_struct {
+       unsigned short length;
+       unsigned char table[0];
+};
+struct edid_info edid_info;
+struct ist_info ist_info;
+struct e820map e820;
+
+extern void early_cpu_init(void);
+extern void dmi_scan_machine(void);
+extern void generic_apic_probe(char *);
+extern int root_mountflags;
+
+unsigned long saved_videomode;
+
+#define RAMDISK_IMAGE_START_MASK       0x07FF
+#define RAMDISK_PROMPT_FLAG            0x8000
+#define RAMDISK_LOAD_FLAG              0x4000  
+
+static char command_line[COMMAND_LINE_SIZE];
+
+unsigned char __initdata boot_params[PARAM_SIZE];
+
+static struct resource data_resource = {
+       .name   = "Kernel data",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource code_resource = {
+       .name   = "Kernel code",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+static struct resource system_rom_resource = {
+       .name   = "System ROM",
+       .start  = 0xf0000,
+       .end    = 0xfffff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+static struct resource extension_rom_resource = {
+       .name   = "Extension ROM",
+       .start  = 0xe0000,
+       .end    = 0xeffff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+
+/*
+ * Slots for adapter ROMs discovered by probe_roms().
+ *
+ * Only slot 0 has a preset .start (0xc8000): probe_roms() reads it as the
+ * upper bound of the video ROM scan before any slot is filled in.  The
+ * .start/.end of each slot are overwritten as adapter ROMs are found.
+ */
+static struct resource adapter_rom_resources[] = { {
+       .name   = "Adapter ROM",
+       .start  = 0xc8000,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+}, {
+       .name   = "Adapter ROM",
+       .start  = 0,
+       .end    = 0,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+} };
+
+/* Number of slots in adapter_rom_resources[]. */
+#define ADAPTER_ROM_RESOURCES \
+       (sizeof adapter_rom_resources / sizeof adapter_rom_resources[0])
+
+static struct resource video_rom_resource = {
+       .name   = "Video ROM",
+       .start  = 0xc0000,
+       .end    = 0xc7fff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
+};
+#endif
+
+static struct resource video_ram_resource = {
+       .name   = "Video RAM area",
+       .start  = 0xa0000,
+       .end    = 0xbffff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+static struct resource standard_io_resources[] = { {
+       .name   = "dma1",
+       .start  = 0x0000,
+       .end    = 0x001f,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "pic1",
+       .start  = 0x0020,
+       .end    = 0x0021,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "timer0",
+       .start  = 0x0040,
+       .end    = 0x0043,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "timer1",
+       .start  = 0x0050,
+       .end    = 0x0053,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "keyboard",
+       .start  = 0x0060,
+       .end    = 0x006f,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "dma page reg",
+       .start  = 0x0080,
+       .end    = 0x008f,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "pic2",
+       .start  = 0x00a0,
+       .end    = 0x00a1,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "dma2",
+       .start  = 0x00c0,
+       .end    = 0x00df,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+}, {
+       .name   = "fpu",
+       .start  = 0x00f0,
+       .end    = 0x00ff,
+       .flags  = IORESOURCE_BUSY | IORESOURCE_IO
+} };
+
+#define STANDARD_IO_RESOURCES \
+       (sizeof standard_io_resources / sizeof standard_io_resources[0])
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
+
+/*
+ * Sum 'length' bytes starting at 'rom' in an unsigned char (i.e. modulo
+ * 256).  Returns 1 when the sum is zero, 0 otherwise.
+ */
+static int __init romchecksum(unsigned char *rom, unsigned long length)
+{
+       unsigned char acc = 0;
+       unsigned long idx;
+
+       for (idx = 0; idx < length; idx++)
+               acc += rom[idx];
+
+       return !acc;
+}
+
+/*
+ * Probe for ROM images (accessed through isa_bus_to_virt()) and claim each
+ * one found in iomem_resource.
+ *
+ * Nothing is probed unless SIF_INITDOMAIN is set in xen_start_info.flags.
+ * The scan covers, in order: the video ROM, the system ROM, the optional
+ * extension ROM, and up to ADAPTER_ROM_RESOURCES adapter ROMs on 2k
+ * boundaries below the system (or extension) ROM.
+ */
+static void __init probe_roms(void)
+{
+       unsigned long start, length, upper;
+       unsigned char *rom;
+       int           i;
+
+       /* Nothing to do if not running in dom0. */
+       if (!(xen_start_info.flags & SIF_INITDOMAIN))
+               return;
+
+       /* video rom: scan 2k steps up to the first adapter ROM slot's start */
+       upper = adapter_rom_resources[0].start;
+       for (start = video_rom_resource.start; start < upper; start += 2048) {
+               rom = isa_bus_to_virt(start);
+               if (!romsignature(rom))
+                       continue;
+
+               video_rom_resource.start = start;
+
+               /* 0 < length <= 0x7f * 512, historically */
+               length = rom[2] * 512;
+
+               /* if checksum okay, trust length byte */
+               if (length && romchecksum(rom, length))
+                       video_rom_resource.end = start + length - 1;
+
+               request_resource(&iomem_resource, &video_rom_resource);
+               break;
+       }
+
+       /* adapter scan starts past the video ROM, rounded up to 2k ... */
+       start = (video_rom_resource.end + 1 + 2047) & ~2047UL;
+       /* ... but never below the first adapter slot's preset start */
+       if (start < upper)
+               start = upper;
+
+       /* system rom */
+       request_resource(&iomem_resource, &system_rom_resource);
+       upper = system_rom_resource.start;
+
+       /* check for extension rom (ignore length byte!) */
+       rom = isa_bus_to_virt(extension_rom_resource.start);
+       if (romsignature(rom)) {
+               length = extension_rom_resource.end - extension_rom_resource.start + 1;
+               if (romchecksum(rom, length)) {
+                       request_resource(&iomem_resource, &extension_rom_resource);
+                       upper = extension_rom_resource.start;
+               }
+       }
+
+       /* check for adapter roms on 2k boundaries */
+       for (i = 0; i < ADAPTER_ROM_RESOURCES && start < upper; start += 2048) {
+               rom = isa_bus_to_virt(start);
+               if (!romsignature(rom))
+                       continue;
+
+               /* 0 < length <= 0x7f * 512, historically */
+               length = rom[2] * 512;
+
+               /* but accept any length that fits if checksum okay */
+               if (!length || start + length > upper || !romchecksum(rom, length))
+                       continue;
+
+               adapter_rom_resources[i].start = start;
+               adapter_rom_resources[i].end = start + length - 1;
+               request_resource(&iomem_resource, &adapter_rom_resources[i]);
+
+               /*
+                * Resume from the 2k block holding this ROM's last byte; the
+                * loop increment then steps to the next 2k boundary.
+                */
+               start = adapter_rom_resources[i++].end & ~2047UL;
+       }
+}
+#endif
+
+/*
+ * Point at the empty zero page to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
+unsigned int *phys_to_machine_mapping, *pfn_to_mfn_frame_list;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/* Raw start-of-day parameters from the hypervisor. */
+union xen_start_info_union xen_start_info_union;
+
+/*
+ * Trim the firmware memory map so that usable memory ends at 'size' bytes.
+ *
+ * With EFI enabled, the first EFI_CONVENTIONAL_MEMORY descriptor that ends
+ * at or beyond 'size' is shortened (by whole pages, rounded up) and the map
+ * is truncated after it.  If no descriptor qualifies, control falls through
+ * to the e820 walk below.
+ *
+ * For e820, the first E820_RAM entry that ends at or beyond 'size' is
+ * shortened to end exactly at 'size' and the map is truncated after it.
+ */
+static void __init limit_regions(unsigned long long size)
+{
+       unsigned long long current_addr = 0;
+       int i;
+
+       if (efi_enabled) {
+               for (i = 0; i < memmap.nr_map; i++) {
+                       current_addr = memmap.map[i].phys_addr +
+                                      (memmap.map[i].num_pages << 12);
+                       if (memmap.map[i].type == EFI_CONVENTIONAL_MEMORY) {
+                               if (current_addr >= size) {
+                                       memmap.map[i].num_pages -=
+                                               (((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
+                                       memmap.nr_map = i + 1;
+                                       return;
+                               }
+                       }
+               }
+       }
+       for (i = 0; i < e820.nr_map; i++) {
+               if (e820.map[i].type == E820_RAM) {
+                       current_addr = e820.map[i].addr + e820.map[i].size;
+                       if (current_addr >= size) {
+                               e820.map[i].size -= current_addr-size;
+                               e820.nr_map = i + 1;
+                               return;
+                       }
+               }
+       }
+}
+
+/*
+ * Append one entry (start, size, type) to the e820 map.
+ *
+ * Does nothing when EFI is enabled.  If the map already holds E820MAX
+ * entries, an error is logged and the region is dropped.
+ */
+static void __init add_memory_region(unsigned long long start,
+                                  unsigned long long size, int type)
+{
+       int x;
+
+       if (!efi_enabled) {
+               x = e820.nr_map;
+
+               if (x == E820MAX) {
+                   printk(KERN_ERR "Ooops! Too many entries in the memory map!\n");
+                   return;
+               }
+
+               e820.map[x].addr = start;
+               e820.map[x].size = size;
+               e820.map[x].type = type;
+               e820.nr_map++;
+       }
+} /* add_memory_region */
+
+#define E820_DEBUG     1
+
+/*
+ * Log every e820 entry as "<who>: <start> - <end> <kind>", where <kind> is
+ * a fixed label for the known region types and "type N" for anything else.
+ */
+static void __init print_memory_map(char *who)
+{
+       int idx;
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               struct e820entry *ent = &e820.map[idx];
+
+               printk(" %s: %016Lx - %016Lx ", who,
+                       ent->addr,
+                       ent->addr + ent->size);
+
+               if (ent->type == E820_RAM)
+                       printk("(usable)\n");
+               else if (ent->type == E820_RESERVED)
+                       printk("(reserved)\n");
+               else if (ent->type == E820_ACPI)
+                       printk("(ACPI data)\n");
+               else if (ent->type == E820_NVS)
+                       printk("(ACPI NVS)\n");
+               else
+                       printk("type %lu\n", ent->type);
+       }
+}
+
+#if 0
+/*
+ * Sanitize the BIOS e820 map.
+ *
+ * Some e820 responses include overlapping entries.  The following 
+ * replaces the original e820 map with a new one, removing overlaps.
+ *
+ */
+struct change_member {
+       struct e820entry *pbios; /* pointer to original bios entry */
+       unsigned long long addr; /* address for this change point */
+};
+static struct change_member change_point_list[2*E820MAX] __initdata;
+static struct change_member *change_point[2*E820MAX] __initdata;
+static struct e820entry *overlap_list[E820MAX] __initdata;
+static struct e820entry new_bios[E820MAX] __initdata;
+
+/*
+ * Rewrite 'biosmap' (with *pnr_map entries) in place as a map without
+ * overlaps; where entries overlap, the numerically largest type wins.
+ *
+ * Returns -1, leaving the map untouched, if there are fewer than two
+ * entries or any entry wraps past the end of the address space.  Otherwise
+ * returns 0 with *pnr_map updated to the new entry count.
+ */
+static int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
+{
+       struct change_member *change_tmp;
+       unsigned long current_type, last_type;
+       unsigned long long last_addr;
+       int chgidx, still_changing;
+       int overlap_entries;
+       int new_bios_entry;
+       int old_nr, new_nr, chg_nr;
+       int i;
+
+       /*
+               Visually we're performing the following (1,2,3,4 = memory types)...
+
+               Sample memory map (w/overlaps):
+                  ____22__________________
+                  ______________________4_
+                  ____1111________________
+                  _44_____________________
+                  11111111________________
+                  ____________________33__
+                  ___________44___________
+                  __________33333_________
+                  ______________22________
+                  ___________________2222_
+                  _________111111111______
+                  _____________________11_
+                  _________________4______
+
+               Sanitized equivalent (no overlap):
+                  1_______________________
+                  _44_____________________
+                  ___1____________________
+                  ____22__________________
+                  ______11________________
+                  _________1______________
+                  __________3_____________
+                  ___________44___________
+                  _____________33_________
+                  _______________2________
+                  ________________1_______
+                  _________________4______
+                  ___________________2____
+                  ____________________33__
+                  ______________________4_
+       */
+
+       /* if there's only one memory region, don't bother */
+       if (*pnr_map < 2)
+               return -1;
+
+       old_nr = *pnr_map;
+
+       /* bail out if we find any unreasonable addresses in bios map */
+       for (i=0; i<old_nr; i++)
+               if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr)
+                       return -1;
+
+       /* create pointers for initial change-point information (for sorting) */
+       for (i=0; i < 2*old_nr; i++)
+               change_point[i] = &change_point_list[i];
+
+       /* record all known change-points (starting and ending addresses),
+          omitting those that are for empty memory regions */
+       chgidx = 0;
+       for (i=0; i < old_nr; i++)      {
+               if (biosmap[i].size != 0) {
+                       change_point[chgidx]->addr = biosmap[i].addr;
+                       change_point[chgidx++]->pbios = &biosmap[i];
+                       change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size;
+                       change_point[chgidx++]->pbios = &biosmap[i];
+               }
+       }
+       chg_nr = chgidx;        /* true number of change-points */
+
+       /* sort change-point list by memory addresses (low -> high) */
+       still_changing = 1;
+       while (still_changing)  {
+               still_changing = 0;
+               for (i=1; i < chg_nr; i++)  {
+                       /* if <current_addr> < <last_addr>, swap */
+                       /* or, if current=<start_addr> & last=<end_addr>, swap */
+                       if ((change_point[i]->addr < change_point[i-1]->addr) ||
+                               ((change_point[i]->addr == change_point[i-1]->addr) &&
+                                (change_point[i]->addr == change_point[i]->pbios->addr) &&
+                                (change_point[i-1]->addr != change_point[i-1]->pbios->addr))
+                          )
+                       {
+                               change_tmp = change_point[i];
+                               change_point[i] = change_point[i-1];
+                               change_point[i-1] = change_tmp;
+                               still_changing=1;
+                       }
+               }
+       }
+
+       /* create a new bios memory map, removing overlaps */
+       overlap_entries=0;       /* number of entries in the overlap table */
+       new_bios_entry=0;        /* index for creating new bios map entries */
+       last_type = 0;           /* start with undefined memory type */
+       last_addr = 0;           /* start with 0 as last starting address */
+       /* loop through change-points, determining affect on the new bios map */
+       for (chgidx=0; chgidx < chg_nr; chgidx++)
+       {
+               /* keep track of all overlapping bios entries */
+               if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr)
+               {
+                       /* add map entry to overlap list (> 1 entry implies an overlap) */
+                       overlap_list[overlap_entries++]=change_point[chgidx]->pbios;
+               }
+               else
+               {
+                       /* remove entry from list (order independent, so swap with last) */
+                       for (i=0; i<overlap_entries; i++)
+                       {
+                               if (overlap_list[i] == change_point[chgidx]->pbios)
+                                       overlap_list[i] = overlap_list[overlap_entries-1];
+                       }
+                       overlap_entries--;
+               }
+               /* if there are overlapping entries, decide which "type" to use */
+               /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */
+               current_type = 0;
+               for (i=0; i<overlap_entries; i++)
+                       if (overlap_list[i]->type > current_type)
+                               current_type = overlap_list[i]->type;
+               /* continue building up new bios map based on this information */
+               if (current_type != last_type)  {
+                       if (last_type != 0)      {
+                               new_bios[new_bios_entry].size =
+                                       change_point[chgidx]->addr - last_addr;
+                               /* move forward only if the new size was non-zero */
+                               if (new_bios[new_bios_entry].size != 0)
+                                       if (++new_bios_entry >= E820MAX)
+                                               break;  /* no more space left for new bios entries */
+                       }
+                       if (current_type != 0)  {
+                               new_bios[new_bios_entry].addr = change_point[chgidx]->addr;
+                               new_bios[new_bios_entry].type = current_type;
+                               last_addr=change_point[chgidx]->addr;
+                       }
+                       last_type = current_type;
+               }
+       }
+       new_nr = new_bios_entry;   /* retain count for new bios entries */
+
+       /* copy new bios mapping into original location */
+       memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry));
+       *pnr_map = new_nr;
+
+       return 0;
+}
+
+/*
+ * Copy the BIOS e820 map into a safe place.
+ *
+ * Sanity-check it while we're at it..
+ *
+ * If we're lucky and live on a modern system, the setup code
+ * will have given us a memory map that we can use to properly
+ * set up memory.  If we aren't, we'll fake a memory map.
+ *
+ * We check to see that the memory map contains at least 2 elements
+ * before we'll use it, because the detection code in setup.S may
+ * not be perfect and most every PC known to man has two memory
+ * regions: one from 0 to 640k, and one from 1mb up.  (The IBM
+ * thinkpad 560x, for example, does not cooperate with the memory
+ * detection code.)
+ */
+static int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+{
+       /* Only one memory region (or negative)? Ignore it */
+       if (nr_map < 2)
+               return -1;
+
+       do {
+               unsigned long long start = biosmap->addr;
+               unsigned long long size = biosmap->size;
+               unsigned long long end = start + size;
+               unsigned long type = biosmap->type;
+
+               /* Overflow in 64 bits? Ignore the memory map. */
+               if (start > end)
+                       return -1;
+
+               /*
+                * Some BIOSes claim RAM in the 640k - 1M region.
+                * Not right. Fix it up.
+                */
+               if (type == E820_RAM) {
+                       if (start < 0x100000ULL && end > 0xA0000ULL) {
+                               /* keep the part below 640k, if any */
+                               if (start < 0xA0000ULL)
+                                       add_memory_region(start, 0xA0000ULL-start, type);
+                               /*
+                                * Nothing above 1M: 'continue' in a do-while
+                                * jumps to the loop condition, which still
+                                * advances biosmap and decrements nr_map.
+                                */
+                               if (end <= 0x100000ULL)
+                                       continue;
+                               start = 0x100000ULL;
+                               size = end - start;
+                       }
+               }
+               add_memory_region(start, size, type);
+       } while (biosmap++,--nr_map);
+       return 0;
+}
+#endif
+
+#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
+/* Kernel copy of the EDD data; exported only when EDD is built as a module. */
+struct edd edd;
+#ifdef CONFIG_EDD_MODULE
+EXPORT_SYMBOL(edd);
+#endif
+/**
+ * copy_edd() - Copy the BIOS EDD information
+ *              from boot_params into a safe place.
+ *
+ * Fills edd.mbr_signature and edd.edd_info from EDD_MBR_SIGNATURE and
+ * EDD_BUF, and records the entry counts from EDD_MBR_SIG_NR and EDD_NR.
+ */
+static inline void copy_edd(void)
+{
+     memcpy(edd.mbr_signature, EDD_MBR_SIGNATURE, sizeof(edd.mbr_signature));
+     memcpy(edd.edd_info, EDD_BUF, sizeof(edd.edd_info));
+     edd.mbr_signature_nr = EDD_MBR_SIG_NR;
+     edd.edd_info_nr = EDD_NR;
+}
+#else
+/* EDD support not configured: nothing to copy. */
+static inline void copy_edd(void)
+{
+}
+#endif
+
+/*
+ * Do NOT EVER look at the BIOS memory size location.
+ * It does not work on many machines.
+ */
+#define LOWMEMSIZE()   (0x9f000)
+
+/*
+ * Early command-line parsing.
+ *
+ * Copies at most min(MAX_GUEST_CMDLINE, COMMAND_LINE_SIZE) bytes of
+ * xen_start_info.cmd_line into saved_command_line (NUL-terminated), then
+ * walks it one character at a time, copying into command_line.  Options are
+ * only recognised at the start of a word (when the previous character was a
+ * space).  "mem=" and "memmap=" are consumed and left out of command_line;
+ * "highmem=" and "vmalloc=" are also skipped over by memparse().
+ *
+ * On return *cmdline_p points at command_line.  Under Xen a plain
+ * "mem=<size>" only records xen_override_max_pfn; it does not trim the map.
+ */
+static void __init parse_cmdline_early (char ** cmdline_p)
+{
+       char c = ' ', *to = command_line, *from = saved_command_line;
+       int len = 0, max_cmdline;
+       int userdef = 0;
+
+       if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
+               max_cmdline = COMMAND_LINE_SIZE;
+       memcpy(saved_command_line, xen_start_info.cmd_line, max_cmdline);
+       /* Save unparsed command line copy for /proc/cmdline */
+       saved_command_line[max_cmdline-1] = '\0';
+
+       for (;;) {
+               if (c != ' ')
+                       goto next_char;
+               /*
+                * "mem=nopentium" disables the 4MB page tables.
+                * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM
+                * to <mem>, overriding the bios size.
+                * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from
+                * <start> to <start>+<mem>, overriding the bios size.
+                *
+                * HPA tells me bootloaders need to parse mem=, so no new
+                * option should be mem=  [also see Documentation/i386/boot.txt]
+                */
+               if (!memcmp(from, "mem=", 4)) {
+                       /* drop the space already copied before this option */
+                       if (to != command_line)
+                               to--;
+                       if (!memcmp(from+4, "nopentium", 9)) {
+                               from += 9+4;
+                               clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability);
+                               disable_pse = 1;
+                       } else {
+                               /* If the user specifies memory size, we
+                                * limit the BIOS-provided memory map to
+                                * that size. exactmap can be used to specify
+                                * the exact map. mem=number can be used to
+                                * trim the existing memory map.
+                                */
+                               unsigned long long mem_size;
+
+                               mem_size = memparse(from+4, &from);
+#if 0
+                               limit_regions(mem_size);
+                               userdef=1;
+#else
+                               xen_override_max_pfn =
+                                       (unsigned long)(mem_size>>PAGE_SHIFT);
+#endif
+                       }
+               }
+
+               else if (!memcmp(from, "memmap=", 7)) {
+                       if (to != command_line)
+                               to--;
+                       if (!memcmp(from+7, "exactmap", 8)) {
+                               from += 8+7;
+                               e820.nr_map = 0;
+                               userdef = 1;
+                       } else {
+                               /* If the user specifies memory size, we
+                                * limit the BIOS-provided memory map to
+                                * that size. exactmap can be used to specify
+                                * the exact map. mem=number can be used to
+                                * trim the existing memory map.
+                                */
+                               unsigned long long start_at, mem_size;
+
+                               mem_size = memparse(from+7, &from);
+                               if (*from == '@') {
+                                       start_at = memparse(from+1, &from);
+                                       add_memory_region(start_at, mem_size, E820_RAM);
+                               } else if (*from == '#') {
+                                       start_at = memparse(from+1, &from);
+                                       add_memory_region(start_at, mem_size, E820_ACPI);
+                               } else if (*from == '$') {
+                                       start_at = memparse(from+1, &from);
+                                       add_memory_region(start_at, mem_size, E820_RESERVED);
+                               } else {
+                                       limit_regions(mem_size);
+                                       userdef=1;
+                               }
+                       }
+               }
+
+               else if (!memcmp(from, "noexec=", 7))
+                       noexec_setup(from + 7);
+
+
+#ifdef  CONFIG_X86_MPPARSE
+               /*
+                * If the BIOS enumerates physical processors before logical,
+                * maxcpus=N at enumeration-time can be used to disable HT.
+                */
+               else if (!memcmp(from, "maxcpus=", 8)) {
+                       extern unsigned int maxcpus;
+
+                       maxcpus = simple_strtoul(from + 8, NULL, 0);
+               }
+#endif
+
+#ifdef CONFIG_ACPI_BOOT
+               /* "acpi=off" disables both ACPI table parsing and interpreter */
+               else if (!memcmp(from, "acpi=off", 8)) {
+                       disable_acpi();
+               }
+
+               /* acpi=force to over-ride black-list */
+               else if (!memcmp(from, "acpi=force", 10)) {
+                       acpi_force = 1;
+                       acpi_ht = 1;
+                       acpi_disabled = 0;
+               }
+
+               /* acpi=strict disables out-of-spec workarounds */
+               else if (!memcmp(from, "acpi=strict", 11)) {
+                       acpi_strict = 1;
+               }
+
+               /* Limit ACPI just to boot-time to enable HT */
+               else if (!memcmp(from, "acpi=ht", 7)) {
+                       if (!acpi_force)
+                               disable_acpi();
+                       acpi_ht = 1;
+               }
+
+               /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */
+               else if (!memcmp(from, "pci=noacpi", 10)) {
+                       acpi_disable_pci();
+               }
+               /* "acpi=noirq" disables ACPI interrupt routing */
+               else if (!memcmp(from, "acpi=noirq", 10)) {
+                       acpi_noirq_set();
+               }
+
+               else if (!memcmp(from, "acpi_sci=edge", 13))
+                       acpi_sci_flags.trigger =  1;
+
+               else if (!memcmp(from, "acpi_sci=level", 14))
+                       acpi_sci_flags.trigger = 3;
+
+               else if (!memcmp(from, "acpi_sci=high", 13))
+                       acpi_sci_flags.polarity = 1;
+
+               else if (!memcmp(from, "acpi_sci=low", 12))
+                       acpi_sci_flags.polarity = 3;
+
+#ifdef CONFIG_X86_IO_APIC
+               else if (!memcmp(from, "acpi_skip_timer_override", 24))
+                       acpi_skip_timer_override = 1;
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+               /* disable IO-APIC */
+               else if (!memcmp(from, "noapic", 6))
+                       disable_ioapic_setup();
+#endif /* CONFIG_X86_LOCAL_APIC */
+#endif /* CONFIG_ACPI_BOOT */
+
+               /*
+                * highmem=size forces highmem to be exactly 'size' bytes.
+                * This works even on boxes that have no highmem otherwise.
+                * This also works to reduce highmem size on bigger boxes.
+                */
+               else if (!memcmp(from, "highmem=", 8))
+                       highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT;
+
+               /*
+                * vmalloc=size forces the vmalloc area to be exactly 'size'
+                * bytes. This can be used to increase (or decrease) the
+                * vmalloc area - the default is 128m.
+                */
+               else if (!memcmp(from, "vmalloc=", 8))
+                       __VMALLOC_RESERVE = memparse(from+8, &from);
+
+       next_char:
+               c = *(from++);
+               if (!c)
+                       break;
+               /* stop before overrunning command_line[] */
+               if (COMMAND_LINE_SIZE <= ++len)
+                       break;
+               *(to++) = c;
+       }
+       *to = '\0';
+       *cmdline_p = command_line;
+       if (userdef) {
+               printk(KERN_INFO "user-defined physical RAM map:\n");
+               print_memory_map("user");
+       }
+}
+
+#if 0 /* !XEN */
+/*
+ * Range callback that find_max_pfn() hands to efi_memmap_walk().
+ * For a range with start < end, raise the running maximum that 'arg'
+ * points at to PFN_UP(end - 1) when that value is larger.
+ * Always returns 0.
+ */
+static int __init
+efi_find_max_pfn(unsigned long start, unsigned long end, void *arg)
+{
+       unsigned long *highest = arg;
+       unsigned long last;
+
+       /* Empty or inverted ranges contribute nothing. */
+       if (start >= end)
+               return 0;
+
+       last = PFN_UP(end - 1);
+       if (*highest < last)
+               *highest = last;
+       return 0;
+}
+
+
+/*
+ * Native variant (compiled out by the enclosing "#if 0"): set max_pfn
+ * to the highest page frame number reported by the EFI memory map when
+ * efi_enabled is set, otherwise by the E820_RAM entries of the e820 map.
+ */
+void __init find_max_pfn(void)
+{
+       int idx;
+
+       max_pfn = 0;
+
+       if (!efi_enabled) {
+               for (idx = 0; idx < e820.nr_map; idx++) {
+                       unsigned long first_pfn, end_pfn;
+
+                       if (e820.map[idx].type == E820_RAM) {
+                               first_pfn = PFN_UP(e820.map[idx].addr);
+                               end_pfn = PFN_DOWN(e820.map[idx].addr +
+                                                  e820.map[idx].size);
+                               /* Ignore regions that round away to nothing. */
+                               if (first_pfn < end_pfn && max_pfn < end_pfn)
+                                       max_pfn = end_pfn;
+                       }
+               }
+               return;
+       }
+
+       efi_memmap_walk(efi_find_max_pfn, &max_pfn);
+}
+#else
+/*
+ * Xen variant.  The fake e820 map is not consulted because a user
+ * override has to be honoured: xen_override_max_pfn is raised to at
+ * least xen_start_info.nr_pages and then becomes max_pfn.
+ */
+void __init find_max_pfn(void)
+{
+       if (xen_start_info.nr_pages > xen_override_max_pfn)
+               xen_override_max_pfn = xen_start_info.nr_pages;
+
+       max_pfn = xen_override_max_pfn;
+}
+#endif /* XEN */
+
+/*
+ * Determine low and high memory ranges:
+ *
+ * Starting from max_pfn, work out how many page frames are treated as
+ * low memory and return that count.  Along the way highmem_pages is
+ * resolved (a value of -1 is replaced by a computed one; a request that
+ * cannot be honoured is reset to 0 with a message) and max_pfn may be
+ * lowered to what this kernel configuration can use.
+ */
+unsigned long __init find_max_low_pfn(void)
+{
+       unsigned long max_low_pfn;
+
+       max_low_pfn = max_pfn;
+       if (max_low_pfn > MAXMEM_PFN) {
+               /* More page frames than fit below MAXMEM_PFN. */
+               if (highmem_pages == -1)
+                       highmem_pages = max_pfn - MAXMEM_PFN;
+               if (highmem_pages + MAXMEM_PFN < max_pfn)
+                       max_pfn = MAXMEM_PFN + highmem_pages;
+               if (highmem_pages + MAXMEM_PFN > max_pfn) {
+                       printk("only %luMB highmem pages available, "
+                              "ignoring highmem size of %uMB.\n",
+                              pages_to_mb(max_pfn - MAXMEM_PFN),
+                              pages_to_mb(highmem_pages));
+                       highmem_pages = 0;
+               }
+               max_low_pfn = MAXMEM_PFN;
+#ifndef CONFIG_HIGHMEM
+               /* Maximum memory usable is what is directly addressable */
+               printk(KERN_WARNING "Warning only %ldMB will be used.\n",
+                                       MAXMEM>>20);
+               if (max_pfn > MAX_NONPAE_PFN)
+                       printk(KERN_WARNING "Use a PAE enabled kernel.\n");
+               else
+                       printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
+               max_pfn = MAXMEM_PFN;
+#else /* CONFIG_HIGHMEM */
+#ifndef CONFIG_X86_PAE
+               if (max_pfn > MAX_NONPAE_PFN) {
+                       max_pfn = MAX_NONPAE_PFN;
+                       printk(KERN_WARNING "Warning only 4GB will be used.\n");
+                       printk(KERN_WARNING "Use a PAE enabled kernel.\n");
+               }
+#endif /* !CONFIG_X86_PAE */
+#endif /* !CONFIG_HIGHMEM */
+       } else {
+               /* Everything fits below MAXMEM_PFN. */
+               if (highmem_pages == -1)
+                       highmem_pages = 0;
+#ifdef CONFIG_HIGHMEM
+               if (highmem_pages >= max_pfn) {
+                       printk(KERN_ERR "highmem size specified (%uMB) is "
+                              "bigger than pages available (%luMB)!.\n",
+                              pages_to_mb(highmem_pages),
+                              pages_to_mb(max_pfn));
+                       highmem_pages = 0;
+               }
+               if (highmem_pages) {
+                       /* Refuse a split that leaves under 64MB of lowmem. */
+                       if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){
+                               printk(KERN_ERR "highmem size %uMB results in "
+                                      "smaller than 64MB lowmem, "
+                                      "ignoring it.\n",
+                                      pages_to_mb(highmem_pages));
+                               highmem_pages = 0;
+                       }
+                       max_low_pfn -= highmem_pages;
+               }
+#else
+               if (highmem_pages)
+                       printk(KERN_ERR "ignoring highmem size on "
+                              "non-highmem kernel!\n");
+#endif
+       }
+       return max_low_pfn;
+}
+
+/*
+ * Range callback passed to efi_memmap_walk() by
+ * register_bootmem_low_pages().  The part of [start, end) that lies
+ * below (max_low_pfn + 1) << PAGE_SHIFT is released to the bootmem
+ * allocator; 'arg' is unused.  Always returns 0.
+ */
+static int __init
+free_available_memory(unsigned long start, unsigned long end, void *arg)
+{
+       if (start < ((max_low_pfn + 1) << PAGE_SHIFT)) {
+               /* Clip the end of the range to the low-memory limit. */
+               if (end >= ((max_low_pfn + 1) << PAGE_SHIFT))
+                       end = (max_low_pfn + 1) << PAGE_SHIFT;
+               if (end > start)
+                       free_bootmem(start, end - start);
+       }
+
+       return 0;
+}
+/*
+ * Hand every fully usable low-memory RAM page to the bootmem allocator.
+ * With EFI the memory map is walked using free_available_memory();
+ * otherwise each E820_RAM entry is trimmed to whole pages below
+ * max_low_pfn and freed.
+ */
+static void __init register_bootmem_low_pages(unsigned long max_low_pfn)
+{
+       int idx;
+
+       if (efi_enabled) {
+               efi_memmap_walk(free_available_memory, NULL);
+               return;
+       }
+
+       for (idx = 0; idx < e820.nr_map; idx++) {
+               unsigned long first_pfn, end_pfn, npages;
+
+               /* Only usable RAM is of interest. */
+               if (e820.map[idx].type != E820_RAM)
+                       continue;
+
+               /* Round the start of the region up to a page boundary ... */
+               first_pfn = PFN_UP(e820.map[idx].addr);
+               if (first_pfn >= max_low_pfn)
+                       continue;
+
+               /* ... round its end down, and clip it to low memory. */
+               end_pfn = PFN_DOWN(e820.map[idx].addr +
+                                  e820.map[idx].size);
+               if (end_pfn > max_low_pfn)
+                       end_pfn = max_low_pfn;
+
+               /* Free whatever survived the rounding, if anything did. */
+               if (end_pfn > first_pfn) {
+                       npages = end_pfn - first_pfn;
+                       free_bootmem(PFN_PHYS(first_pfn), PFN_PHYS(npages));
+               }
+       }
+}
+
+#ifndef CONFIG_XEN
+/*
+ * Workaround for Dell systems that neglect to reserve the EBDA: if
+ * get_bios_ebda() reports a non-zero address, reserve one page of
+ * bootmem starting there.
+ */
+static void __init reserve_ebda_region(void)
+{
+       unsigned int ebda = get_bios_ebda();
+
+       if (!ebda)
+               return;
+
+       reserve_bootmem(ebda, PAGE_SIZE);
+}
+#endif
+
+#ifndef CONFIG_DISCONTIGMEM
+void __init setup_bootmem_allocator(void);
+/*
+ * Memory setup for the !CONFIG_DISCONTIGMEM case.
+ *
+ * Sets min_low_pfn to the page frame of xen_start_info.pt_base (rounded
+ * up) plus xen_start_info.nr_pt_frames, determines max_pfn and
+ * max_low_pfn, reports the resulting sizes and initialises the bootmem
+ * allocator.  Returns max_low_pfn.
+ */
+static unsigned long __init setup_memory(void)
+{
+
+       /*
+        * partially used pages are not usable - thus
+        * we are rounding upwards:
+        */
+       min_low_pfn = PFN_UP(__pa(xen_start_info.pt_base)) +
+               xen_start_info.nr_pt_frames;
+
+       find_max_pfn();
+
+       max_low_pfn = find_max_low_pfn();
+
+#ifdef CONFIG_HIGHMEM
+       /* Highmem is empty unless max_pfn lies above max_low_pfn. */
+       highstart_pfn = highend_pfn = max_pfn;
+       if (max_pfn > max_low_pfn) {
+               highstart_pfn = max_low_pfn;
+       }
+       printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
+               pages_to_mb(highend_pfn - highstart_pfn));
+#endif
+       printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
+                       pages_to_mb(max_low_pfn));
+
+       setup_bootmem_allocator();
+
+       return max_low_pfn;
+}
+
+/*
+ * Fill in the per-zone page counts and pass them to free_area_init().
+ *
+ * XEN: "DMA memory" is a fiction when running over Xen, so the DMA
+ * limit is set equal to max_low_pfn.  All low RAM therefore lands in
+ * ZONE_DMA, so drivers that needlessly ask for GFP_DMA are not starved;
+ * drivers that genuinely need lowmem are out of luck under Xen anyway.
+ */
+void __init zone_sizes_init(void)
+{
+       unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+       unsigned int low = max_low_pfn;
+       unsigned int max_dma = max_low_pfn;
+
+       if (low >= max_dma) {
+               zones_size[ZONE_DMA] = max_dma;
+               zones_size[ZONE_NORMAL] = low - max_dma;
+#ifdef CONFIG_HIGHMEM
+               zones_size[ZONE_HIGHMEM] = highend_pfn - low;
+#endif
+       } else {
+               zones_size[ZONE_DMA] = low;
+       }
+       free_area_init(zones_size);
+}
+#else
+extern unsigned long setup_memory(void);
+extern void zone_sizes_init(void);
+#endif /* !CONFIG_DISCONTIGMEM */
+
+void __init setup_bootmem_allocator(void)
+{
+       unsigned long bootmap_size;
+       /*
+        * Initialize the boot-time allocator (with low memory only):
+        *
+        * init_bootmem() is given min_low_pfn and max_low_pfn and returns
+        * the size of its bitmap, which is kept in bootmap_size for the
+        * reservation further down.  The usable low RAM is then freed
+        * into the allocator, the native-only BIOS reservations are made
+        * (compiled out under CONFIG_XEN), the initrd location is
+        * recorded, and finally phys_to_machine_mapping is pointed at
+        * xen_start_info.mfn_list.
+        */
+       bootmap_size = init_bootmem(min_low_pfn, max_low_pfn);
+
+       register_bootmem_low_pages(max_low_pfn);
+
+       /*
+        * Reserve the bootmem bitmap itself as well. We do this in two
+        * steps (first step was init_bootmem()) because this catches
+        * the (very unlikely) case of us accidentally initializing the
+        * bootmem allocator with an invalid RAM area.
+        *
+        * The reserved range starts at HIGH_MEMORY and ends at
+        * PFN_PHYS(min_low_pfn) + bootmap_size + PAGE_SIZE-1.
+        */
+       reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) +
+                        bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
+
+#ifndef CONFIG_XEN
+       /*
+        * reserve physical page 0 - it's a special BIOS page on many boxes,
+        * enabling clean reboots, SMP operation, laptop functions.
+        */
+       reserve_bootmem(0, PAGE_SIZE);
+
+       /* reserve EBDA region, it's a 4K region */
+       reserve_ebda_region();
+
+    /* could be an AMD 768MPX chipset. Reserve a page  before VGA to prevent
+       PCI prefetch into it (errata #56). Usually the page is reserved anyways,
+       unless you have no PS/2 mouse plugged in. */
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+           boot_cpu_data.x86 == 6)
+            reserve_bootmem(0xa0000 - 4096, 4096);
+
+#ifdef CONFIG_SMP
+       /*
+        * But first pinch a few for the stack/trampoline stuff
+        * FIXME: Don't need the extra page at 4K, but need to fix
+        * trampoline before removing it. (see the GDT stuff)
+        */
+       reserve_bootmem(PAGE_SIZE, PAGE_SIZE);
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_ACPI_SLEEP
+       /*
+        * Reserve low memory region for sleep support.
+        */
+       acpi_reserve_bootmem();
+#endif /* CONFIG_ACPI_SLEEP */
+#endif /* !CONFIG_XEN */
+
+#ifdef CONFIG_BLK_DEV_INITRD
+       if (xen_start_info.mod_start) { /* presumably set when an initrd was passed */
+               if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
+                       /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
+                       initrd_start = INITRD_START + PAGE_OFFSET;
+                       initrd_end = initrd_start+INITRD_SIZE;
+                       initrd_below_start_ok = 1;
+               }
+               else {
+                       printk(KERN_ERR "initrd extends beyond end of memory "
+                           "(0x%08lx > 0x%08lx)\ndisabling initrd\n",
+                           INITRD_START + INITRD_SIZE,
+                           max_low_pfn << PAGE_SHIFT);
+                       initrd_start = 0;
+               }
+       }
+#endif /* CONFIG_BLK_DEV_INITRD */
+
+       phys_to_machine_mapping = (unsigned int *)xen_start_info.mfn_list; /* NOTE(review): setup_arch() handles this list as unsigned long[] - confirm the declared type */
+}
+
+/*
+ * Zero the pglist_data of every online node except node 0.
+ *
+ * Node 0's pgdat is set up earlier because bootmem needs it.  The
+ * pgdats of the other nodes get their virtual space before the
+ * pagetables that map it exist, so they cannot be cleared until now.
+ *
+ * This should all compile down to nothing when NUMA is off.
+ */
+void __init remapped_pgdat_init(void)
+{
+       int nid;
+
+       for_each_online_node(nid) {
+               if (nid == 0)
+                       continue;
+               memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
+       }
+}
+
+/*
+ * Request address space for all standard RAM and ROM resources
+ * and also for regions reported as reserved by the e820.
+ *
+ * Each e820 entry that ends at or below 4GB gets a bootmem-allocated
+ * struct resource, named after its type, which is requested from
+ * iomem_resource.  code_resource and data_resource are then requested
+ * as children of every RAM entry.  probe_roms() is only called when
+ * CONFIG_XEN_PRIVILEGED_GUEST is set.
+ */
+static void __init
+legacy_init_iomem_resources(struct resource *code_resource,
+                            struct resource *data_resource)
+{
+       int i;
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       probe_roms();
+#endif
+       for (i = 0; i < e820.nr_map; i++) {
+               struct resource *res;
+               /* Entries reaching above 4GB are skipped. */
+               if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
+                       continue;
+               res = alloc_bootmem_low(sizeof(struct resource));
+               switch (e820.map[i].type) {
+               case E820_RAM:  res->name = "System RAM"; break;
+               case E820_ACPI: res->name = "ACPI Tables"; break;
+               case E820_NVS:  res->name = "ACPI Non-volatile Storage"; break;
+               default:        res->name = "reserved";
+               }
+               res->start = e820.map[i].addr;
+               res->end = res->start + e820.map[i].size - 1;
+               res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+               request_resource(&iomem_resource, res);
+               if (e820.map[i].type == E820_RAM) {
+                       /*
+                        *  We don't know which RAM region contains kernel data,
+                        *  so we try it repeatedly and let the resource manager
+                        *  test it.
+                        */
+                       request_resource(res, code_resource);
+                       request_resource(res, data_resource);
+               }
+       }
+}
+
+/*
+ * Request address space for all standard resources
+ *
+ * Registers the memory resources (EFI or legacy e820 path), the video
+ * RAM resource when SIF_INITDOMAIN is set in xen_start_info.flags, and
+ * the standard I/O port ranges.  It then picks pci_mem_start from the
+ * largest gap below 4GB in the e820 map.
+ */
+static void __init register_memory(void)
+{
+       unsigned long gapstart, gapsize;
+       unsigned long long last;
+       int           i;
+
+       if (efi_enabled)
+               efi_initialize_iomem_resources(&code_resource, &data_resource);
+       else
+               legacy_init_iomem_resources(&code_resource, &data_resource);
+
+       if (xen_start_info.flags & SIF_INITDOMAIN)
+               /* EFI systems may still have VGA */
+               request_resource(&iomem_resource, &video_ram_resource);
+
+       /* request I/O space for devices used on all i[345]86 PCs */
+       for (i = 0; i < STANDARD_IO_RESOURCES; i++)
+               request_resource(&ioport_resource, &standard_io_resources[i]);
+
+       /*
+        * Search for the biggest gap in the low 32 bits of the e820
+        * memory space.  The map is walked from its last entry down;
+        * the values below are the defaults used when no larger gap
+        * is found.
+        */
+       last = 0x100000000ull;
+       gapstart = 0x10000000;
+       gapsize = 0x400000;
+       i = e820.nr_map;
+       while (--i >= 0) {
+               unsigned long long start = e820.map[i].addr;
+               unsigned long long end = start + e820.map[i].size;
+
+               /*
+                * Since "last" is at most 4GB, we know we'll
+                * fit in 32 bits if this condition is true
+                */
+               if (last > end) {
+                       unsigned long gap = last - end;
+
+                       if (gap > gapsize) {
+                               gapsize = gap;
+                               gapstart = end;
+                       }
+               }
+               if (start < last)
+                       last = start;
+       }
+
+       /*
+        * Start allocating dynamic PCI memory a bit into the gap,
+        * aligned up to the nearest megabyte.
+        *
+        * Question: should we try to pad it up a bit (do something
+        * like " + (gapsize >> 3)" in there too?). We now have the
+        * technology.
+        */
+       pci_mem_start = (gapstart + 0xfffff) & ~0xfffff;
+
+       printk("Allocating PCI resources starting at %08lx "
+              "(gap: %08lx:%08lx)\n",
+               pci_mem_start, gapstart, gapsize);
+}
+
+/* Use inline assembly to define this because the nops are defined
+   as inline assembly strings in the include files and we cannot
+   get them easily into strings.
+
+   Each blob below is the 1-byte to 8-byte nop of one family emitted
+   back to back, in that order, into .data. */
+asm("\t.data\nintelnops: "
+    GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4
+    GENERIC_NOP5 GENERIC_NOP6 GENERIC_NOP7 GENERIC_NOP8);
+asm("\t.data\nk8nops: "
+    K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
+    K8_NOP7 K8_NOP8);
+asm("\t.data\nk7nops: "
+    K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
+    K7_NOP7 K7_NOP8);
+
+extern unsigned char intelnops[], k8nops[], k7nops[];
+
+/*
+ * Per-family lookup tables: entry k points at the k-byte nop inside
+ * the matching blob, i.e. at offset 1 + 2 + ... + (k - 1).  Entry 0 is
+ * unused.
+ */
+static unsigned char *intel_nops[ASM_NOP_MAX+1] = {
+     NULL,
+     intelnops,
+     intelnops + 1,
+     intelnops + 1 + 2,
+     intelnops + 1 + 2 + 3,
+     intelnops + 1 + 2 + 3 + 4,
+     intelnops + 1 + 2 + 3 + 4 + 5,
+     intelnops + 1 + 2 + 3 + 4 + 5 + 6,
+     intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+static unsigned char *k8_nops[ASM_NOP_MAX+1] = {
+     NULL,
+     k8nops,
+     k8nops + 1,
+     k8nops + 1 + 2,
+     k8nops + 1 + 2 + 3,
+     k8nops + 1 + 2 + 3 + 4,
+     k8nops + 1 + 2 + 3 + 4 + 5,
+     k8nops + 1 + 2 + 3 + 4 + 5 + 6,
+     k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+static unsigned char *k7_nops[ASM_NOP_MAX+1] = {
+     NULL,
+     k7nops,
+     k7nops + 1,
+     k7nops + 1 + 2,
+     k7nops + 1 + 2 + 3,
+     k7nops + 1 + 2 + 3 + 4,
+     k7nops + 1 + 2 + 3 + 4 + 5,
+     k7nops + 1 + 2 + 3 + 4 + 5 + 6,
+     k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
+};
+
+/*
+ * CPU feature bit -> nop table, searched in order by
+ * apply_alternatives(); the entry with cpuid -1 ends the list.
+ */
+static struct nop {
+     int cpuid;
+     unsigned char **noptable;
+} noptypes[] = {
+     { X86_FEATURE_K8, k8_nops },
+     { X86_FEATURE_K7, k7_nops },
+     { -1, NULL }
+};
+
+/* Replace instructions with better alternatives for this CPU type.
+
+   This runs before SMP is initialized to avoid SMP problems with
+   self modifying code. This implies that asymmetric systems where
+   APs have less capabilities than the boot processor are not handled. 
+   In this case boot with "noreplacement".
+
+   start and end delimit an array of struct alt_instr.  For every entry
+   whose a->cpuid feature the boot CPU has, a->replacement is copied
+   over a->instr and the remaining instrlen - replacementlen bytes are
+   filled with nops.  The nop table comes from the first noptypes[]
+   entry whose feature the boot CPU has; intel_nops is used when none
+   matches.  A replacement longer than the original instruction hits
+   BUG_ON(). */ 
+void apply_alternatives(void *start, void *end) 
+{ 
+       struct alt_instr *a; 
+       int diff, i, k;
+        unsigned char **noptable = intel_nops; /* fallback table */
+       for (i = 0; noptypes[i].cpuid >= 0; i++) { 
+               if (boot_cpu_has(noptypes[i].cpuid)) { 
+                       noptable = noptypes[i].noptable;
+                       break;
+               }
+       } 
+       for (a = start; (void *)a < end; a++) { 
+               if (!boot_cpu_has(a->cpuid))
+                       continue;
+               BUG_ON(a->replacementlen > a->instrlen); 
+               memcpy(a->instr, a->replacement, a->replacementlen); 
+               diff = a->instrlen - a->replacementlen; 
+               /* Pad the rest with nops, at most ASM_NOP_MAX bytes per chunk */
+               for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
+                       k = diff;
+                       if (k > ASM_NOP_MAX)
+                               k = ASM_NOP_MAX;
+                       memcpy(a->instr + i, noptable[k], k); /* noptable[k]: k-byte nop */
+               } 
+       }
+} 
+
+static int no_replacement __initdata = 0; 
+ 
+/*
+ * Patch the alt_instr entries between __alt_instructions and
+ * __alt_instructions_end, unless no_replacement has been set.
+ */
+void __init alternative_instructions(void)
+{
+       extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
+
+       if (!no_replacement)
+               apply_alternatives(__alt_instructions, __alt_instructions_end);
+}
+
+/*
+ * Handler for the "noreplacement" option: sets no_replacement, which
+ * makes alternative_instructions() return without patching anything.
+ * The option argument 's' is ignored.
+ */
+static int __init noreplacement_setup(char *s)
+{
+       no_replacement = 1;
+
+       return 0;
+}
+
+__setup("noreplacement", noreplacement_setup); 
+
+static char * __init machine_specific_memory_setup(void);
+
+/*
+ * Record the MCA bus flag in MCA_bus; does nothing when the kernel is
+ * built without CONFIG_MCA.
+ */
+static void set_mca_bus(int x)
+{
+#ifdef CONFIG_MCA
+       MCA_bus = x;
+#endif
+}
+
+/*
+ * Architecture-specific boot-time setup for the Xen subarchitecture.
+ *
+ * In order: registers the Xen panic notifier and enables the vm_assist
+ * features, copies the boot parameters into kernel variables, parses
+ * the early command line (result returned through cmdline_p), sets up
+ * the bootmem allocator, paging and zones, resizes the
+ * phys_to_machine_mapping table when max_pfn differs from the number
+ * of pages Xen reported, publishes the pfn-to-mfn frame list in the
+ * shared info page, and finally registers resources and selects the
+ * console.
+ *
+ * Determine if we were loaded by an EFI loader.  If so, then we have also been
+ * passed the efi memmap, systab, etc., so we should use these data structures
+ * for initialization.  Note, the efi init code path is determined by the
+ * global efi_enabled. This allows the same kernel image to be used on existing
+ * systems (with a traditional BIOS) as well as on EFI systems.
+ */
+void __init setup_arch(char **cmdline_p)
+{
+       int i, j;
+       physdev_op_t op;
+       unsigned long max_low_pfn;
+
+       /* Force a quick death if the kernel panics. */
+       extern int panic_timeout;
+       if (panic_timeout == 0)
+               panic_timeout = 1;
+
+       /* Register a call for panic conditions. */
+       notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+
+       HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+       HYPERVISOR_vm_assist(VMASST_CMD_enable,
+                            VMASST_TYPE_writable_pagetables);
+
+       memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
+       early_cpu_init();
+
+       /*
+        * FIXME: This isn't an official loader_type right
+        * now but does currently work with elilo.
+        * If we were configured as an EFI kernel, check to make
+        * sure that we were loaded correctly from elilo and that
+        * the system table is valid.  If not, then initialize normally.
+        */
+#ifdef CONFIG_EFI
+       if ((LOADER_TYPE == 0x50) && EFI_SYSTAB)
+               efi_enabled = 1;
+#endif
+
+       /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
+          properly.  Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
+       */
+       ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
+       drive_info = DRIVE_INFO;
+       screen_info = SCREEN_INFO;
+       edid_info = EDID_INFO;
+       apm_info.bios = APM_BIOS_INFO;
+       ist_info = IST_INFO;
+       saved_videomode = VIDEO_MODE;
+       if( SYS_DESC_TABLE.length != 0 ) {
+               set_mca_bus(SYS_DESC_TABLE.table[3] & 0x2);
+               machine_id = SYS_DESC_TABLE.table[0];
+               machine_submodel_id = SYS_DESC_TABLE.table[1];
+               BIOS_revision = SYS_DESC_TABLE.table[2];
+       }
+       bootloader_type = LOADER_TYPE;
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+       /* This is drawn from a dump from vgacon:startup in standard Linux. */
+       screen_info.orig_video_mode = 3;
+       screen_info.orig_video_isVGA = 1;
+       screen_info.orig_video_lines = 25;
+       screen_info.orig_video_cols = 80;
+       screen_info.orig_video_ega_bx = 3;
+       screen_info.orig_video_points = 16;
+#endif
+
+#ifdef CONFIG_BLK_DEV_RAM
+       rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
+       rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
+       rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
+#endif
+       ARCH_SETUP
+       if (efi_enabled)
+               efi_init();
+       else {
+               printk(KERN_INFO "BIOS-provided physical RAM map:\n");
+               print_memory_map(machine_specific_memory_setup());
+       }
+
+       copy_edd();
+
+       if (!MOUNT_ROOT_RDONLY)
+               root_mountflags &= ~MS_RDONLY;
+       init_mm.start_code = (unsigned long) _text;
+       init_mm.end_code = (unsigned long) _etext;
+       init_mm.end_data = (unsigned long) _edata;
+       init_mm.brk = (PFN_UP(__pa(xen_start_info.pt_base)) +
+                      xen_start_info.nr_pt_frames) << PAGE_SHIFT;
+
+       /* XEN: This is nonsense: kernel may not even be contiguous in RAM. */
+       /*code_resource.start = virt_to_phys(_text);*/
+       /*code_resource.end = virt_to_phys(_etext)-1;*/
+       /*data_resource.start = virt_to_phys(_etext);*/
+       /*data_resource.end = virt_to_phys(_edata)-1;*/
+
+       parse_cmdline_early(cmdline_p);
+
+       max_low_pfn = setup_memory();
+
+       /*
+        * NOTE: before this point _nobody_ is allowed to allocate
+        * any memory using the bootmem allocator.  Although the
+        * allocator is now initialised only the first 8Mb of the kernel
+        * virtual address space has been mapped.  All allocations before
+        * paging_init() has completed must use the alloc_bootmem_low_pages()
+        * variant (which allocates DMA'able memory) and care must be taken
+        * not to exceed the 8Mb limit.
+        */
+
+#ifdef CONFIG_SMP
+       smp_alloc_memory(); /* AP processor realmode stacks in low memory*/
+#endif
+       paging_init();
+       remapped_pgdat_init();
+       zone_sizes_init();
+
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+       /*
+        * Find and reserve possible boot-time SMP configuration:
+        */
+       find_smp_config();
+#endif
+
+       /*
+        * Make sure we have a correctly sized P->M table.  When max_pfn
+        * differs from the page count Xen reported, a new table of
+        * max_pfn entries is allocated and the old list is freed.
+        */
+       if (max_pfn != xen_start_info.nr_pages) {
+               phys_to_machine_mapping = alloc_bootmem_low_pages(
+                       max_pfn * sizeof(unsigned long));
+
+               if (max_pfn > xen_start_info.nr_pages) {
+                       /* set to INVALID_P2M_ENTRY */
+                       memset(phys_to_machine_mapping, ~0,
+                               max_pfn * sizeof(unsigned long));
+                       memcpy(phys_to_machine_mapping,
+                               (unsigned long *)xen_start_info.mfn_list,
+                               xen_start_info.nr_pages *
+                               sizeof(unsigned long));
+               } else {
+                       /*
+                        * Fewer pages wanted than Xen provided: keep the
+                        * first max_pfn entries and ask Xen to take the
+                        * remaining frames back.
+                        */
+                       memcpy(phys_to_machine_mapping,
+                               (unsigned long *)xen_start_info.mfn_list,
+                               max_pfn * sizeof(unsigned long));
+                       if (HYPERVISOR_dom_mem_op(
+                               MEMOP_decrease_reservation,
+                               (unsigned long *)xen_start_info.mfn_list +
+                               max_pfn,
+                               xen_start_info.nr_pages - max_pfn, 0) !=
+                           (xen_start_info.nr_pages - max_pfn)) BUG();
+               }
+               free_bootmem(
+                       __pa(xen_start_info.mfn_list),
+                       PFN_PHYS(PFN_UP(xen_start_info.nr_pages *
+                       sizeof(unsigned long))));
+       }
+
+       /*
+        * Record the machine frame of every page of the P->M table in a
+        * single page, and publish that page's machine frame through the
+        * shared info structure.
+        * NOTE(review): only one page is allocated for the frame list;
+        * confirm that max_pfn can never need more entries than fit.
+        */
+       pfn_to_mfn_frame_list = alloc_bootmem_low_pages(PAGE_SIZE);
+       for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ )
+       {
+            pfn_to_mfn_frame_list[j] =
+                 virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT;
+       }
+       HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list =
+            virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT;
+
+
+       /*
+        * NOTE: at this point the bootmem allocator is fully available.
+        */
+
+#ifdef CONFIG_EARLY_PRINTK
+       {
+               char *s = strstr(*cmdline_p, "earlyprintk=");
+               if (s) {
+                       extern void setup_early_printk(char *);
+
+                       setup_early_printk(s);
+                       printk("early console enabled\n");
+               }
+       }
+#endif
+
+
+       dmi_scan_machine();
+
+#ifdef CONFIG_X86_GENERICARCH
+       generic_apic_probe(*cmdline_p);
+#endif
+       if (efi_enabled)
+               efi_map_memmap();
+
+       /* Ask Xen for I/O privilege level 1 and record it in the thread. */
+       op.cmd             = PHYSDEVOP_SET_IOPL;
+       op.u.set_iopl.iopl = current->thread.io_pl = 1;
+       HYPERVISOR_physdev_op(&op);
+
+#ifdef CONFIG_ACPI_BOOT
+       if (!(xen_start_info.flags & SIF_INITDOMAIN)) {
+               printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+               acpi_disabled = 1;
+               acpi_ht = 0;
+       }
+#endif
+
+#ifdef CONFIG_ACPI_BOOT
+       /*
+        * Parse the ACPI tables for possible boot-time SMP configuration.
+        */
+       acpi_boot_table_init();
+       acpi_boot_init();
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+       if (smp_found_config)
+               get_smp_config();
+#endif
+
+       /* XXX Disable irqdebug until we have a way to avoid interrupt
+        * conflicts. */
+       noirqdebug_setup("");
+
+       register_memory();
+
+       if (xen_start_info.flags & SIF_INITDOMAIN) {
+               if (!(xen_start_info.flags & SIF_PRIVILEGED))
+                       panic("Xen granted us console access "
+                             "but not privileged status");
+
+#ifdef CONFIG_VT
+#if defined(CONFIG_VGA_CONSOLE)
+               if (!efi_enabled ||
+                   (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+                       conswitchp = &vga_con;
+#elif defined(CONFIG_DUMMY_CONSOLE)
+               conswitchp = &dummy_con;
+#endif
+#endif
+       } else {
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+               extern const struct consw xennull_con;
+               extern int console_use_vt;
+#if defined(CONFIG_VGA_CONSOLE)
+               /* disable VGA driver */
+               ORIG_VIDEO_ISVGA = VIDEO_TYPE_VLFB;
+#endif
+               conswitchp = &xennull_con;
+               console_use_vt = 0;
+#endif
+       }
+}
+
+/*
+ * Notifier callback (presumably the handler behind xen_panic_block,
+ * which setup_arch() registers on panic_notifier_list - confirm
+ * against its definition).  Calls HYPERVISOR_crash(); all three
+ * arguments are ignored.
+ */
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+       HYPERVISOR_crash();
+
+       /* Not expected to be reached. */
+       return NOTIFY_DONE;
+}
+
+#include "setup_arch_post.h"
+/*
+ * Local Variables:
+ * mode:c
+ * c-file-style:"k&r"
+ * c-basic-offset:8
+ * End:
+ */
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/signal.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/signal.c
@@ -0,0 +1,665 @@
+/*
+ *  linux/arch/i386/kernel/signal.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  1997-11-28  Modified for POSIX.1b signals by Richard Henderson
+ *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/kernel.h>
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/wait.h>
+#include <linux/unistd.h>
+#include <linux/stddef.h>
+#include <linux/personality.h>
+#include <linux/suspend.h>
+#include <linux/ptrace.h>
+#include <linux/elf.h>
+#include <asm/processor.h>
+#include <asm/ucontext.h>
+#include <asm/uaccess.h>
+#include <asm/i387.h>
+#include "../../kernel/sigframe.h"
+
+#define DEBUG_SIG 0
+
+#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+
+/* sys_sigsuspend: under siglock, replace the blocked set with 'mask' (SIGKILL
+ * and SIGSTOP are never blocked), then sleep until do_signal() returns
+ * non-zero.  Always returns -EINTR; the old set is handed to do_signal(). */
+asmlinkage int
+sys_sigsuspend(int history0, int history1, old_sigset_t mask)
+{
+       struct pt_regs * regs = (struct pt_regs *) &history0;   /* the argument area is reinterpreted as the saved register frame */
+       sigset_t saveset;
+
+       mask &= _BLOCKABLE;     /* strip SIGKILL/SIGSTOP from the requested mask */
+       spin_lock_irq(&current->sighand->siglock);
+       saveset = current->blocked;     /* previous mask, passed to do_signal() below */
+       siginitset(&current->blocked, mask);
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
+
+       regs->eax = -EINTR;     /* pre-set the saved eax that do_signal() will work with */
+       while (1) {
+               current->state = TASK_INTERRUPTIBLE;
+               schedule();
+               if (do_signal(regs, &saveset))  /* non-zero: stop waiting */
+                       return -EINTR;
+       }
+}
+
+asmlinkage int                 /* rt flavour: the new mask is read from user memory */
+sys_rt_sigsuspend(struct pt_regs regs)
+{
+       sigset_t saveset, newset;
+
+       /* XXX: Don't preclude handling different sized sigset_t's.  */
+       if (regs.ecx != sizeof(sigset_t))       /* ecx: sigset size supplied by the caller */
+               return -EINVAL;
+
+       if (copy_from_user(&newset, (sigset_t __user *)regs.ebx, sizeof(newset)))
+               return -EFAULT;
+       sigdelsetmask(&newset, ~_BLOCKABLE);    /* SIGKILL/SIGSTOP stay unblocked */
+
+       spin_lock_irq(&current->sighand->siglock);
+       saveset = current->blocked;     /* previous mask, passed to do_signal() below */
+       current->blocked = newset;
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
+
+       regs.eax = -EINTR;
+       while (1) {
+               current->state = TASK_INTERRUPTIBLE;
+               schedule();
+               if (do_signal(&regs, &saveset)) /* non-zero: stop waiting */
+                       return -EINTR;
+       }
+}
+
+asmlinkage int                 /* old-ABI sigaction: the mask is a single old_sigset_t word */
+sys_sigaction(int sig, const struct old_sigaction __user *act,
+             struct old_sigaction __user *oact)
+{
+       struct k_sigaction new_ka, old_ka;
+       int ret;
+
+       if (act) {
+               old_sigset_t mask;      /* every read below is checked: a fault must not leave new_ka half-filled */
+               if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
+                   __get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
+                   __get_user(new_ka.sa.sa_restorer, &act->sa_restorer) ||
+                   __get_user(new_ka.sa.sa_flags, &act->sa_flags) ||
+                   __get_user(mask, &act->sa_mask))
+                       return -EFAULT;
+               siginitset(&new_ka.sa.sa_mask, mask);
+       }
+
+       ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
+
+       if (!ret && oact) {     /* report the previous action; any failed write yields -EFAULT */
+               if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) ||
+                   __put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
+                   __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer) ||
+                   __put_user(old_ka.sa.sa_flags, &oact->sa_flags) ||
+                   __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask))
+                       return -EFAULT;
+       }
+
+       return ret;
+}
+
+asmlinkage int                 /* thin wrapper: unpacks the arguments and calls do_sigaltstack() */
+sys_sigaltstack(unsigned long ebx)
+{
+       /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
+       struct pt_regs *regs = (struct pt_regs *)&ebx;
+       const stack_t __user *uss = (const stack_t __user *)ebx;       /* first argument */
+       stack_t __user *uoss = (stack_t __user *)regs->ecx;     /* second argument, taken from the saved ecx */
+
+       return do_sigaltstack(uss, uoss, regs->esp);    /* saved user esp is passed along as well */
+}
+
+
+/*
+ * Do a signal return; undo the signal stack.
+ */
+
+static int                     /* 0 on success, non-zero if the user frame could not be read */
+restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
+{
+       unsigned int err = 0;   /* accumulates (ORs) the result of every user read */
+
+       /* Always make any pending restarted system calls return -EINTR */
+       current_thread_info()->restart_block.fn = do_no_restart_syscall;
+
+#define COPY(x)                err |= __get_user(regs->x, &sc->x)
+
+#define COPY_SEG(seg)                                                  \
+       { unsigned short tmp;                                           \
+         err |= __get_user(tmp, &sc->seg);                             \
+         regs->x##seg = tmp; }
+
+#define COPY_SEG_STRICT(seg)                                           \
+       { unsigned short tmp;                                           \
+         err |= __get_user(tmp, &sc->seg);                             \
+         regs->x##seg = tmp|3; }
+
+#define GET_SEG(seg)                                                   \
+       { unsigned short tmp;                                           \
+         err |= __get_user(tmp, &sc->seg);                             \
+         loadsegment(seg,tmp); }
+
+#define        FIX_EFLAGS      (X86_EFLAGS_AC | X86_EFLAGS_OF | X86_EFLAGS_DF | \
+                        X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
+                        X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
+
+       GET_SEG(gs);            /* gs and fs are loaded directly, not stored in regs */
+       GET_SEG(fs);
+       COPY_SEG(es);
+       COPY_SEG(ds);
+       COPY(edi);
+       COPY(esi);
+       COPY(ebp);
+       COPY(esp);
+       COPY(ebx);
+       COPY(edx);
+       COPY(ecx);
+       COPY(eip);
+       COPY_SEG_STRICT(cs);    /* low two selector bits are forced to 3 */
+       COPY_SEG_STRICT(ss);
+
+       {
+               unsigned int tmpflags;
+               err |= __get_user(tmpflags, &sc->eflags);
+               regs->eflags = (regs->eflags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+               regs->orig_eax = -1;            /* disable syscall checks */
+       }
+
+       {
+               struct _fpstate __user * buf;
+               err |= __get_user(buf, &sc->fpstate);
+               if (buf) {      /* frame carries FP state: validate the pointer, then restore */
+                       if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
+                               goto badframe;
+                       err |= restore_i387(buf);
+               } else {        /* no FP state in the frame: drop any the task currently has */
+                       struct task_struct *me = current;
+                       if (used_math()) {
+                               clear_fpu(me);
+                               clear_used_math();
+                       }
+               }
+       }
+
+       err |= __get_user(*peax, &sc->eax);     /* eax goes back through *peax, not through regs */
+       return err;
+
+badframe:
+       return 1;
+}
+
+asmlinkage int sys_sigreturn(unsigned long __unused)   /* restore mask and registers from a struct sigframe */
+{
+       struct pt_regs *regs = (struct pt_regs *) &__unused;
+       struct sigframe __user *frame = (struct sigframe __user *)(regs->esp - 8);
+       sigset_t set;           /* blocked mask read back from the frame */
+       int eax;                /* eax value recovered from the frame; becomes the return value */
+
+       if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+               goto badframe;
+       if (__get_user(set.sig[0], &frame->sc.oldmask)
+           || (_NSIG_WORDS > 1
+               && __copy_from_user(&set.sig[1], &frame->extramask,
+                                   sizeof(frame->extramask))))
+               goto badframe;
+
+       sigdelsetmask(&set, ~_BLOCKABLE);       /* SIGKILL/SIGSTOP can never be blocked */
+       spin_lock_irq(&current->sighand->siglock);
+       current->blocked = set;
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
+
+       if (restore_sigcontext(regs, &frame->sc, &eax))
+               goto badframe;
+       return eax;
+
+badframe:
+       force_sig(SIGSEGV, current);    /* unreadable frame: SIGSEGV the task */
+       return 0;
+}
+
+asmlinkage int sys_rt_sigreturn(unsigned long __unused)        /* restore mask, registers and uc_stack from a struct rt_sigframe */
+{
+       struct pt_regs *regs = (struct pt_regs *) &__unused;
+       struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->esp - 4);
+       sigset_t set;           /* blocked mask read back from uc_sigmask */
+       int eax;                /* eax value recovered from the frame; becomes the return value */
+
+       if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
+               goto badframe;
+       if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
+               goto badframe;
+
+       sigdelsetmask(&set, ~_BLOCKABLE);       /* SIGKILL/SIGSTOP can never be blocked */
+       spin_lock_irq(&current->sighand->siglock);
+       current->blocked = set;
+       recalc_sigpending();
+       spin_unlock_irq(&current->sighand->siglock);
+
+       if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &eax))
+               goto badframe;
+
+       if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->esp) == -EFAULT)    /* only -EFAULT is treated as a bad frame */
+               goto badframe;
+
+       return eax;
+
+badframe:
+       force_sig(SIGSEGV, current);    /* unreadable frame: SIGSEGV the task */
+       return 0;
+}
+
+/*
+ * Set up a signal frame.
+ */
+
+static int                     /* 0 on success, non-zero if a user write or save_i387() failed */
+setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
+                struct pt_regs *regs, unsigned long mask)
+{
+       int tmp, err = 0;       /* err ORs together the result of every user write */
+
+       tmp = 0;        /* presumably so the bits above the 16-bit selector read as zero - TODO confirm */
+       __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));  /* gs and fs are read live from the CPU, not from regs */
+       err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
+       __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
+       err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
+
+       err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
+       err |= __put_user(regs->xds, (unsigned int __user *)&sc->ds);
+       err |= __put_user(regs->edi, &sc->edi);
+       err |= __put_user(regs->esi, &sc->esi);
+       err |= __put_user(regs->ebp, &sc->ebp);
+       err |= __put_user(regs->esp, &sc->esp);
+       err |= __put_user(regs->ebx, &sc->ebx);
+       err |= __put_user(regs->edx, &sc->edx);
+       err |= __put_user(regs->ecx, &sc->ecx);
+       err |= __put_user(regs->eax, &sc->eax);
+       err |= __put_user(current->thread.trap_no, &sc->trapno);
+       err |= __put_user(current->thread.error_code, &sc->err);
+       err |= __put_user(regs->eip, &sc->eip);
+       err |= __put_user(regs->xcs, (unsigned int __user *)&sc->cs);
+       err |= __put_user(regs->eflags, &sc->eflags);
+       err |= __put_user(regs->esp, &sc->esp_at_signal);       /* same value as sc->esp above */
+       err |= __put_user(regs->xss, (unsigned int __user *)&sc->ss);
+
+       tmp = save_i387(fpstate);       /* <0: failure; 0: NULL is stored in sc->fpstate; >0: the fpstate pointer is stored */
+       if (tmp < 0)
+         err = 1;
+       else
+         err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
+
+       /* non-iBCS2 extensions.. */
+       err |= __put_user(mask, &sc->oldmask);
+       err |= __put_user(current->thread.cr2, &sc->cr2);
+
+       return err;
+}
+
+/*
+ * Choose the user-space address for a new signal frame of frame_size bytes.
+ */
+static inline void __user *
+get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
+{
+       /* Unless one of the cases below applies, stay on the current stack. */
+       unsigned long sp = regs->esp;
+
+       if (ka->sa.sa_flags & SA_ONSTACK) {
+               /*
+                * X/Open style switching: move to the top of the alternate
+                * stack, but only when sas_ss_flags() returns 0 for sp.
+                */
+               if (!sas_ss_flags(sp))
+                       sp = current->sas_ss_sp + current->sas_ss_size;
+       } else if ((regs->xss & 0xffff) != __USER_DS &&
+                  !(ka->sa.sa_flags & SA_RESTORER) && ka->sa.sa_restorer) {
+               /* Legacy switching: sa_restorer is used as the stack top. */
+               sp = (unsigned long) ka->sa.sa_restorer;
+       }
+
+       /* Reserve room for the frame, then round down to 8 bytes. */
+       sp -= frame_size;
+       return (void __user *)(sp & -8ul);
+}
+
+/* These symbols are defined with the addresses in the vsyscall page.
+   See vsyscall-sigreturn.S.  */
+extern void __user __kernel_sigreturn;
+extern void __user __kernel_rt_sigreturn;
+
+static void setup_frame(int sig, struct k_sigaction *ka,       /* builds a struct sigframe on the user stack and points regs at the handler */
+                       sigset_t *set, struct pt_regs * regs)
+{
+       void __user *restorer;
+       struct sigframe __user *frame;
+       int err = 0;
+       int usig;       /* signal number as stored in the frame, after exec-domain mapping */
+
+       frame = get_sigframe(ka, regs, sizeof(*frame));
+
+       if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+               goto give_sigsegv;
+
+       usig = current_thread_info()->exec_domain       /* map through signal_invmap when one exists and sig < 32 */
+               && current_thread_info()->exec_domain->signal_invmap
+               && sig < 32
+               ? current_thread_info()->exec_domain->signal_invmap[sig]
+               : sig;
+
+       err = __put_user(usig, &frame->sig);
+       if (err)
+               goto give_sigsegv;
+
+       err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); /* first mask word goes into the sigcontext */
+       if (err)
+               goto give_sigsegv;
+
+       if (_NSIG_WORDS > 1) {  /* remaining mask words go into extramask */
+               err = __copy_to_user(&frame->extramask, &set->sig[1],
+                                     sizeof(frame->extramask));
+               if (err)
+                       goto give_sigsegv;
+       }
+
+       restorer = &__kernel_sigreturn; /* default return stub; overridden when SA_RESTORER is set */
+       if (ka->sa.sa_flags & SA_RESTORER)
+               restorer = ka->sa.sa_restorer;
+
+       /* Set up to return from userspace.  */
+       err |= __put_user(restorer, &frame->pretcode);
+        
+       /*
+        * This is popl %eax ; movl $,%eax ; int $0x80
+        *
+        * WE DO NOT USE IT ANY MORE! It's only left here for historical
+        * reasons and because gdb uses it as a signature to notice
+        * signal handler stack frames.
+        */
+       err |= __put_user(0xb858, (short __user *)(frame->retcode+0));
+       err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2));
+       err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
+
+       if (err)
+               goto give_sigsegv;
+
+       /* Set up registers for signal handler */
+       regs->esp = (unsigned long) frame;
+       regs->eip = (unsigned long) ka->sa.sa_handler;
+       regs->eax = (unsigned long) sig;        /* note: the unmapped sig, not usig */
+       regs->edx = (unsigned long) 0;
+       regs->ecx = (unsigned long) 0;
+
+       set_fs(USER_DS);
+       regs->xds = __USER_DS;
+       regs->xes = __USER_DS;
+       regs->xss = __USER_DS;
+       regs->xcs = __USER_CS;
+
+       /*
+        * Clear TF when entering the signal handler, but
+        * notify any tracer that was single-stepping it.
+        * The tracer may want to single-step inside the
+        * handler too.
+        */
+       regs->eflags &= ~TF_MASK;
+       if (test_thread_flag(TIF_SINGLESTEP))
+               ptrace_notify(SIGTRAP);
+
+#if DEBUG_SIG
+       printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+               current->comm, current->pid, frame, regs->eip, frame->pretcode);
+#endif
+
+       return;
+
+give_sigsegv:  /* the frame could not be written: regs are left untouched */
+       force_sigsegv(sig, current);
+}
+
+static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,   /* builds a struct rt_sigframe (siginfo + ucontext) and points regs at the handler */
+                          sigset_t *set, struct pt_regs * regs)
+{
+       void __user *restorer;
+       struct rt_sigframe __user *frame;
+       int err = 0;
+       int usig;       /* signal number as stored in the frame, after exec-domain mapping */
+
+       frame = get_sigframe(ka, regs, sizeof(*frame));
+
+       if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+               goto give_sigsegv;
+
+       usig = current_thread_info()->exec_domain       /* map through signal_invmap when one exists and sig < 32 */
+               && current_thread_info()->exec_domain->signal_invmap
+               && sig < 32
+               ? current_thread_info()->exec_domain->signal_invmap[sig]
+               : sig;
+
+       err |= __put_user(usig, &frame->sig);
+       err |= __put_user(&frame->info, &frame->pinfo); /* pinfo and puc hold pointers to info and uc within the same frame */
+       err |= __put_user(&frame->uc, &frame->puc);
+       err |= copy_siginfo_to_user(&frame->info, info);
+       if (err)
+               goto give_sigsegv;
+
+       /* Create the ucontext.  */
+       err |= __put_user(0, &frame->uc.uc_flags);
+       err |= __put_user(0, &frame->uc.uc_link);
+       err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
+       err |= __put_user(sas_ss_flags(regs->esp),
+                         &frame->uc.uc_stack.ss_flags);
+       err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
+       err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+                               regs, set->sig[0]);
+       err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));        /* the full mask, not just word 0 */
+       if (err)
+               goto give_sigsegv;
+
+       /* Set up to return from userspace.  */
+       restorer = &__kernel_rt_sigreturn;      /* default return stub; overridden when SA_RESTORER is set */
+       if (ka->sa.sa_flags & SA_RESTORER)
+               restorer = ka->sa.sa_restorer;
+       err |= __put_user(restorer, &frame->pretcode);
+        
+       /*
+        * This is movl $,%eax ; int $0x80
+        *
+        * WE DO NOT USE IT ANY MORE! It's only left here for historical
+        * reasons and because gdb uses it as a signature to notice
+        * signal handler stack frames.
+        */
+       err |= __put_user(0xb8, (char __user *)(frame->retcode+0));
+       err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1));
+       err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
+
+       if (err)
+               goto give_sigsegv;
+
+       /* Set up registers for signal handler */
+       regs->esp = (unsigned long) frame;
+       regs->eip = (unsigned long) ka->sa.sa_handler;
+       regs->eax = (unsigned long) usig;       /* eax, edx, ecx = usig, &info, &uc */
+       regs->edx = (unsigned long) &frame->info;
+       regs->ecx = (unsigned long) &frame->uc;
+
+       set_fs(USER_DS);
+       regs->xds = __USER_DS;
+       regs->xes = __USER_DS;
+       regs->xss = __USER_DS;
+       regs->xcs = __USER_CS;
+
+       /*
+        * Clear TF when entering the signal handler, but
+        * notify any tracer that was single-stepping it.
+        * The tracer may want to single-step inside the
+        * handler too.
+        */
+       regs->eflags &= ~TF_MASK;
+       if (test_thread_flag(TIF_SINGLESTEP))
+               ptrace_notify(SIGTRAP);
+
+#if DEBUG_SIG
+       printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
+               current->comm, current->pid, frame, regs->eip, frame->pretcode);
+#endif
+
+       return;
+
+give_sigsegv:  /* the frame could not be written: regs are left untouched */
+       force_sigsegv(sig, current);
+}
+
+/* OK, we're invoking a handler: settle the interrupted syscall's result,
+ * build the user stack frame, then block sa_mask and sig itself unless
+ * SA_NODEFER is set. */
+
+static void
+handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
+             sigset_t *oldset, struct pt_regs * regs)
+{
+       /* Are we from a system call? */
+       if (regs->orig_eax >= 0) {
+               /* If so, check system call restarting.. */
+               switch (regs->eax) {
+                       case -ERESTART_RESTARTBLOCK:
+                       case -ERESTARTNOHAND:
+                               regs->eax = -EINTR;     /* these two never restart once a handler runs */
+                               break;
+
+                       case -ERESTARTSYS:
+                               if (!(ka->sa.sa_flags & SA_RESTART)) {  /* restart only if the handler asked for SA_RESTART */
+                                       regs->eax = -EINTR;
+                                       break;
+                               }
+                       /* fallthrough */
+                       case -ERESTARTNOINTR:
+                               regs->eax = regs->orig_eax;     /* put the original eax back ... */
+                               regs->eip -= 2; /* ... and back eip up by 2 bytes, presumably the length of the syscall instruction - TODO confirm */
+               }
+       }
+
+       /*
+        * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so
+        * that register information in the sigcontext is correct.
+        */
+       if (unlikely(regs->eflags & TF_MASK)
+           && likely(current->ptrace & PT_DTRACE)) {
+               current->ptrace &= ~PT_DTRACE;
+               regs->eflags &= ~TF_MASK;
+       }
+
+       /* Set up the stack frame */
+       if (ka->sa.sa_flags & SA_SIGINFO)       /* SA_SIGINFO handlers get the rt frame */
+               setup_rt_frame(sig, ka, info, oldset, regs);
+       else
+               setup_frame(sig, ka, oldset, regs);
+
+       if (!(ka->sa.sa_flags & SA_NODEFER)) {  /* block sa_mask plus the signal being delivered */
+               spin_lock_irq(&current->sighand->siglock);
+               sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
+               sigaddset(&current->blocked,sig);
+               recalc_sigpending();
+               spin_unlock_irq(&current->sighand->siglock);
+       }
+}
+
+/* Deliver one pending signal, or fix up an interrupted syscall for restart.
+ * Returns 1 if a handler was set up (or regs is not a user-mode frame), else 0.
+ * Note that 'init' is a special process: it doesn't get signals it doesn't
+ * want to handle. Thus you cannot kill init even with a SIGKILL even by
+ * mistake. */
+int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset)
+{
+       siginfo_t info;
+       int signr;
+       struct k_sigaction ka;
+
+       /*
+        * We want the common case to go fast, which
+        * is why we may in certain cases get here from
+        * kernel mode. Just return without doing anything
+        * if so.
+        */
+       if ((regs->xcs & 2) != 2)       /* bit 1 of the saved cs clear: treated as the kernel-mode case */
+               return 1;
+
+       if (current->flags & PF_FREEZE) {
+               refrigerator(0);
+               goto no_signal; /* after refrigerator() only the restart fixups are done */
+       }
+
+       if (!oldset)    /* no saved mask supplied: use the current blocked set */
+               oldset = &current->blocked;
+
+       signr = get_signal_to_deliver(&info, &ka, regs, NULL);
+       if (signr > 0) {
+               /* Reenable any watchpoints before delivering the
+                * signal to user space. The processor register will
+                * have been cleared if the watchpoint triggered
+                * inside the kernel.
+                */
+               if (unlikely(current->thread.debugreg[7])) {
+                       loaddebug(&current->thread, 7);
+               }
+
+               /* Whee!  Actually deliver the signal.  */
+               handle_signal(signr, &info, &ka, oldset, regs);
+               return 1;
+       }
+
+ no_signal:
+       /* Did we come from a system call? */
+       if (regs->orig_eax >= 0) {
+               /* Restart the system call - no handlers present */
+               if (regs->eax == -ERESTARTNOHAND ||
+                   regs->eax == -ERESTARTSYS ||
+                   regs->eax == -ERESTARTNOINTR) {
+                       regs->eax = regs->orig_eax;     /* put the original eax back and re-issue */
+                       regs->eip -= 2;
+               }
+               if (regs->eax == -ERESTART_RESTARTBLOCK){
+                       regs->eax = __NR_restart_syscall;       /* re-enter through restart_syscall instead */
+                       regs->eip -= 2;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Work to do just before resuming user space: re-arm a deferred
+ * single-step and deliver pending signals, as flagged by the caller.
+ */
+__attribute__((regparm(3)))
+void do_notify_resume(struct pt_regs *regs, sigset_t *oldset,
+                     __u32 thread_info_flags)
+{
+       const __u32 step_pending = thread_info_flags & _TIF_SINGLESTEP;
+       const __u32 sig_pending = thread_info_flags & _TIF_SIGPENDING;
+
+       if (step_pending) {
+               clear_thread_flag(TIF_SINGLESTEP);
+               regs->eflags |= TF_MASK;
+       }
+       if (sig_pending)
+               do_signal(regs, oldset);
+       clear_thread_flag(TIF_IRET);
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/syscall_table.S
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/syscall_table.S
@@ -0,0 +1,291 @@
+.data
+ENTRY(sys_call_table)          /* one .long per table slot, in order; the numeric markers give the slot index */
+       .long sys_restart_syscall       /* 0 - old "setup()" system call, used for restarting */
+       .long sys_exit
+       .long sys_fork
+       .long sys_read
+       .long sys_write
+       .long sys_open          /* 5 */
+       .long sys_close
+       .long sys_waitpid
+       .long sys_creat
+       .long sys_link
+       .long sys_unlink        /* 10 */
+       .long sys_execve
+       .long sys_chdir
+       .long sys_time
+       .long sys_mknod
+       .long sys_chmod         /* 15 */
+       .long sys_lchown16
+       .long sys_ni_syscall    /* old break syscall holder */
+       .long sys_stat
+       .long sys_lseek
+       .long sys_getpid        /* 20 */
+       .long sys_mount
+       .long sys_oldumount
+       .long sys_setuid16
+       .long sys_getuid16
+       .long sys_stime         /* 25 */
+       .long sys_ptrace
+       .long sys_alarm
+       .long sys_fstat
+       .long sys_pause
+       .long sys_utime         /* 30 */
+       .long sys_ni_syscall    /* old stty syscall holder */
+       .long sys_ni_syscall    /* old gtty syscall holder */
+       .long sys_access
+       .long sys_nice
+       .long sys_ni_syscall    /* 35 - old ftime syscall holder */
+       .long sys_sync
+       .long sys_kill
+       .long sys_rename
+       .long sys_mkdir
+       .long sys_rmdir         /* 40 */
+       .long sys_dup
+       .long sys_pipe
+       .long sys_times
+       .long sys_ni_syscall    /* old prof syscall holder */
+       .long sys_brk           /* 45 */
+       .long sys_setgid16
+       .long sys_getgid16
+       .long sys_signal
+       .long sys_geteuid16
+       .long sys_getegid16     /* 50 */
+       .long sys_acct
+       .long sys_umount        /* recycled never used phys() */
+       .long sys_ni_syscall    /* old lock syscall holder */
+       .long sys_ioctl
+       .long sys_fcntl         /* 55 */
+       .long sys_ni_syscall    /* old mpx syscall holder */
+       .long sys_setpgid
+       .long sys_ni_syscall    /* old ulimit syscall holder */
+       .long sys_olduname
+       .long sys_umask         /* 60 */
+       .long sys_chroot
+       .long sys_ustat
+       .long sys_dup2
+       .long sys_getppid
+       .long sys_getpgrp       /* 65 */
+       .long sys_setsid
+       .long sys_sigaction
+       .long sys_sgetmask
+       .long sys_ssetmask
+       .long sys_setreuid16    /* 70 */
+       .long sys_setregid16
+       .long sys_sigsuspend
+       .long sys_sigpending
+       .long sys_sethostname
+       .long sys_setrlimit     /* 75 */
+       .long sys_old_getrlimit
+       .long sys_getrusage
+       .long sys_gettimeofday
+       .long sys_settimeofday
+       .long sys_getgroups16   /* 80 */
+       .long sys_setgroups16
+       .long old_select
+       .long sys_symlink
+       .long sys_lstat
+       .long sys_readlink      /* 85 */
+       .long sys_uselib
+       .long sys_swapon
+       .long sys_reboot
+       .long old_readdir
+       .long old_mmap          /* 90 */
+       .long sys_munmap
+       .long sys_truncate
+       .long sys_ftruncate
+       .long sys_fchmod
+       .long sys_fchown16      /* 95 */
+       .long sys_getpriority
+       .long sys_setpriority
+       .long sys_ni_syscall    /* old profil syscall holder */
+       .long sys_statfs
+       .long sys_fstatfs       /* 100 */
+       .long sys_ioperm
+       .long sys_socketcall
+       .long sys_syslog
+       .long sys_setitimer
+       .long sys_getitimer     /* 105 */
+       .long sys_newstat
+       .long sys_newlstat
+       .long sys_newfstat
+       .long sys_uname
+       .long sys_iopl          /* 110 */
+       .long sys_vhangup
+       .long sys_ni_syscall    /* old "idle" system call */
+       .long sys_vm86old
+       .long sys_wait4
+       .long sys_swapoff       /* 115 */
+       .long sys_sysinfo
+       .long sys_ipc
+       .long sys_fsync
+       .long sys_sigreturn
+       .long sys_clone         /* 120 */
+       .long sys_setdomainname
+       .long sys_newuname
+       .long sys_modify_ldt
+       .long sys_adjtimex
+       .long sys_mprotect      /* 125 */
+       .long sys_sigprocmask
+       .long sys_ni_syscall    /* old "create_module" */
+       .long sys_init_module
+       .long sys_delete_module
+       .long sys_ni_syscall    /* 130: old "get_kernel_syms" */
+       .long sys_quotactl
+       .long sys_getpgid
+       .long sys_fchdir
+       .long sys_bdflush
+       .long sys_sysfs         /* 135 */
+       .long sys_personality
+       .long sys_ni_syscall    /* reserved for afs_syscall */
+       .long sys_setfsuid16
+       .long sys_setfsgid16
+       .long sys_llseek        /* 140 */
+       .long sys_getdents
+       .long sys_select
+       .long sys_flock
+       .long sys_msync
+       .long sys_readv         /* 145 */
+       .long sys_writev
+       .long sys_getsid
+       .long sys_fdatasync
+       .long sys_sysctl
+       .long sys_mlock         /* 150 */
+       .long sys_munlock
+       .long sys_mlockall
+       .long sys_munlockall
+       .long sys_sched_setparam
+       .long sys_sched_getparam   /* 155 */
+       .long sys_sched_setscheduler
+       .long sys_sched_getscheduler
+       .long sys_sched_yield
+       .long sys_sched_get_priority_max
+       .long sys_sched_get_priority_min  /* 160 */
+       .long sys_sched_rr_get_interval
+       .long sys_nanosleep
+       .long sys_mremap
+       .long sys_setresuid16
+       .long sys_getresuid16   /* 165 */
+       .long sys_vm86
+       .long sys_ni_syscall    /* Old sys_query_module */
+       .long sys_poll
+       .long sys_nfsservctl
+       .long sys_setresgid16   /* 170 */
+       .long sys_getresgid16
+       .long sys_prctl
+       .long sys_rt_sigreturn
+       .long sys_rt_sigaction
+       .long sys_rt_sigprocmask        /* 175 */
+       .long sys_rt_sigpending
+       .long sys_rt_sigtimedwait
+       .long sys_rt_sigqueueinfo
+       .long sys_rt_sigsuspend
+       .long sys_pread64       /* 180 */
+       .long sys_pwrite64
+       .long sys_chown16
+       .long sys_getcwd
+       .long sys_capget
+       .long sys_capset        /* 185 */
+       .long sys_sigaltstack
+       .long sys_sendfile
+       .long sys_ni_syscall    /* reserved for streams1 */
+       .long sys_ni_syscall    /* reserved for streams2 */
+       .long sys_vfork         /* 190 */
+       .long sys_getrlimit
+       .long sys_mmap2
+       .long sys_truncate64
+       .long sys_ftruncate64
+       .long sys_stat64        /* 195 */
+       .long sys_lstat64
+       .long sys_fstat64
+       .long sys_lchown
+       .long sys_getuid
+       .long sys_getgid        /* 200 */
+       .long sys_geteuid
+       .long sys_getegid
+       .long sys_setreuid
+       .long sys_setregid
+       .long sys_getgroups     /* 205 */
+       .long sys_setgroups
+       .long sys_fchown
+       .long sys_setresuid
+       .long sys_getresuid
+       .long sys_setresgid     /* 210 */
+       .long sys_getresgid
+       .long sys_chown
+       .long sys_setuid
+       .long sys_setgid
+       .long sys_setfsuid      /* 215 */
+       .long sys_setfsgid
+       .long sys_pivot_root
+       .long sys_mincore
+       .long sys_madvise
+       .long sys_getdents64    /* 220 */
+       .long sys_fcntl64
+       .long sys_ni_syscall    /* reserved for TUX */
+       .long sys_ni_syscall
+       .long sys_gettid
+       .long sys_readahead     /* 225 */
+       .long sys_setxattr
+       .long sys_lsetxattr
+       .long sys_fsetxattr
+       .long sys_getxattr
+       .long sys_lgetxattr     /* 230 */
+       .long sys_fgetxattr
+       .long sys_listxattr
+       .long sys_llistxattr
+       .long sys_flistxattr
+       .long sys_removexattr   /* 235 */
+       .long sys_lremovexattr
+       .long sys_fremovexattr
+       .long sys_tkill
+       .long sys_sendfile64
+       .long sys_futex         /* 240 */
+       .long sys_sched_setaffinity
+       .long sys_sched_getaffinity
+       .long sys_set_thread_area
+       .long sys_get_thread_area
+       .long sys_io_setup      /* 245 */
+       .long sys_io_destroy
+       .long sys_io_getevents
+       .long sys_io_submit
+       .long sys_io_cancel
+       .long sys_fadvise64     /* 250 */
+       .long sys_ni_syscall
+       .long sys_exit_group
+       .long sys_lookup_dcookie
+       .long sys_epoll_create
+       .long sys_epoll_ctl     /* 255 */
+       .long sys_epoll_wait
+       .long sys_remap_file_pages
+       .long sys_set_tid_address
+       .long sys_timer_create
+       .long sys_timer_settime         /* 260 */
+       .long sys_timer_gettime
+       .long sys_timer_getoverrun
+       .long sys_timer_delete
+       .long sys_clock_settime
+       .long sys_clock_gettime         /* 265 */
+       .long sys_clock_getres
+       .long sys_clock_nanosleep
+       .long sys_statfs64
+       .long sys_fstatfs64
+       .long sys_tgkill        /* 270 */
+       .long sys_utimes
+       .long sys_fadvise64_64
+       .long sys_ni_syscall    /* sys_vserver */
+       .long sys_mbind
+       .long sys_get_mempolicy /* 275 */
+       .long sys_set_mempolicy
+       .long sys_mq_open
+       .long sys_mq_unlink
+       .long sys_mq_timedsend
+       .long sys_mq_timedreceive       /* 280 */
+       .long sys_mq_notify
+       .long sys_mq_getsetattr
+       .long sys_ni_syscall            /* reserved for kexec */
+       .long sys_waitid
+       .long sys_ni_syscall            /* 285 */ /* available */
+       .long sys_add_key
+       .long sys_request_key
+       .long sys_keyctl
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/time.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/time.c
@@ -0,0 +1,929 @@
+/*
+ *  linux/arch/i386/kernel/time.c
+ *
+ *  Copyright (C) 1991, 1992, 1995  Linus Torvalds
+ *
+ * This file contains the PC-specific time handling details:
+ * reading the RTC at bootup, etc..
+ * 1994-07-02    Alan Modra
+ *     fixed set_rtc_mmss, fixed time.year for >= 2000, new mktime
+ * 1995-03-26    Markus Kuhn
+ *      fixed 500 ms bug at call to set_rtc_mmss, fixed DS12887
+ *      precision CMOS clock update
+ * 1996-05-03    Ingo Molnar
+ *      fixed time warps in do_[slow|fast]_gettimeoffset()
+ * 1997-09-10  Updated NTP code according to technical memorandum Jan '96
+ *             "A Kernel Model for Precision Timekeeping" by Dave Mills
+ * 1998-09-05    (Various)
+ *     More robust do_fast_gettimeoffset() algorithm implemented
+ *     (works with APM, Cyrix 6x86MX and Centaur C6),
+ *     monotonic gettimeofday() with fast_get_timeoffset(),
+ *     drift-proof precision TSC calibration on boot
+ *     (C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx>, Andrew D.
+ *     Balsa <andrebalsa@xxxxxxxxxx>, Philip Gladstone <philip@xxxxxxxxxx>;
+ *     ported from 2.0.35 Jumbo-9 by Michael Krause <m.krause@xxxxxxxxxxxxx>).
+ * 1998-12-16    Andrea Arcangeli
+ *     Fixed Jumbo-9 code in 2.1.131: do_gettimeofday was missing 1 jiffy
+ *     because was not accounting lost_ticks.
+ * 1998-12-24 Copyright (C) 1998  Andrea Arcangeli
+ *     Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
+ *     serialize accesses to xtime/lost_ticks).
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/time.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/sysdev.h>
+#include <linux/bcd.h>
+#include <linux/efi.h>
+#include <linux/mca.h>
+#include <linux/sysctl.h>
+#include <linux/percpu.h>
+
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/irq.h>
+#include <asm/msr.h>
+#include <asm/delay.h>
+#include <asm/mpspec.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/timer.h>
+
+#include "mach_time.h"
+
+#include <linux/timex.h>
+#include <linux/config.h>
+
+#include <asm/hpet.h>
+
+#include <asm/arch_hooks.h>
+
+#include "io_ports.h"
+
+extern spinlock_t i8259A_lock;
+int pit_latch_buggy;              /* extern */
+
+u64 jiffies_64 = INITIAL_JIFFIES;
+
+EXPORT_SYMBOL(jiffies_64);
+
+#if defined(__x86_64__)
+unsigned long vxtime_hz = PIT_TICK_RATE;
+struct vxtime_data __vxtime __section_vxtime;   /* for vsyscalls */
+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
+struct timespec __xtime __section_xtime;
+struct timezone __sys_tz __section_sys_tz;
+#endif
+
+#if defined(__x86_64__)
+unsigned int cpu_khz;  /* Detected as we calibrate the TSC */
+#else
+unsigned long cpu_khz; /* Detected as we calibrate the TSC */
+#endif
+
+extern unsigned long wall_jiffies;
+
+DEFINE_SPINLOCK(rtc_lock);
+
+DEFINE_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
+
+extern struct init_timer_opts timer_tsc_init;
+extern struct timer_opts timer_tsc;
+struct timer_opts *cur_timer = &timer_tsc;
+
+/* These are periodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+       u64 tsc_timestamp;     /* TSC at last update of time vals.  */
+       u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
+       u32 tsc_to_nsec_mul;   /* mul_frac() multiplier: shifted TSC delta -> ns. */
+       u32 tsc_to_usec_mul;   /* tsc_to_nsec_mul / 1000, for usec offsets. */
+       int tsc_shift;         /* Shift applied to a TSC delta before scaling. */
+       u32 version;           /* Xen time version this snapshot was copied at. */
+};
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+static struct timeval shadow_tv;
+
+/* Keep track of last time we did processing/updating of jiffies and xtime. */
+static u64 processed_system_time;   /* System time (ns) at last processing. */
+static DEFINE_PER_CPU(u64, processed_system_time);
+
+#define NS_PER_TICK (1000000000ULL/HZ)
+
+/*
+ * Normalise a struct timeval whose tv_usec has gone negative: borrow whole
+ * seconds from tv_sec until tv_usec is no longer below zero.
+ */
+#define HANDLE_USEC_UNDERFLOW(_tv) do {                \
+       while ((_tv).tv_usec < 0) {             \
+               (_tv).tv_usec += USEC_PER_SEC;  \
+               (_tv).tv_sec--;                 \
+       }                                       \
+} while (0)
+/*
+ * Normalise a struct timeval whose tv_usec has reached or passed one second:
+ * carry whole seconds into tv_sec until tv_usec is below USEC_PER_SEC.
+ */
+#define HANDLE_USEC_OVERFLOW(_tv) do {         \
+       while ((_tv).tv_usec >= USEC_PER_SEC) { \
+               (_tv).tv_usec -= USEC_PER_SEC;  \
+               (_tv).tv_sec++;                 \
+       }                                       \
+} while (0)
+/*
+ * Bring a (seconds, nanoseconds) pair into canonical form in place, so that
+ * 0 <= *nsec < NSEC_PER_SEC, by carrying into or borrowing from *sec.
+ */
+static inline void __normalize_time(time_t *sec, s64 *nsec)
+{
+       time_t whole = *sec;
+       s64 frac = *nsec;
+
+       /* Carry surplus whole seconds upwards. */
+       for (; frac >= NSEC_PER_SEC; frac -= NSEC_PER_SEC)
+               whole++;
+
+       /* Borrow seconds while the nanosecond part is negative. */
+       for (; frac < 0; frac += NSEC_PER_SEC)
+               whole--;
+
+       *sec = whole;
+       *nsec = frac;
+}
+
+/* Does this guest OS track Xen time, or set its wall clock independently? */
+static int independent_wallclock = 0;
+/*
+ * Handler registered below for the "independent_wallclock" boot option:
+ * its mere presence sets the flag; the option string itself is not examined.
+ */
+static int __init __independent_wallclock(char *str)
+{
+       independent_wallclock = 1;
+       return 1;
+}
+__setup("independent_wallclock", __independent_wallclock);
+/*
+ * Non-zero when the flag above is set, or when xen_start_info.flags carries
+ * SIF_INITDOMAIN.
+ */
+#define INDEPENDENT_WALLCLOCK() \
+    (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN))
+
+int tsc_disable __initdata = 0;
+
+/*
+ * Busy-wait until the low word of the TSC has advanced by at least 'loops'
+ * counts from its value on entry.  The counter is always sampled at least
+ * once after a rep_nop() before the elapsed count is tested.
+ */
+static void delay_tsc(unsigned long loops)
+{
+       unsigned long start, cur;
+
+       rdtscl(start);
+       for (;;) {
+               rep_nop();
+               rdtscl(cur);
+               if ((cur - start) >= loops)
+                       break;
+       }
+}
+
+/*
+ * The timer_opts instance that cur_timer points at in this file.  Only the
+ * name and the TSC-based delay hook are filled in; all other members are
+ * left zero-initialised.
+ */
+struct timer_opts timer_tsc = {
+       .name = "tsc",
+       .delay = delay_tsc,
+};
+
+/*
+ * Scale a 64-bit value by a signed power of two and return 32 bits.
+ *   shift < 0:  the full 64-bit value is shifted right by -shift, then
+ *               truncated to 32 bits.
+ *   shift >= 0: the value is truncated to 32 bits first, then shifted left.
+ */
+static inline u32 down_shift(u64 time, int shift)
+{
+       return (shift < 0) ? (u32)(time >> -shift)
+                          : (u32)((u32)time << shift);
+}
+
+/*
+ * 32-bit multiplication of integer multiplicand and fractional multiplier
+ * yielding 32-bit integer product.
+ */
+static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
+{
+       u32 product_int, product_frac;
+       /*
+        * One-operand x86 MUL: the multiplicand is tied to the first output
+        * register ("0" -> "a"), and the result comes back in the "d":"a"
+        * pair.  The low word (product_frac) is captured only to satisfy the
+        * constraint and is discarded; the high word is returned.
+        */
+       __asm__ (
+               "mul %3"
+               : "=a" (product_frac), "=d" (product_int)
+               : "0" (multiplicand), "r" (multiplier) );
+       return product_int;
+}
+
+/*
+ * Derive cpu_khz from the TSC scaling parameters published for VCPU 0 in the
+ * shared info page, and log the result.
+ */
+void init_cpu_khz(void)
+{
+       struct vcpu_time_info *vti = &HYPERVISOR_shared_info->vcpu_time[0];
+       u64 khz = 1000000ULL << 32;
+
+       /* do_div() leaves the quotient in its first argument. */
+       do_div(khz, vti->tsc_to_system_mul);
+
+       /* Undo Xen's TSC pre-shift (note the negated shift count). */
+       cpu_khz = down_shift(khz, -vti->tsc_shift);
+
+       printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n",
+              cpu_khz / 1000, cpu_khz % 1000);
+}
+
+/*
+ * Nanoseconds elapsed since the shadow snapshot was taken: the TSC delta is
+ * shifted by shadow->tsc_shift (and narrowed to 32 bits by down_shift()),
+ * then scaled by shadow->tsc_to_nsec_mul.
+ */
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+       u64 tsc;
+
+       rdtscll(tsc);
+       return mul_frac(down_shift(tsc - shadow->tsc_timestamp,
+                                  shadow->tsc_shift),
+                       shadow->tsc_to_nsec_mul);
+}
+
+/*
+ * Microseconds elapsed since the shadow snapshot was taken; identical to
+ * get_nsec_offset() except that shadow->tsc_to_usec_mul is the multiplier.
+ */
+static unsigned long get_usec_offset(struct shadow_time_info *shadow)
+{
+       u64 tsc;
+
+       rdtscll(tsc);
+       return mul_frac(down_shift(tsc - shadow->tsc_timestamp,
+                                  shadow->tsc_shift),
+                       shadow->tsc_to_usec_mul);
+}
+
+/*
+ * Refresh the cached Xen wall-clock base (shadow_tv) and, unless this guest
+ * keeps an independent wall clock or the clock is flagged STA_UNSYNC,
+ * rebuild xtime and wall_to_monotonic from it.
+ */
+static void update_wallclock(void)
+{
+       shared_info_t *s = HYPERVISOR_shared_info;
+       long wtm_nsec, xtime_nsec;
+       time_t wtm_sec, xtime_sec;
+       u64 tmp, usec;
+
+       /* The cached copy is refreshed unconditionally. */
+       shadow_tv.tv_sec  = s->wc_sec;
+       shadow_tv.tv_usec = s->wc_usec;
+
+       if (INDEPENDENT_WALLCLOCK())
+               return;
+
+       if ((time_status & STA_UNSYNC) != 0)
+               return;
+
+       /* Adjust wall-clock time base based on wall_jiffies ticks. */
+       usec = processed_system_time;
+       do_div(usec, 1000);                     /* ns -> us, in place */
+       usec += (u64)shadow_tv.tv_sec * 1000000ULL;
+       usec += (u64)shadow_tv.tv_usec;
+       usec -= (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ);
+
+       /* Split wallclock base into seconds and nanoseconds. */
+       tmp = usec;
+       /* do_div() returns the remainder (us) and leaves the quotient in tmp. */
+       xtime_nsec = do_div(tmp, 1000000) * 1000ULL;
+       xtime_sec  = (time_t)tmp;
+
+       /* Shift wall_to_monotonic by the amount xtime is about to move. */
+       wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
+       wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
+
+       set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
+       set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area. Must be called with the xtime_lock held for writing.
+ */
+static void __get_time_values_from_xen(void)
+{
+       shared_info_t           *s = HYPERVISOR_shared_info;
+       struct vcpu_time_info   *src;
+       struct shadow_time_info *dst;
+
+       /* Source and destination are both those of the executing CPU. */
+       src = &s->vcpu_time[smp_processor_id()];
+       dst = &per_cpu(shadow_time, smp_processor_id());
+
+       /*
+        * Copy the fields between a read of time_version2 and a read of
+        * time_version1, repeating until the two agree.
+        * NOTE(review): presumably Xen updates the two version fields on
+        * either side of a time update -- confirm against the Xen interface.
+        */
+       do {
+               dst->version = src->time_version2;
+               rmb();
+               dst->tsc_timestamp     = src->tsc_timestamp;
+               dst->system_timestamp  = src->system_time;
+               dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
+               dst->tsc_shift         = src->tsc_shift;
+               rmb();
+       }
+       while (dst->version != src->time_version1);
+
+       /* Derived locally rather than supplied by Xen. */
+       dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
+
+       /* Pick up a changed wall-clock base, if any. */
+       if ((shadow_tv.tv_sec != s->wc_sec) ||
+           (shadow_tv.tv_usec != s->wc_usec))
+               update_wallclock();
+}
+
+/*
+ * Report whether the shadow time snapshot of @cpu still matches the version
+ * Xen currently publishes for that VCPU (time_version2).
+ *
+ * The @cpu argument is honoured rather than re-deriving the CPU with
+ * smp_processor_id(): every caller in this file passes the CPU whose shadow
+ * area it is actually working on, so the check and the data it guards stay
+ * tied to the same index.
+ *
+ * Returns non-zero when the snapshot is current, zero when it is stale.
+ */
+static inline int time_values_up_to_date(int cpu)
+{
+       struct vcpu_time_info   *src;
+       struct shadow_time_info *dst;
+
+       src = &HYPERVISOR_shared_info->vcpu_time[cpu];
+       dst = &per_cpu(shadow_time, cpu);
+
+       return (dst->version == src->time_version2);
+}
+
+/*
+ * NOTE(review): this macro names 'shadow_time_version' and a 'time_version2'
+ * member directly on the shared info page; neither matches the per-CPU
+ * shadow_time / vcpu_time[] layout used elsewhere in this file, and nothing
+ * in this file expands it.  It looks like a leftover superseded by
+ * time_values_up_to_date() -- confirm before using or removing.
+ */
+#define TIME_VALUES_UP_TO_DATE \
+ ({ rmb(); (shadow_time_version == HYPERVISOR_shared_info->time_version2); })
+
+/*
+ * This is a special lock that is owned by the CPU and holds the index
+ * register we are working with.  It is required for NMI access to the
+ * CMOS/RTC registers.  See include/asm-i386/mc146818rtc.h for details.
+ */
+volatile unsigned long cmos_lock = 0;
+EXPORT_SYMBOL(cmos_lock);
+
+/* Routines for accessing the CMOS RAM/RTC. */
+/*
+ * Read one byte from CMOS/RTC register @addr: the index is written to
+ * RTC_PORT(0) and the value read back from RTC_PORT(1), bracketed by the
+ * lock_cmos_prefix()/lock_cmos_suffix() pair.
+ */
+unsigned char rtc_cmos_read(unsigned char addr)
+{
+       unsigned char val;
+       lock_cmos_prefix(addr);
+       outb_p(addr, RTC_PORT(0));      /* select the register */
+       val = inb_p(RTC_PORT(1));       /* fetch its contents  */
+       lock_cmos_suffix(addr);
+       return val;
+}
+EXPORT_SYMBOL(rtc_cmos_read);
+
+/*
+ * Write @val to CMOS/RTC register @addr: the index is written to
+ * RTC_PORT(0) and the value to RTC_PORT(1), bracketed by the
+ * lock_cmos_prefix()/lock_cmos_suffix() pair.
+ */
+void rtc_cmos_write(unsigned char val, unsigned char addr)
+{
+       lock_cmos_prefix(addr);
+       outb_p(addr, RTC_PORT(0));      /* select the register */
+       outb_p(val, RTC_PORT(1));       /* store the new value */
+       lock_cmos_suffix(addr);
+}
+EXPORT_SYMBOL(rtc_cmos_write);
+
+/*
+ * This version of gettimeofday has microsecond resolution
+ * and better than microsecond precision on fast x86 machines with TSC.
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+       unsigned long seq;
+       unsigned long usec, sec;
+       unsigned long max_ntp_tick;
+       unsigned long flags;
+       s64 nsec;
+       unsigned int cpu;
+       struct shadow_time_info *shadow;
+
+       /* Stay on this CPU while its shadow time area is in use. */
+       cpu = get_cpu();
+       shadow = &per_cpu(shadow_time, cpu);
+
+       do {
+               unsigned long lost;
+
+               seq = read_seqbegin(&xtime_lock);
+
+               /* Time since the shadow snapshot, plus not-yet-folded ticks. */
+               usec = get_usec_offset(shadow);
+               lost = jiffies - wall_jiffies;
+
+               /*
+                * If time_adjust is negative then NTP is slowing the clock
+                * so make sure not to go into next possible interval.
+                * Better to lose some accuracy than have time go backwards..
+                */
+               if (unlikely(time_adjust < 0)) {
+                       max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
+                       usec = min(usec, max_ntp_tick);
+
+                       if (lost)
+                               usec += lost * max_ntp_tick;
+               }
+               else if (unlikely(lost))
+                       usec += lost * (USEC_PER_SEC / HZ);
+
+               sec = xtime.tv_sec;
+               usec += (xtime.tv_nsec / NSEC_PER_USEC);
+
+               /*
+                * Add the gap between the shadow snapshot and the system
+                * time already accounted for by the timer interrupt.
+                */
+               nsec = shadow->system_timestamp - processed_system_time;
+               __normalize_time(&sec, &nsec);
+               usec += (long)nsec / NSEC_PER_USEC;
+
+               if (unlikely(!time_values_up_to_date(cpu))) {
+                       /*
+                        * We may have blocked for a long time,
+                        * rendering our calculations invalid
+                        * (e.g. the time delta may have
+                        * overflowed). Detect that and recalculate
+                        * with fresh values.
+                        */
+                       write_seqlock_irqsave(&xtime_lock, flags);
+                       __get_time_values_from_xen();
+                       write_sequnlock_irqrestore(&xtime_lock, flags);
+                       /*
+                        * 'continue' lands on the loop condition below;
+                        * presumably the write section just taken makes
+                        * read_seqretry() request another pass -- confirm.
+                        */
+                       continue;
+               }
+       } while (read_seqretry(&xtime_lock, seq));
+
+       put_cpu();
+
+       /* Fold whole seconds out of the accumulated microseconds. */
+       while (usec >= USEC_PER_SEC) {
+               usec -= USEC_PER_SEC;
+               sec++;
+       }
+
+       tv->tv_sec = sec;
+       tv->tv_usec = usec;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+
+/*
+ * Set the wall-clock time to *tv.
+ *
+ * Returns -EINVAL when tv->tv_nsec is outside [0, NSEC_PER_SEC), otherwise 0.
+ * When this guest is not running with an independent wall clock the request
+ * is ignored and 0 is still returned.
+ */
+int do_settimeofday(struct timespec *tv)
+{
+       time_t wtm_sec, sec = tv->tv_sec;
+       long wtm_nsec;
+       s64 nsec;
+       struct timespec xentime;
+       unsigned int cpu;
+       struct shadow_time_info *shadow;
+
+       /* The unsigned cast also rejects negative tv_nsec values. */
+       if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+               return -EINVAL;
+
+       if (!INDEPENDENT_WALLCLOCK())
+               return 0; /* Silent failure? */
+
+       cpu = get_cpu();
+       shadow = &per_cpu(shadow_time, cpu);
+
+       write_seqlock_irq(&xtime_lock);
+
+       /*
+        * Ensure we don't get blocked for a long time so that our time delta
+        * overflows. If that were to happen then our shadow time values would
+        * be stale, so we can retry with fresh ones.
+        */
+ again:
+       nsec = (s64)tv->tv_nsec - (s64)get_nsec_offset(shadow);
+       if (unlikely(!time_values_up_to_date(cpu))) {
+               __get_time_values_from_xen();
+               goto again;
+       }
+
+       /* xentime: requested time minus the offset since the shadow snapshot. */
+       __normalize_time(&sec, &nsec);
+       set_normalized_timespec(&xentime, sec, nsec);
+
+       /*
+        * This is revolting. We need to set "xtime" correctly. However, the
+        * value in this location is the value at the most recent update of
+        * wall time.  Discover what correction gettimeofday() would have
+        * made, and then undo it!
+        */
+       nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
+
+       nsec -= (shadow->system_timestamp - processed_system_time);
+
+       /* Shift wall_to_monotonic by the amount xtime is about to move. */
+       __normalize_time(&sec, &nsec);
+       wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
+       wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
+
+       set_normalized_timespec(&xtime, sec, nsec);
+       set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+
+       time_adjust = 0;                /* stop active adjtime() */
+       time_status |= STA_UNSYNC;
+       time_maxerror = NTP_PHASE_LIMIT;
+       time_esterror = NTP_PHASE_LIMIT;
+
+       /*
+        * Both branches release xtime_lock; the initial domain additionally
+        * hands the new time to Xen, after the lock has been dropped.
+        */
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       if (xen_start_info.flags & SIF_INITDOMAIN) {
+               dom0_op_t op;
+               op.cmd = DOM0_SETTIME;
+               op.u.settime.secs        = xentime.tv_sec;
+               op.u.settime.usecs       = xentime.tv_nsec / NSEC_PER_USEC;
+               op.u.settime.system_time = shadow->system_timestamp;
+               write_sequnlock_irq(&xtime_lock);
+               HYPERVISOR_dom0_op(&op);
+       } else
+#endif
+               write_sequnlock_irq(&xtime_lock);
+
+       put_cpu();
+
+       clock_was_set();
+       return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+/*
+ * Push @nowtime to the hardware clock under rtc_lock, via EFI when
+ * efi_enabled is set and via the machine-specific hook otherwise.
+ * Returns whatever the chosen backend returns.
+ */
+static int set_rtc_mmss(unsigned long nowtime)
+{
+       int retval;
+
+       /* spin_unlock_irq() below re-enables interrupts unconditionally. */
+       WARN_ON(irqs_disabled());
+
+       /* gets recalled with irq locally disabled */
+       spin_lock_irq(&rtc_lock);
+       if (efi_enabled)
+               retval = efi_set_rtc_mmss(nowtime);
+       else
+               retval = mach_set_rtc_mmss(nowtime);
+       spin_unlock_irq(&rtc_lock);
+
+       return retval;
+}
+#else
+/* Without CONFIG_XEN_PRIVILEGED_GUEST this is a stub that reports success. */
+static int set_rtc_mmss(unsigned long nowtime)
+{
+       return 0;
+}
+#endif
+
+/* monotonic_clock(): returns # of nanoseconds passed since time_init()
+ *             Note: This function is required to return accurate
+ *             time even in the absence of multiple timer ticks.
+ */
+/*
+ * Returns the shadow system timestamp of the current CPU plus the TSC-derived
+ * nanosecond offset since that snapshot was taken.
+ *
+ * The base timestamp and the offset are sampled together, inside the retry
+ * loop and before put_cpu().  Reading shadow->system_timestamp only after the
+ * loop (and after preemption was re-enabled) would allow the per-CPU snapshot
+ * to be refreshed in between, pairing a new base with an offset computed
+ * against the old one.
+ */
+unsigned long long monotonic_clock(void)
+{
+       int cpu = get_cpu();
+       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+       u64 time;
+       unsigned long flags;
+
+       for ( ; ; ) {
+               time = shadow->system_timestamp + get_nsec_offset(shadow);
+               if (time_values_up_to_date(cpu))
+                       break;
+               /* Snapshot is stale: refresh it and sample again. */
+               write_seqlock_irqsave(&xtime_lock, flags);
+               __get_time_values_from_xen();
+               write_sequnlock_irqrestore(&xtime_lock, flags);
+       }
+
+       put_cpu();
+
+       return time;
+}
+EXPORT_SYMBOL(monotonic_clock);
+
+/* Scheduler clock: simply forwards to monotonic_clock(). */
+unsigned long long sched_clock(void)
+{
+       return monotonic_clock();
+}
+
+#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
+/*
+ * Program counter to charge a profiling hit to.  When the interrupted PC is
+ * inside one of the lock functions, the word stored at ebp + 4 is reported
+ * instead of the PC itself.
+ */
+unsigned long profile_pc(struct pt_regs *regs)
+{
+       unsigned long pc = instruction_pointer(regs);
+
+       if (!in_lock_functions(pc))
+               return pc;
+
+       return *(unsigned long *)(regs->ebp + 4);
+}
+EXPORT_SYMBOL(profile_pc);
+#endif
+
+/*
+ * timer_interrupt() needs to keep up the real-time clock,
+ * as well as call the "do_timer()" routine every clocktick
+ */
+static inline void do_timer_interrupt(int irq, void *dev_id,
+                                       struct pt_regs *regs)
+{
+       s64 delta, delta_cpu;
+       int cpu = smp_processor_id();
+       struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+
+       /*
+        * Compute how far current system time is ahead of what has been
+        * processed so far, system-wide (delta) and for this CPU
+        * (delta_cpu).  Repeat if Xen changed the time values meanwhile.
+        */
+       do {
+               __get_time_values_from_xen();
+
+               delta = delta_cpu = 
+                       shadow->system_timestamp + get_nsec_offset(shadow);
+               delta     -= processed_system_time;
+               delta_cpu -= per_cpu(processed_system_time, cpu);
+       }
+       while (!time_values_up_to_date(cpu));
+
+       /* Negative progress: log diagnostics and process no ticks. */
+       if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) {
+               printk("Timer ISR/%d: Time went backwards: "
+                      "delta=%lld cpu_delta=%lld shadow=%lld "
+                      "off=%lld processed=%lld cpu_processed=%lld\n",
+                      cpu, delta, delta_cpu, shadow->system_timestamp,
+                      (s64)get_nsec_offset(shadow),
+                      processed_system_time,
+                      per_cpu(processed_system_time, cpu));
+               /* 'cpu' is reused as the loop index; we return right after. */
+               for (cpu = 0; cpu < num_online_cpus(); cpu++)
+                       printk(" %d: %lld\n", cpu,
+                              per_cpu(processed_system_time, cpu));
+               return;
+       }
+
+       /* System-wide jiffy work. */
+       while (delta >= NS_PER_TICK) {
+               delta -= NS_PER_TICK;
+               processed_system_time += NS_PER_TICK;
+               do_timer(regs);
+       }
+
+       /* Local CPU jiffy work. */
+       while (delta_cpu >= NS_PER_TICK) {
+               delta_cpu -= NS_PER_TICK;
+               per_cpu(processed_system_time, cpu) += NS_PER_TICK;
+               update_process_times(user_mode(regs));
+               profile_tick(CPU_PROFILING, regs);
+       }
+}
+
+/*
+ * This is the same as the above, except we _also_ save the current
+ * Time Stamp Counter value at the time of the timer interrupt, so that
+ * we later on can estimate the time of day more exactly.
+ */
+irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+       /*
+        * Here we are in the timer irq handler. We just have irqs locally
+        * disabled but we don't know if the timer_bh is running on the other
+        * CPU. We need to avoid an SMP race with it. NOTE: we don't need
+        * the irq version of write_lock because, as just said, we have irqs
+        * locally disabled. -arca
+        */
+       write_seqlock(&xtime_lock);
+       /* dev_id is not forwarded; NULL is passed in its place. */
+       do_timer_interrupt(irq, NULL, regs);
+       write_sequnlock(&xtime_lock);
+       return IRQ_HANDLED;
+}
+
+/* not static: needed by APM */
+/*
+ * Read the current time from the hardware clock while holding rtc_lock,
+ * via EFI when efi_enabled is set and via the machine-specific hook
+ * otherwise.
+ */
+unsigned long get_cmos_time(void)
+{
+       unsigned long seconds;
+
+       spin_lock(&rtc_lock);
+       seconds = efi_enabled ? efi_get_time() : mach_get_cmos_time();
+       spin_unlock(&rtc_lock);
+
+       return seconds;
+}
+static void sync_cmos_clock(unsigned long dummy);
+
+static struct timer_list sync_cmos_timer =
+                                      TIMER_INITIALIZER(sync_cmos_clock, 0, 0);
+
+/*
+ * Timer callback that keeps the CMOS clock in step with an externally
+ * synchronised system clock, then re-arms sync_cmos_timer.  @dummy is unused.
+ */
+static void sync_cmos_clock(unsigned long dummy)
+{
+       struct timeval now, next;
+       int fail = 1;
+
+       /*
+        * Without external synchronisation there is nothing to copy to the
+        * CMOS clock: leave without re-arming (a timer that is already
+        * pending is simply allowed to run out).
+        */
+       if (time_status & STA_UNSYNC)
+               return;
+
+       /*
+        * set_rtc_mmss() has to be called as close as possible to 500 ms
+        * before the new second starts, so only attempt the write when we
+        * are within half a tick of that window.
+        */
+       do_gettimeofday(&now);
+       if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
+           now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
+               fail = set_rtc_mmss(now.tv_sec);
+
+       /*
+        * Aim the next run at the USEC_AFTER mark: roughly eleven minutes
+        * away after a successful write, at the very next opportunity
+        * otherwise.  This callback runs off a timer that may not expire
+        * exactly on time, hence the re-aiming on every pass.
+        */
+       next.tv_sec  = fail ? 0 : 659;
+       next.tv_usec = USEC_AFTER - now.tv_usec;
+       if (next.tv_usec <= 0)
+               next.tv_usec += USEC_PER_SEC;
+       if (next.tv_usec >= USEC_PER_SEC) {
+               next.tv_sec++;
+               next.tv_usec -= USEC_PER_SEC;
+       }
+
+       mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
+}
+
+/* Re-arm sync_cmos_timer so that it expires on the next jiffy. */
+void notify_arch_cmos_timer(void)
+{
+       mod_timer(&sync_cmos_timer, jiffies + 1);
+}
+
+static long clock_cmos_diff, sleep_start;
+
+/*
+ * sysdev suspend hook: remember the offset between system time and the CMOS
+ * clock (clock_cmos_diff) and the CMOS time at which sleep began
+ * (sleep_start), for use by timer_resume().  Always returns 0.
+ */
+static int timer_suspend(struct sys_device *dev, pm_message_t state)
+{
+       /*
+        * Estimate time zone so that set_time can update the clock
+        */
+       clock_cmos_diff = -get_cmos_time();
+       clock_cmos_diff += get_seconds();       /* = system - CMOS */
+       sleep_start = get_cmos_time();
+       return 0;
+}
+
+/*
+ * sysdev resume hook: restore xtime from the CMOS clock plus the offset saved
+ * by timer_suspend(), and advance jiffies/wall_jiffies by the time spent
+ * asleep.  Always returns 0.
+ */
+static int timer_resume(struct sys_device *dev)
+{
+       unsigned long flags;
+       unsigned long sec;
+       unsigned long sleep_length;
+
+#ifdef CONFIG_HPET_TIMER
+       if (is_hpet_enabled())
+               hpet_reenable();
+#endif
+       sec = get_cmos_time() + clock_cmos_diff;
+       /* Sleep duration in jiffies (CMOS seconds elapsed times HZ). */
+       sleep_length = (get_cmos_time() - sleep_start) * HZ;
+       write_seqlock_irqsave(&xtime_lock, flags);
+       xtime.tv_sec = sec;
+       xtime.tv_nsec = 0;
+       write_sequnlock_irqrestore(&xtime_lock, flags);
+       /* Note: the jiffies counters are bumped after xtime_lock is released. */
+       jiffies += sleep_length;
+       wall_jiffies += sleep_length;
+       return 0;
+}
+
+static struct sysdev_class timer_sysclass = {
+       .resume = timer_resume,
+       .suspend = timer_suspend,
+       set_kset_name("timer"),
+};
+
+
+/* XXX this driverfs stuff should probably go elsewhere later -john */
+static struct sys_device device_timer = {
+       .id     = 0,
+       .cls    = &timer_sysclass,
+};
+
+/*
+ * Register the timer sysdev class and then its single device.  Returns the
+ * first error encountered, or the result of the device registration.
+ */
+static int time_init_device(void)
+{
+       int error;
+
+       error = sysdev_class_register(&timer_sysclass);
+       if (error)
+               return error;
+
+       return sysdev_register(&device_timer);
+}
+
+device_initcall(time_init_device);
+
+#ifdef CONFIG_HPET_TIMER
+extern void (*late_time_init)(void);
+/* Duplicate of time_init() below, with hpet_enable part added */
+/*
+ * NOTE(review): unlike time_init() in this file, this path seeds xtime from
+ * the CMOS clock and calls select_timer()/time_init_hook() instead of using
+ * the Xen time values -- confirm this is intended for the Xen subarch.
+ */
+static void __init hpet_time_init(void)
+{
+       /* Seed wall time from CMOS; wall_to_monotonic is its negation. */
+       xtime.tv_sec = get_cmos_time();
+       xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+       set_normalized_timespec(&wall_to_monotonic,
+               -xtime.tv_sec, -xtime.tv_nsec);
+
+       if ((hpet_enable() >= 0) && hpet_use_timer) {
+               printk("Using HPET for base-timer\n");
+       }
+
+       cur_timer = select_timer();
+       printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+
+       time_init_hook();
+}
+#endif
+
+/* Dynamically-mapped IRQ. */
+static DEFINE_PER_CPU(int, timer_irq);
+
+static struct irqaction irq_timer = {
+       timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer0",
+       NULL, NULL
+};
+
+/*
+ * Boot-time clock setup: take the initial time values from Xen, seed xtime
+ * and wall_to_monotonic from them, record the starting system time, and bind
+ * the timer VIRQ for CPU 0.  With CONFIG_HPET_TIMER and an HPET-capable
+ * machine, all of this is skipped in favour of hpet_time_init() run later.
+ */
+void __init time_init(void)
+{
+#ifdef CONFIG_HPET_TIMER
+       if (is_hpet_capable()) {
+               /*
+                * HPET initialization needs to do memory-mapped io. So, let
+                * us do a late initialization after mem_init().
+                */
+               late_time_init = hpet_time_init;
+               return;
+       }
+#endif
+       /* Fills the shadow time area and, via update_wallclock(), shadow_tv. */
+       __get_time_values_from_xen();
+       xtime.tv_sec = shadow_tv.tv_sec;
+       xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
+       set_normalized_timespec(&wall_to_monotonic,
+               -xtime.tv_sec, -xtime.tv_nsec);
+       /* Tick accounting starts from the current Xen system time. */
+       processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+       per_cpu(processed_system_time, 0) = processed_system_time;
+
+       init_cpu_khz();
+
+#if defined(__x86_64__)
+       vxtime.mode = VXTIME_TSC;
+       vxtime.quot = (1000000L << 32) / vxtime_hz;
+       vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+       vxtime.hz = vxtime_hz;
+       sync_core();
+       rdtscll(vxtime.last_tsc);
+#endif
+
+       /* The setup_irq() result is deliberately discarded. */
+       per_cpu(timer_irq, 0) = bind_virq_to_irq(VIRQ_TIMER);
+       (void)setup_irq(per_cpu(timer_irq, 0), &irq_timer);
+}
+
+/* Convert jiffies to system time. */
+static inline u64 jiffies_to_st(unsigned long j) 
+{
+       unsigned long seq;
+       long delta;
+       u64 st;
+
+       /*
+        * Read jiffies and processed_system_time as a consistent pair under
+        * the xtime seqlock; the result is processed_system_time plus the
+        * distance to jiffy 'j', in nanoseconds.
+        */
+       do {
+               seq = read_seqbegin(&xtime_lock);
+               delta = j - jiffies;
+               /* NB. The next check can trigger in some wrap-around cases,
+                * but that's ok: we'll just end up with a shorter timeout. */
+               if (delta < 1)
+                       delta = 1;
+               st = processed_system_time + (delta * NS_PER_TICK);
+       } while (read_seqretry(&xtime_lock, seq));
+
+       return st;
+}
+
+/*
+ * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
+ * These functions are based on implementations from arch/s390/kernel/time.c
+ */
+/*
+ * Mark this CPU in nohz_cpu_mask (unless RCU or softirq work is pending) and
+ * program the next Xen timer event for the chosen wake-up jiffy.
+ */
+void stop_hz_timer(void)
+{
+       unsigned int cpu = smp_processor_id();
+       unsigned long j;
+
+       /* s390 does this /before/ checking rcu_pending(). We copy them. */
+       cpu_set(cpu, nohz_cpu_mask);
+
+       /* Leave ourselves in 'tick mode' if rcu or softirq pending. */
+       if (rcu_pending(cpu) || local_softirq_pending()) {
+               cpu_clear(cpu, nohz_cpu_mask);
+               j = jiffies + 1;
+       } else {
+               j = next_timer_interrupt();
+       }
+
+       /*
+        * The hypercall is issued inside BUG_ON(): it relies on BUG_ON()
+        * evaluating its argument, and any non-zero result is fatal.
+        */
+       BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0);
+}
+
+/* Counterpart of stop_hz_timer(): clear this CPU's bit in nohz_cpu_mask. */
+void start_hz_timer(void)
+{
+       cpu_clear(smp_processor_id(), nohz_cpu_mask);
+}
+
+/* Suspend-side counterpart of time_resume(); intentionally empty. */
+void time_suspend(void)
+{
+       /* nothing */
+}
+
+/* No locking required. We are only CPU running, and interrupts are off. */
+void time_resume(void)
+{
+       /* Recompute cpu_khz from the shared info page. */
+       init_cpu_khz();
+
+       /* Get timebases for new environment. */
+       __get_time_values_from_xen();
+
+       /* Reset our own concept of passage of system time. */
+       processed_system_time =
+               per_cpu(shadow_time, smp_processor_id()).system_timestamp;
+       /*
+        * NOTE(review): this indexes CPU 0 while the line above uses
+        * smp_processor_id(); presumably the same CPU here -- confirm.
+        */
+       per_cpu(processed_system_time, 0) = processed_system_time;
+}
+
+#ifdef CONFIG_SMP
+/* Per-CPU IRQ names ("timerN"); request_irq() is handed a pointer into this. */
+static char timer_name[NR_CPUS][15];
+
+/*
+ * Per-CPU timer setup: start this CPU's tick accounting from its shadow
+ * system timestamp, then bind the timer VIRQ and install timer_interrupt()
+ * on it.  Failure to install the handler is fatal.
+ */
+void local_setup_timer(void)
+{
+       int seq, cpu = smp_processor_id();
+
+       do {
+               seq = read_seqbegin(&xtime_lock);
+               per_cpu(processed_system_time, cpu) =
+                       per_cpu(shadow_time, cpu).system_timestamp;
+       } while (read_seqretry(&xtime_lock, seq));
+
+       per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER);
+       /* Bounded formatting: never write past this CPU's name slot. */
+       snprintf(timer_name[cpu], sizeof(timer_name[cpu]), "timer%d", cpu);
+       BUG_ON(request_irq(per_cpu(timer_irq, cpu), timer_interrupt,
+                          SA_INTERRUPT, timer_name[cpu], NULL));
+}
+#endif
+
+/*
+ * /proc/sys/xen: This really belongs in another file. It can stay here for
+ * now however.
+ */
+/*
+ * Entry "independent_wallclock" (mode 0644): an int handled by
+ * proc_dointvec, backed directly by the independent_wallclock flag.
+ */
+static ctl_table xen_subtable[] = {
+       {1, "independent_wallclock", &independent_wallclock,
+        sizeof(independent_wallclock), 0644, NULL, proc_dointvec},
+       {0}
+};
+/*
+ * Directory "xen" (mode 0555) holding the table above.
+ * NOTE(review): 123 is a bare sysctl id with no named constant -- confirm it
+ * does not collide with another top-level sysctl.
+ */
+static ctl_table xen_table[] = {
+       {123, "xen", NULL, 0, 0555, xen_subtable},
+       {0}
+};
+/* Register the tables at init time; the registration result is discarded. */
+static int __init xen_sysctl_init(void)
+{
+       (void)register_sysctl_table(xen_table, 0);
+       return 0;
+}
+__initcall(xen_sysctl_init);
+
+/*
+ * Local variables:
+ *  c-file-style: "linux"
+ *  indent-tabs-mode: t
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ *  tab-width: 8
+ * End:
+ */
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/traps.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/traps.c
@@ -0,0 +1,1026 @@
+/*
+ *  linux/arch/i386/traps.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Pentium III FXSR, SSE support
+ *     Gareth Hughes <gareth@xxxxxxxxxxx>, May 2000
+ */
+
+/*
+ * 'Traps.c' handles hardware traps and faults after we have saved some
+ * state in 'asm.s'.
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/highmem.h>
+#include <linux/kallsyms.h>
+#include <linux/ptrace.h>
+#include <linux/utsname.h>
+#include <linux/kprobes.h>
+
+#ifdef CONFIG_EISA
+#include <linux/ioport.h>
+#include <linux/eisa.h>
+#endif
+
+#ifdef CONFIG_MCA
+#include <linux/mca.h>
+#endif
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/atomic.h>
+#include <asm/debugreg.h>
+#include <asm/desc.h>
+#include <asm/i387.h>
+#include <asm/nmi.h>
+
+#include <asm/smp.h>
+#include <asm/arch_hooks.h>
+#include <asm/kdebug.h>
+
+#include <linux/irq.h>
+#include <linux/module.h>
+
+#include "mach_traps.h"
+
+asmlinkage int system_call(void);
+
+/* Do we ignore FPU interrupts ? */
+char ignore_fpu_irq = 0;
+
+/*
+ * The IDT has to be page-aligned to simplify the Pentium
+ * F0 0F bug workaround.. We have a special link segment
+ * for this.
+ */
+struct desc_struct idt_table[256] __attribute__((__section__(".data.idt"))) = { {0, 0}, };
+
+asmlinkage void divide_error(void);
+asmlinkage void debug(void);
+asmlinkage void nmi(void);
+asmlinkage void int3(void);
+asmlinkage void overflow(void);
+asmlinkage void bounds(void);
+asmlinkage void invalid_op(void);
+asmlinkage void device_not_available(void);
+asmlinkage void coprocessor_segment_overrun(void);
+asmlinkage void invalid_TSS(void);
+asmlinkage void segment_not_present(void);
+asmlinkage void stack_segment(void);
+asmlinkage void general_protection(void);
+asmlinkage void page_fault(void);
+asmlinkage void coprocessor_error(void);
+asmlinkage void simd_coprocessor_error(void);
+asmlinkage void alignment_check(void);
+asmlinkage void fixup_4gb_segment(void);
+asmlinkage void machine_check(void);
+
+static int kstack_depth_to_print = 24;
+struct notifier_block *i386die_chain;
+static DEFINE_SPINLOCK(die_notifier_lock);
+
+int register_die_notifier(struct notifier_block *nb)
+{
+       unsigned long irqflags;
+       int ret;
+       spin_lock_irqsave(&die_notifier_lock, irqflags);        /* guards i386die_chain */
+       ret = notifier_chain_register(&i386die_chain, nb);
+       spin_unlock_irqrestore(&die_notifier_lock, irqflags);
+       return ret;     /* result of notifier_chain_register(), passed through */
+}
+
+static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
+{
+       void *base = (void *)tinfo;     /* lower bound; p must lie strictly above it */
+       return !(p <= base || p >= base + THREAD_SIZE - 3);
+}
+
+static inline unsigned long print_context_stack(struct thread_info *tinfo,
+                               unsigned long *stack, unsigned long ebp)
+{
+       unsigned long ret_addr;
+#ifdef CONFIG_FRAME_POINTER
+       /* Follow the saved-ebp chain; each return address is read from ebp + 4. */
+       for (; valid_stack_ptr(tinfo, (void *)ebp); ebp = *(unsigned long *)ebp) {
+               ret_addr = *(unsigned long *)(ebp + 4);
+               printk(" [<%08lx>] ", ret_addr);
+               print_symbol("%s", ret_addr);
+               printk("\n");
+       }
+#else
+       /* No frame pointers: print every stack word that is a kernel text address. */
+       while (valid_stack_ptr(tinfo, stack)) {
+               ret_addr = *stack++;
+               if (!__kernel_text_address(ret_addr))
+                       continue;
+               printk(" [<%08lx>]", ret_addr);
+               print_symbol(" %s", ret_addr);
+               printk("\n");
+       }
+#endif
+       return ebp;
+}
+
+void show_trace(struct task_struct *task, unsigned long * stack)
+{
+       unsigned long ebp;
+
+       if (!task)
+               task = current; /* a NULL task selects the calling task */
+
+       if (task == current) {
+               /* Grab ebp right from our regs */
+               asm ("movl %%ebp, %0" : "=r" (ebp) : );
+       } else {
+               /* ebp is the last reg pushed by switch_to */
+               ebp = *(unsigned long *) task->thread.esp;
+       }
+
+       while (1) {     /* one pass per stack, chained through previous_esp */
+               struct thread_info *context;
+               context = (struct thread_info *)
+                       ((unsigned long)stack & (~(THREAD_SIZE - 1))); /* round down to a THREAD_SIZE boundary */
+               ebp = print_context_stack(context, stack, ebp);
+               stack = (unsigned long*)context->previous_esp;
+               if (!stack)
+                       break;  /* no previous stack recorded: trace is complete */
+               printk(" =======================\n");
+       }
+}
+
+void show_stack(struct task_struct *task, unsigned long *esp)
+{
+       unsigned long *sp;
+       int n;
+
+       /* No pointer given: use the task's saved esp, else our own frame. */
+       if (!esp)
+               esp = task ? (unsigned long *)task->thread.esp
+                          : (unsigned long *)&esp;
+
+       /* Raw dump, eight words per row, at most kstack_depth_to_print words. */
+       sp = esp;
+       for (n = 0; n < kstack_depth_to_print; n++) {
+               if (kstack_end(sp))
+                       break;
+               if (n != 0 && n % 8 == 0)
+                       printk("\n       ");
+               printk("%08lx ", *sp++);
+       }
+
+       printk("\nCall Trace:\n");
+       show_trace(task, esp);
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+       unsigned long stack;    /* never read; only its address is used as the trace start */
+
+       show_trace(current, &stack);
+}
+
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+       int i;
+       int in_kernel = 1;
+       unsigned long esp;
+       unsigned short ss;
+
+       esp = (unsigned long) (&regs->esp);     /* kernel-mode default: address of the esp slot inside regs */
+       ss = __KERNEL_DS;
+       if (regs->xcs & 2) {    /* bit 1 of the saved CS set: treated as "not in kernel" */
+               in_kernel = 0;
+               esp = regs->esp;
+               ss = regs->xss & 0xffff;
+       }
+       print_modules();
+       printk("CPU:    %d\nEIP:    %04x:[<%08lx>]    %s VLI\nEFLAGS: %08lx"
+                       "   (%s) \n",
+               smp_processor_id(), 0xffff & regs->xcs, regs->eip,
+               print_tainted(), regs->eflags, system_utsname.release);
+       print_symbol("EIP is at %s\n", regs->eip);
+       printk("eax: %08lx   ebx: %08lx   ecx: %08lx   edx: %08lx\n",
+               regs->eax, regs->ebx, regs->ecx, regs->edx);
+       printk("esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
+               regs->esi, regs->edi, regs->ebp, esp);
+       printk("ds: %04x   es: %04x   ss: %04x\n",
+               regs->xds & 0xffff, regs->xes & 0xffff, ss);
+       printk("Process %s (pid: %d, threadinfo=%p task=%p)",
+               current->comm, current->pid, current_thread_info(), current);
+       /*
+        * When in-kernel, we also print out the stack and code at the
+        * time of the fault..
+        */
+       if (in_kernel) {
+               u8 *eip;
+
+               printk("\nStack: ");
+               show_stack(NULL, (unsigned long*)esp);
+
+               printk("Code: ");
+
+               eip = (u8 *)regs->eip - 43;     /* 64-byte window starting 43 bytes before the saved eip */
+               for (i = 0; i < 64; i++, eip++) {
+                       unsigned char c;
+
+                       if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) { /* below PAGE_OFFSET or unreadable */
+                               printk(" Bad EIP value.");
+                               break;
+                       }
+                       if (eip == (u8 *)regs->eip)
+                               printk("<%02x> ", c);   /* the byte at the saved eip is bracketed */
+                       else
+                               printk("%02x ", c);
+               }
+       }
+       printk("\n");
+}
+
+static void handle_BUG(struct pt_regs *regs)
+{
+       unsigned short ud2;
+       unsigned short line;
+       char *file;
+       char c;
+       unsigned long eip;
+
+       if (regs->xcs & 2)
+               goto no_bug;            /* Not in kernel */
+
+       eip = regs->eip;
+
+       if (eip < PAGE_OFFSET)
+               goto no_bug;
+       if (__get_user(ud2, (unsigned short *)eip))
+               goto no_bug;
+       if (ud2 != 0x0b0f)      /* the 16-bit marker expected at eip; anything else is not a BUG */
+               goto no_bug;
+       if (__get_user(line, (unsigned short *)(eip + 2)))      /* line number is read from eip + 2 */
+               goto bug;
+       if (__get_user(file, (char **)(eip + 4)) ||     /* file-name pointer is read from eip + 4 */
+               (unsigned long)file < PAGE_OFFSET || __get_user(c, file))
+               file = "<bad filename>";
+
+       printk("------------[ cut here ]------------\n");
+       printk(KERN_ALERT "kernel BUG at %s:%d!\n", file, line);
+
+no_bug:
+       return;
+
+       /* Here we know it was a BUG but file-n-line is unavailable */
+bug:
+       printk("Kernel BUG\n");
+}
+
+void die(const char * str, struct pt_regs * regs, long err)
+{
+       static struct {
+               spinlock_t lock;
+               u32 lock_owner;
+               int lock_owner_depth;
+       } die = {
+               .lock =                 SPIN_LOCK_UNLOCKED,
+               .lock_owner =           -1,
+               .lock_owner_depth =     0
+       };
+       static int die_counter;
+
+       if (die.lock_owner != _smp_processor_id()) {    /* this CPU does not already own the die lock */
+               console_verbose();
+               spin_lock_irq(&die.lock);
+               die.lock_owner = smp_processor_id();
+               die.lock_owner_depth = 0;
+               bust_spinlocks(1);
+       }
+
+       if (++die.lock_owner_depth < 3) {       /* full report only for the first two nested entries */
+               int nl = 0;
+               handle_BUG(regs);
+               printk(KERN_ALERT "%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+               printk("PREEMPT ");
+               nl = 1;
+#endif
+#ifdef CONFIG_SMP
+               printk("SMP ");
+               nl = 1;
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+               printk("DEBUG_PAGEALLOC");
+               nl = 1;
+#endif
+               if (nl)
+                       printk("\n");
+               notify_die(DIE_OOPS, (char *)str, regs, err, 255, SIGSEGV);
+               show_registers(regs);
+       } else
+               printk(KERN_ERR "Recursive die() failure, output suppressed\n");
+
+       bust_spinlocks(0);
+       die.lock_owner = -1;
+       spin_unlock_irq(&die.lock);
+       if (in_interrupt())
+               panic("Fatal exception in interrupt");
+
+       if (panic_on_oops) {
+               printk(KERN_EMERG "Fatal exception: panic in 5 seconds\n");
+               ssleep(5);
+               panic("Fatal exception");
+       }
+       do_exit(SIGSEGV);       /* otherwise only the current task is terminated */
+}
+
+static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
+{
+       if (!(regs->eflags & VM_MASK) && !(2 & regs->xcs))      /* neither vm86 nor the "not in kernel" CS bit */
+               die(str, regs, err);
+}
+
+static void do_trap(int trapnr, int signr, char *str, int vm86,
+                          struct pt_regs * regs, long error_code, siginfo_t *info)
+{
+       if (regs->eflags & VM_MASK) {
+               if (vm86)       /* this trap type is offered to the vm86 handler first */
+                       goto vm86_trap;
+               goto trap_signal;
+       }
+
+       if (!(regs->xcs & 2))
+               goto kernel_trap;
+
+       trap_signal: {  /* record the trap in the task and deliver signr */
+               struct task_struct *tsk = current;
+               tsk->thread.error_code = error_code;
+               tsk->thread.trap_no = trapnr;
+               if (info)
+                       force_sig_info(signr, info, tsk);
+               else
+                       force_sig(signr, tsk);
+               return;
+       }
+
+       kernel_trap: {  /* die unless fixup_exception() returns non-zero */
+               if (!fixup_exception(regs))
+                       die(str, regs, error_code);
+               return;
+       }
+
+       vm86_trap: {    /* a non-zero result from the vm86 handler falls back to the signal path */
+               int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
+               if (ret) goto trap_signal;
+               return;
+       }
+}
+
+#define DO_ERROR(trapnr, signr, str, name) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+                                               == NOTIFY_STOP) \
+               return; /* a die-chain notifier asked us to stop */ \
+       do_trap(trapnr, signr, str, 0, regs, error_code, NULL); /* vm86 = 0, no siginfo */ \
+}
+
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+       siginfo_t info; \
+       info.si_signo = signr; \
+       info.si_errno = 0; \
+       info.si_code = sicode; \
+       info.si_addr = (void __user *)siaddr; /* siaddr is expanded here, so it may name regs */ \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+                                               == NOTIFY_STOP) \
+               return; \
+       do_trap(trapnr, signr, str, 0, regs, error_code, &info); /* vm86 = 0, with siginfo */ \
+}
+
+#define DO_VM86_ERROR(trapnr, signr, str, name) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+                                               == NOTIFY_STOP) \
+               return; \
+       do_trap(trapnr, signr, str, 1, regs, error_code, NULL); /* vm86 = 1, no siginfo */ \
+}
+
+#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+fastcall void do_##name(struct pt_regs * regs, long error_code) \
+{ \
+       siginfo_t info; \
+       info.si_signo = signr; \
+       info.si_errno = 0; \
+       info.si_code = sicode; \
+       info.si_addr = (void __user *)siaddr; \
+       if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+                                               == NOTIFY_STOP) \
+               return; \
+       do_trap(trapnr, signr, str, 1, regs, error_code, &info); /* vm86 = 1, with siginfo */ \
+}
+
+DO_VM86_ERROR_INFO( 0, SIGFPE,  "divide error", divide_error, FPE_INTDIV, 
regs->eip)
+#ifndef CONFIG_KPROBES
+DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
+#endif
+DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
+DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO( 6, SIGILL,  "invalid operand", invalid_op, ILL_ILLOPN, 
regs->eip)
+DO_VM86_ERROR( 7, SIGSEGV, "device not available", device_not_available)
+DO_ERROR( 9, SIGFPE,  "coprocessor segment overrun", 
coprocessor_segment_overrun)
+DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
+DO_ERROR(11, SIGBUS,  "segment not present", segment_not_present)
+DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
+#ifdef CONFIG_X86_MCE
+DO_ERROR(18, SIGBUS, "machine check", machine_check)
+#endif
+
+fastcall void do_general_protection(struct pt_regs * regs, long error_code)
+{
+       /*
+        * If we trapped on an LDT access then ensure that the default_ldt is
+        * loaded, if nothing else. We load default_ldt lazily because LDT
+        * switching costs time and many applications don't need it.
+        */
+       if (unlikely((error_code & 6) == 4)) {  /* error-code bit 2 set and bit 1 clear */
+               unsigned long ldt;
+               __asm__ __volatile__ ("sldt %0" : "=r" (ldt));
+               if (ldt == 0) { /* sldt read back 0: install default_ldt and return without a signal */
+                       xen_set_ldt((unsigned long)&default_ldt[0], 5); /* 5 is presumably the entry count -- TODO confirm */
+                       return;
+               }
+       }
+
+       if (regs->eflags & VM_MASK)
+               goto gp_in_vm86;
+
+       if (!(regs->xcs & 2))
+               goto gp_in_kernel;
+
+       current->thread.error_code = error_code;        /* remaining case: record trap 13, send SIGSEGV */
+       current->thread.trap_no = 13;
+       force_sig(SIGSEGV, current);
+       return;
+
+gp_in_vm86:
+       local_irq_enable();
+       handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
+       return;
+
+gp_in_kernel:
+       if (!fixup_exception(regs)) {   /* fixup_exception() returned 0: notify, then die */
+               if (notify_die(DIE_GPF, "general protection fault", regs,
+                               error_code, 13, SIGSEGV) == NOTIFY_STOP)
+                       return;
+               die("general protection fault", regs, error_code);
+       }
+}
+
+static void mem_parity_error(unsigned char reason, struct pt_regs * regs)      /* regs is not used */
+{
+       printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
+       printk("You probably have a hardware problem with your RAM chips\n");
+
+       /* Clear and disable the memory parity error line. */
+       clear_mem_error(reason);
+}
+
+static void io_check_error(unsigned char reason, struct pt_regs * regs)
+{
+       unsigned long ticks;
+
+       printk("NMI: IOCK error (debug interrupt?)\n");
+       show_registers(regs);
+
+       /* Re-enable the IOCK line, wait for a few seconds */
+       reason = (reason & 0xf) | 8;    /* keep the low nibble, set bit 3 */
+       outb(reason, 0x61);
+       for (ticks = 2000; --ticks; )   /* same iteration count as the original countdown */
+               udelay(1000);
+       reason &= ~8;                   /* clear bit 3 again */
+       outb(reason, 0x61);
+}
+
+static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)     /* regs is not used */
+{
+#ifdef CONFIG_MCA
+       /* Might actually be able to figure out what the guilty party
+       * is. */
+       if( MCA_bus ) {
+               mca_handle_nmi();
+               return; /* handed to the MCA code; skip the generic messages */
+       }
+#endif
+       printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+               reason, smp_processor_id());
+       printk("Dazed and confused, but trying to continue\n");
+       printk("Do you have a strange power saving mode enabled?\n");
+}
+
+static DEFINE_SPINLOCK(nmi_print_lock);        /* held by die_nmi() while it prints its report */
+
+void die_nmi (struct pt_regs *regs, const char *msg)
+{
+       spin_lock(&nmi_print_lock);
+       /*
+       * We are in trouble anyway, lets at least try
+       * to get a message out.
+       */
+       bust_spinlocks(1);
+       printk("%s", msg);      /* msg is data: never let it be parsed as a format string */
+       printk(" on CPU%d, eip %08lx, registers:\n",
+               smp_processor_id(), regs->eip);
+       show_registers(regs);
+       printk("console shuts up ...\n");
+       console_silent();
+       spin_unlock(&nmi_print_lock);
+       bust_spinlocks(0);
+       do_exit(SIGSEGV);
+}
+
+static void default_do_nmi(struct pt_regs * regs)
+{
+       unsigned char reason = 0;
+
+       /* Only the BSP gets external NMIs from the system.  */
+       if (!smp_processor_id())
+               reason = get_nmi_reason();      /* every other CPU keeps reason == 0 */
+
+       if (!(reason & 0xc0)) { /* neither 0x80 nor 0x40 is set */
+               if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
+                                                       == NOTIFY_STOP)
+                       return;
+#ifdef CONFIG_X86_LOCAL_APIC
+               /*
+                * Ok, so this is none of the documented NMI sources,
+                * so it must be the NMI watchdog.
+                */
+               if (nmi_watchdog) {
+                       nmi_watchdog_tick(regs);
+                       return;
+               }
+#endif
+               unknown_nmi_error(reason, regs);
+               return;
+       }
+       if (notify_die(DIE_NMI, "nmi", regs, reason, 0, SIGINT) == NOTIFY_STOP)
+               return;
+       if (reason & 0x80)      /* 0x80 selects the memory parity path */
+               mem_parity_error(reason, regs);
+       if (reason & 0x40)      /* 0x40 selects the I/O check path */
+               io_check_error(reason, regs);
+       /*
+        * Reassert NMI in case it became active meanwhile
+        * as it's edge-triggered.
+        */
+       reassert_nmi();
+}
+
+static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
+{
+       return 0;       /* 0 makes do_nmi() fall through to default_do_nmi() */
+}
+ 
+static nmi_callback_t nmi_callback = dummy_nmi_callback;       /* hook called by do_nmi(); starts as the dummy */
+ 
+fastcall void do_nmi(struct pt_regs * regs, long error_code)   /* error_code is not used */
+{
+       int cpu;
+
+       nmi_enter();
+
+       cpu = smp_processor_id();
+
+#ifdef CONFIG_HOTPLUG_CPU
+       if (!cpu_online(cpu)) { /* CPU not online: leave without counting or handling */
+               nmi_exit();
+               return;
+       }
+#endif
+
+       ++nmi_count(cpu);
+
+       if (!nmi_callback(regs, cpu))   /* a zero return from the hook selects default handling */
+               default_do_nmi(regs);
+
+       nmi_exit();
+}
+
+void set_nmi_callback(nmi_callback_t callback)
+{
+       nmi_callback = callback;        /* plain store; replaces whatever hook was installed */
+}
+
+void unset_nmi_callback(void)
+{
+       nmi_callback = dummy_nmi_callback;      /* back to the hook that always returns 0 */
+}
+
+#ifdef CONFIG_KPROBES
+fastcall void do_int3(struct pt_regs *regs, long error_code)
+{
+       if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+                       == NOTIFY_STOP)
+               return; /* a notifier on the die chain claimed the breakpoint */
+       /* This is an interrupt gate, because kprobes wants interrupts
+       disabled.  Normal trap handlers don't. */
+       restore_interrupts(regs);
+       do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); /* same arguments DO_VM86_ERROR would pass */
+}
+#endif
+
+/*
+ * Our handling of the processor debug registers is non-trivial.
+ * We do not clear them on entry and exit from the kernel. Therefore
+ * it is possible to get a watchpoint trap here from inside the kernel.
+ * However, the code in ./ptrace.c has ensured that the user can
+ * only set watchpoints on userspace addresses. Therefore the in-kernel
+ * watchpoint trap can only occur in code which is reading/writing
+ * from user space. Such code must not hold kernel locks (since it
+ * can equally take a page fault), therefore it is safe to call
+ * force_sig_info even though that claims and releases locks.
+ * 
+ * Code in ./signal.c ensures that the debug control register
+ * is restored before we deliver any signal, and therefore that
+ * user code runs with the correct debug control register even though
+ * we clear it here.
+ *
+ * Being careful here means that we don't have to be as careful in a
+ * lot of more complicated places (task switching can be a bit lazy
+ * about restoring all the debug state, and ptrace doesn't have to
+ * find every occurrence of the TF bit that could be saved away even
+ * by user code)
+ */
+fastcall void do_debug(struct pt_regs * regs, long error_code)
+{
+       unsigned int condition;
+       struct task_struct *tsk = current;
+
+       condition = HYPERVISOR_get_debugreg(6); /* debug register 6, obtained through the hypervisor */
+
+       if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
+                                       SIGTRAP) == NOTIFY_STOP)
+               return;
+#if 0
+       /* It's safe to allow irq's after DR6 has been saved */
+       if (regs->eflags & X86_EFLAGS_IF)
+               local_irq_enable();
+#endif
+
+       /* Mask out spurious debug traps due to lazy DR7 setting */
+       if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
+               if (!tsk->thread.debugreg[7])   /* the task's saved debugreg[7] is zero */
+                       goto clear_dr7;
+       }
+
+       if (regs->eflags & VM_MASK)
+               goto debug_vm86;
+
+       /* Save debug status register where ptrace can see it */
+       tsk->thread.debugreg[6] = condition;
+
+       /*
+        * Single-stepping through TF: make sure we ignore any events in
+        * kernel space (but re-enable TF when returning to user mode).
+        */
+       if (condition & DR_STEP) {
+               /*
+                * We already checked v86 mode above, so we can
+                * check for kernel mode by just checking the CPL
+                * of CS.
+                */
+               if ((regs->xcs & 2) == 0)
+                       goto clear_TF_reenable;
+       }
+
+       /* Ok, finally something we can handle */
+       send_sigtrap(tsk, regs, error_code);
+
+       /* Disable additional traps. They'll be re-enabled when
+        * the signal is delivered.
+        */
+clear_dr7:
+       HYPERVISOR_set_debugreg(7, 0);  /* debug register 7 is written through the hypervisor as well */
+       return;
+
+debug_vm86:
+       handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);      /* last argument is the trap number, 1 */
+       return;
+
+clear_TF_reenable:
+       set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
+       regs->eflags &= ~TF_MASK;       /* clear TF in the saved flags */
+       return;
+}
+
+/*
+ * Note that we play around with the 'TS' bit in an attempt to get
+ * the correct behaviour even in the presence of the asynchronous
+ * IRQ13 behaviour
+ */
+void math_error(void __user *eip)
+{
+       struct task_struct * task;
+       siginfo_t info;
+       unsigned short cwd, swd;
+
+       /*
+        * Save the info for the exception handler and clear the error.
+        */
+       task = current;
+       save_init_fpu(task);
+       task->thread.trap_no = 16;
+       task->thread.error_code = 0;
+       info.si_signo = SIGFPE;
+       info.si_errno = 0;
+       info.si_code = __SI_FAULT;      /* default when no single exception is identified below */
+       info.si_addr = eip;
+       /*
+        * (~cwd & swd) will mask out exceptions that are not set to unmasked
+        * status.  0x3f is the exception bits in these regs, 0x200 is the
+        * C1 reg you need in case of a stack fault, 0x040 is the stack
+        * fault bit.  We should only be taking one exception at a time,
+        * so if this combination doesn't produce any single exception,
+        * then we have a bad program that isn't synchronizing its FPU usage
+        * and it will suffer the consequences since we won't be able to
+        * fully reproduce the context of the exception
+        */
+       cwd = get_fpu_cwd(task);
+       swd = get_fpu_swd(task);
+       switch (((~cwd) & swd & 0x3f) | (swd & 0x240)) {
+               case 0x000:
+               default:
+                       break;
+               case 0x001: /* Invalid Op */
+               case 0x041: /* Stack Fault */
+               case 0x241: /* Stack Fault | Direction */
+                       info.si_code = FPE_FLTINV;
+                       /* Should we clear the SF or let user space do it ???? */
+                       break;
+               case 0x002: /* Denormalize */
+               case 0x010: /* Underflow */
+                       info.si_code = FPE_FLTUND;
+                       break;
+               case 0x004: /* Zero Divide */
+                       info.si_code = FPE_FLTDIV;
+                       break;
+               case 0x008: /* Overflow */
+                       info.si_code = FPE_FLTOVF;
+                       break;
+               case 0x020: /* Precision */
+                       info.si_code = FPE_FLTRES;
+                       break;
+       }
+       force_sig_info(SIGFPE, &info, task);
+}
+
+fastcall void do_coprocessor_error(struct pt_regs * regs, long error_code)     /* error_code is not used */
+{
+       ignore_fpu_irq = 1;
+       math_error((void __user *)regs->eip);   /* the saved eip becomes the signal's si_addr */
+}
+
+static void simd_math_error(void __user *eip)
+{
+       struct task_struct * task;
+       siginfo_t info;
+       unsigned short mxcsr;
+
+       /*
+        * Save the info for the exception handler and clear the error.
+        */
+       task = current;
+       save_init_fpu(task);
+       task->thread.trap_no = 19;
+       task->thread.error_code = 0;
+       info.si_signo = SIGFPE;
+       info.si_errno = 0;
+       info.si_code = __SI_FAULT;      /* default when no single exception is identified below */
+       info.si_addr = eip;
+       /*
+        * The SIMD FPU exceptions are handled a little differently, as there
+        * is only a single status/control register.  Thus, to determine which
+        * unmasked exception was caught we must mask the exception mask bits
+        * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+        */
+       mxcsr = get_fpu_mxcsr(task);
+       switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {    /* >> 7 lines the mask bits up with the flag bits */
+               case 0x000:
+               default:
+                       break;
+               case 0x001: /* Invalid Op */
+                       info.si_code = FPE_FLTINV;
+                       break;
+               case 0x002: /* Denormalize */
+               case 0x010: /* Underflow */
+                       info.si_code = FPE_FLTUND;
+                       break;
+               case 0x004: /* Zero Divide */
+                       info.si_code = FPE_FLTDIV;
+                       break;
+               case 0x008: /* Overflow */
+                       info.si_code = FPE_FLTOVF;
+                       break;
+               case 0x020: /* Precision */
+                       info.si_code = FPE_FLTRES;
+                       break;
+       }
+       force_sig_info(SIGFPE, &info, task);
+}
+
+fastcall void do_simd_coprocessor_error(struct pt_regs * regs,
+                                         long error_code)
+{
+       if (cpu_has_xmm) {
+               /* Handle SIMD FPU exceptions on PIII+ processors. */
+               ignore_fpu_irq = 1;
+               simd_math_error((void __user *)regs->eip);
+       } else {
+               /*
+                * Handle strange cache flush from user space exception
+                * in all other cases.  This is undocumented behaviour.
+                */
+               if (regs->eflags & VM_MASK) {
+                       handle_vm86_fault((struct kernel_vm86_regs *)regs,
+                                         error_code);
+                       return;
+               }
+               die_if_kernel("cache flush denied", regs, error_code);
+               current->thread.trap_no = 19;   /* reached only if die_if_kernel() returned */
+               current->thread.error_code = error_code;
+               force_sig(SIGSEGV, current);
+       }
+}
+
+fastcall void setup_x86_bogus_stack(unsigned char * stk)
+{
+       unsigned long *switch16_ptr, *switch32_ptr;
+       struct pt_regs *regs;
+       unsigned long stack_top, stack_bot;
+       unsigned short iret_frame16_off;
+       int cpu = smp_processor_id();
+       /* reserve the space on 32bit stack for the magic switch16 pointer */
+       memmove(stk, stk + 8, sizeof(struct pt_regs));  /* slide pt_regs down 8 bytes to open a gap above it */
+       switch16_ptr = (unsigned long *)(stk + sizeof(struct pt_regs));
+       regs = (struct pt_regs *)stk;
+       /* now the switch32 on 16bit stack */
+       stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+       stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
+       switch32_ptr = (unsigned long *)(stack_top - 8);        /* last 8 bytes of the per-cpu 16-bit stack */
+       iret_frame16_off = CPU_16BIT_STACK_SIZE - 8 - 20;       /* 20-byte frame placed just below switch32_ptr */
+       /* copy iret frame on 16bit stack */
+       memcpy((void *)(stack_bot + iret_frame16_off), &regs->eip, 20);
+       /* fill in the switch pointers */
+       switch16_ptr[0] = (regs->esp & 0xffff0000) | iret_frame16_off; /* high half of saved esp, low half = frame offset */
+       switch16_ptr[1] = __ESPFIX_SS;
+       switch32_ptr[0] = (unsigned long)stk + sizeof(struct pt_regs) +
+               8 - CPU_16BIT_STACK_SIZE;
+       switch32_ptr[1] = __KERNEL_DS;
+}
+
+fastcall unsigned char * fixup_x86_bogus_stack(unsigned short sp)
+{
+       unsigned long *switch32_ptr;
+       unsigned char *stack16, *stack32;
+       unsigned long stack_top, stack_bot;
+       int len;
+       int cpu = smp_processor_id();
+       stack_bot = (unsigned long)&per_cpu(cpu_16bit_stack, cpu);
+       stack_top = stack_bot + CPU_16BIT_STACK_SIZE;
+       switch32_ptr = (unsigned long *)(stack_top - 8);        /* same slot setup_x86_bogus_stack() fills in */
+       /* copy the data from 16bit stack to 32bit stack */
+       len = CPU_16BIT_STACK_SIZE - 8 - sp;    /* bytes between offset sp and the switch32 slot */
+       stack16 = (unsigned char *)(stack_bot + sp);
+       stack32 = (unsigned char *)
+               (switch32_ptr[0] + CPU_16BIT_STACK_SIZE - 8 - len);
+       memcpy(stack32, stack16, len);
+       return stack32; /* start of the copied data on the 32-bit side */
+}
+
+/*
+ *  'math_state_restore()' saves the current math information in the
+ * old math state array, and gets the new ones from the current task
+ *
+ * Careful.. There are problems with IBM-designed IRQ13 behaviour.
+ * Don't touch unless you *really* know how it works.
+ *
+ * Must be called with kernel preemption disabled (in this case,
+ * local interrupts are disabled at the call-site in entry.S).
+ */
+asmlinkage void math_state_restore(struct pt_regs regs)        /* regs is not referenced in the body */
+{
+       struct thread_info *thread = current_thread_info();
+       struct task_struct *tsk = thread->task;
+
+       /* NB. 'clts' is done for us by Xen during virtual trap. */
+       if (!tsk_used_math(tsk))
+               init_fpu(tsk);  /* task has not used math yet: initialise its FPU state first */
+       restore_fpu(tsk);
+       thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
+}
+
+#ifndef CONFIG_MATH_EMULATION
+
+asmlinkage void math_emulate(long arg) /* stub used when math emulation is configured out; arg is ignored */
+{
+       printk("math-emulation not enabled and no coprocessor found.\n");
+       printk("killing %s.\n",current->comm);
+       force_sig(SIGFPE,current);
+       schedule();
+}
+
+#endif /* CONFIG_MATH_EMULATION */
+
+#ifdef CONFIG_X86_F00F_BUG
+void __init trap_init_f00f_bug(void)
+{
+       __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);   /* map idt_table at the FIX_F00F_IDT fixmap slot */
+
+       /*
+        * Update the IDT descriptor and reload the IDT so that
+        * it uses the read-only mapped virtual address.
+        */
+       idt_descr.address = fix_to_virt(FIX_F00F_IDT);
+       __asm__ __volatile__("lidt %0" : : "m" (idt_descr));
+}
+#endif
+
+
+/* NB. All these are "trap gates" (i.e. events_mask isn't cleared). */
+static trap_info_t trap_table[] = {    /* columns presumably { vector, flags, cs, address } -- TODO confirm against trap_info_t */
+       {  0, 0, __KERNEL_CS, (unsigned long)divide_error               },
+       {  1, 0, __KERNEL_CS, (unsigned long)debug                      },
+       {  3, 3, __KERNEL_CS, (unsigned long)int3                       },
+       {  4, 3, __KERNEL_CS, (unsigned long)overflow                   },
+       {  5, 3, __KERNEL_CS, (unsigned long)bounds                     },
+       {  6, 0, __KERNEL_CS, (unsigned long)invalid_op                 },
+       {  7, 0, __KERNEL_CS, (unsigned long)device_not_available       },
+       {  9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
+       { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS                },
+       { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present        },
+       { 12, 0, __KERNEL_CS, (unsigned long)stack_segment              },
+       { 13, 0, __KERNEL_CS, (unsigned long)general_protection         },
+       { 14, 0, __KERNEL_CS, (unsigned long)page_fault                 },
+       { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment          },
+       { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error          },
+       { 17, 0, __KERNEL_CS, (unsigned long)alignment_check            },
+#ifdef CONFIG_X86_MCE
+       { 18, 0, __KERNEL_CS, (unsigned long)machine_check              },
+#endif
+       { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error     },
+       { SYSCALL_VECTOR,  3, __KERNEL_CS, (unsigned long)system_call   },
+       {  0, 0,           0, 0                                         }       /* all-zero row; smp_trap_init() stops at address == 0 */
+};
+
+void __init trap_init(void)
+{
+       HYPERVISOR_set_trap_table(trap_table);  /* pass the handler table above to the hypervisor */
+
+       /*
+        * default LDT is a single-entry callgate to lcall7 for iBCS
+        * and a callgate to lcall27 for Solaris/x86 binaries
+        */
+       make_lowmem_page_readonly(&default_ldt[0]);
+
+       /*
+        * Should be a barrier for any external CPU state.
+        */
+       cpu_init();
+}
+
+void smp_trap_init(trap_info_t *trap_ctxt)
+{
+       trap_info_t *src;       /* walks trap_table until the address == 0 entry */
+       for (src = trap_table; src->address != 0; src++) {
+               trap_info_t *dst = &trap_ctxt[src->vector];     /* destination slot is indexed by vector */
+               dst->flags = src->flags;
+               dst->cs = src->cs;
+               dst->address = src->address;
+       }
+}
+
+static int __init kstack_setup(char *s)        /* "kstack=<n>": number of words show_stack() dumps */
+{
+       kstack_depth_to_print = simple_strtoul(s, NULL, 0);     /* base 0: simple_strtoul picks the radix */
+       return 1;       /* non-zero marks the option as consumed by this __setup handler */
+}
+__setup("kstack=", kstack_setup);
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/vsyscall.S
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/kernel/vsyscall.S
@@ -0,0 +1,15 @@
+#include <linux/init.h>
+
+/*
+ * Embed the two prebuilt vsyscall images (int80 and sysenter variants) as
+ * raw bytes.  Each image is bracketed by a pair of exported labels so that
+ * other code can find where it starts and ends.
+ *
+ * NOTE(review): __INITDATA / __FINIT come from <linux/init.h>; presumably
+ * they switch to the init-data section and back -- confirm.
+ */
+__INITDATA
+
+       /* int $0x80 flavour: bytes between the _start and _end labels */
+       .globl vsyscall_int80_start, vsyscall_int80_end
+vsyscall_int80_start:
+       .incbin "arch/i386/mach-xen/kernel/vsyscall-int80.so"
+vsyscall_int80_end:
+
+       /* sysenter flavour: bytes between the _start and _end labels */
+       .globl vsyscall_sysenter_start, vsyscall_sysenter_end
+vsyscall_sysenter_start:
+       .incbin "arch/i386/mach-xen/kernel/vsyscall-sysenter.so"
+vsyscall_sysenter_end:
+
+__FINIT
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/Makefile
@@ -0,0 +1 @@
+# Always descend into the kernel/ and mm/ subdirectories of mach-xen.
+obj-y          := kernel/ mm/
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/fault.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/fault.c
@@ -0,0 +1,561 @@
+/*
+ *  linux/arch/i386/mm/fault.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ */
+
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>             /* For unblank_screen() */
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/desc.h>
+#include <asm/kdebug.h>
+
+extern void die(const char *,struct pt_regs *,long);
+
+DEFINE_PER_CPU(pgd_t *, cur_pgd);
+
+/*
+ * Unlock any spinlocks which will prevent us from getting the
+ * message out 
+ */
+/*
+ * yes != 0: mark an oops as being in progress and return.
+ * yes == 0: clear the flag again, unblank the screen (CONFIG_VT only) and
+ *           emit one printk at a temporarily raised console_loglevel.
+ */
+void bust_spinlocks(int yes)
+{
+       int loglevel_save = console_loglevel;
+
+       if (yes) {
+               oops_in_progress = 1;
+               return;
+       }
+#ifdef CONFIG_VT
+       unblank_screen();
+#endif
+       oops_in_progress = 0;
+       /*
+        * OK, the message is on the console.  Now we call printk()
+        * without oops_in_progress set so that printk will give klogd
+        * a poke.  Hold onto your hats...
+        */
+       /* NMI oopser may have shut the console up */
+       console_loglevel = 15;
+       printk(" ");
+       console_loglevel = loglevel_save;
+}
+
+/*
+ * Return EIP plus the CS segment base.  The segment limit is also
+ * adjusted, clamped to the kernel/user address space (whichever is
+ * appropriate), and returned in *eip_limit.
+ *
+ * The segment is checked, because it might have been changed by another
+ * task between the original faulting instruction and here.
+ *
+ * If CS is no longer a valid code segment, or if EIP is beyond the
+ * limit, or if it is a kernel address when CS is not a kernel segment,
+ * then the returned value will be greater than *eip_limit.
+ * 
+ * This is slow, but is very rarely executed.
+ */
+static inline unsigned long get_segment_eip(struct pt_regs *regs,
+                                           unsigned long *eip_limit)
+{
+       unsigned long eip = regs->eip;
+       unsigned seg = regs->xcs & 0xffff;      /* CS selector: low 16 bits of xcs */
+       u32 seg_ar, seg_limit, base, *desc;
+
+       /*
+        * The standard kernel/user address space limit.  The choice is
+        * keyed on bit 1 of the selector (the upper RPL bit).
+        */
+       *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
+
+       /*
+        * Unlikely, but must come before segment checks.  In vm86 mode the
+        * linear address is simply selector * 16 + eip.
+        */
+       if (unlikely((regs->eflags & VM_MASK) != 0))
+               return eip + (seg << 4);
+       
+       /* By far the most common cases. */
+       if (likely(seg == __USER_CS || seg == __KERNEL_CS))
+               return eip;
+
+       /* Check the segment exists, is within the current LDT/GDT size,
+          that kernel/user (ring 0..3) has the appropriate privilege,
+          that it's a code segment, and get the limit.
+          seg_ar is preloaded with 0 through the "0" (0) input constraint,
+          so if lar leaves it untouched the test below fails and the
+          (then unset) seg_limit is never looked at. */
+       __asm__ ("larl %3,%0; lsll %3,%1"
+                : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
+       /* All three access-rights bits in 0x9800 must be set. */
+       if ((~seg_ar & 0x9800) || eip > seg_limit) {
+               *eip_limit = 0;
+               return 1;        /* So that returned eip > *eip_limit. */
+       }
+
+       /* Get the GDT/LDT descriptor base. 
+          When you look for races in this code remember that
+          LDT and other horrors are only used in user space.
+          Bit 2 of the selector picks the table: set = LDT, clear = GDT. */
+       if (seg & (1<<2)) {
+               /* Must lock the LDT while reading it. */
+               down(&current->mm->context.sem);
+               desc = current->mm->context.ldt;
+               desc = (void *)desc + (seg & ~7);       /* byte offset of the entry */
+       } else {
+               /* Must disable preemption while reading the GDT. */
+               desc = (u32 *)get_cpu_gdt_table(get_cpu());
+               desc = (void *)desc + (seg & ~7);       /* byte offset of the entry */
+       }
+
+       /* Decode the code segment base from the descriptor */
+       base = get_desc_base((unsigned long *)desc);
+
+       /* Undo whichever protection was taken above. */
+       if (seg & (1<<2)) { 
+               up(&current->mm->context.sem);
+       } else
+               put_cpu();
+
+       /* Adjust EIP and segment limit, and clamp at the kernel limit.
+          It's legitimate for segments to wrap at 0xffffffff.
+          The "seg_limit >= base" test skips the clamp when the addition
+          wrapped around. */
+       seg_limit += base;
+       if (seg_limit < *eip_limit && seg_limit >= base)
+               *eip_limit = seg_limit;
+       return eip + base;
+}
+
+/* 
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Check that here and ignore it.
+ */
+/*
+ * Decode the instruction at the faulting EIP far enough to tell whether it
+ * is a prefetch (opcode 0x0F 0x0D or 0x0F 0x18).  Returns 1 if so, else 0.
+ *
+ * regs: register state of the fault; supplies CS:EIP via get_segment_eip().
+ * addr: faulting address; not used by the scan below.
+ */
+static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
+{ 
+       unsigned long limit;
+       unsigned long instr = get_segment_eip (regs, &limit);
+       int scan_more = 1;
+       int prefetch = 0; 
+       int i;
+
+       /* Look at no more than 15 bytes, skipping over instruction prefixes. */
+       for (i = 0; scan_more && i < 15; i++) { 
+               unsigned char opcode;
+               unsigned char instr_hi;
+               unsigned char instr_lo;
+
+               /* Stop at the segment limit or if the byte cannot be read. */
+               if (instr > limit)
+                       break;
+               if (__get_user(opcode, (unsigned char *) instr))
+                       break; 
+
+               /* Split the byte into nibbles; classification is per high nibble. */
+               instr_hi = opcode & 0xf0; 
+               instr_lo = opcode & 0x0f; 
+               instr++;
+
+               switch (instr_hi) { 
+               case 0x20:
+               case 0x30:
+                       /* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
+                       scan_more = ((instr_lo & 7) == 0x6);
+                       break;
+                       
+               case 0x60:
+                       /* 0x64 thru 0x67 are valid prefixes in all modes. */
+                       scan_more = (instr_lo & 0xC) == 0x4;
+                       break;          
+               case 0xF0:
+                       /* 0xF0, 0xF2, and 0xF3 are valid prefixes */
+                       scan_more = !instr_lo || (instr_lo>>1) == 1;
+                       break;                  
+               case 0x00:
+                       /* Prefetch instruction is 0x0F0D or 0x0F18 */
+                       scan_more = 0;
+                       /* Same limit/readability checks for the second byte. */
+                       if (instr > limit)
+                               break;
+                       if (__get_user(opcode, (unsigned char *) instr)) 
+                               break;
+                       prefetch = (instr_lo == 0xF) &&
+                               (opcode == 0x0D || opcode == 0x18);
+                       break;                  
+               default:
+                       /* Any other opcode ends the scan: not a prefetch. */
+                       scan_more = 0;
+                       break;
+               } 
+       }
+       return prefetch;
+}
+
+/*
+ * Cheap front end for __is_prefetch(): the instruction scan only runs when
+ * the boot CPU is an AMD part of family 6 or later, and is skipped when NX
+ * is enabled and bit 4 (value 16) of error_code is set.
+ * Returns the result of __is_prefetch(), or 0 when the scan is not run.
+ */
+static inline int is_prefetch(struct pt_regs *regs, unsigned long addr,
+                             unsigned long error_code)
+{
+       int affected_cpu = boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+                          boot_cpu_data.x86 >= 6;
+
+       if (likely(!affected_cpu))
+               return 0;
+
+       /* Catch an obscure case of prefetch inside an NX page. */
+       if (nx_enabled && (error_code & 16))
+               return 0;
+
+       return __is_prefetch(regs, addr);
+}
+
+fastcall void do_invalid_op(struct pt_regs *, unsigned long);
+
+/*
+ * This routine handles page faults.  It determines the address,
+ * and the problem, and then passes it off to one of the appropriate
+ * routines.
+ *
+ * error_code:
+ *     bit 0 == 0 means no page found, 1 means protection fault
+ *     bit 1 == 0 means read, 1 means write
+ *     bit 2 == 0 means kernel, 1 means user-mode
+ */
+/*
+ * In this variant the faulting address arrives as the third argument, and
+ * the privilege bit of error_code is rebuilt locally from the saved CS and
+ * EFLAGS rather than taken from the incoming value.
+ */
+fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code,
+                             unsigned long address)
+{
+       struct task_struct *tsk;
+       struct mm_struct *mm;
+       struct vm_area_struct * vma;
+       unsigned long page;
+       int write;
+       siginfo_t info;
+
+       /*
+        * Set the "privileged fault" bit to something sane.
+        *
+        * NOTE(review): only bits 0-2 survive this masking, so every later
+        * "error_code & 16" test (here and in is_prefetch()) is always
+        * false in this function.  Confirm that is intended.
+        */
+       error_code &= 3;
+       error_code |= (regs->xcs & 2) << 1;
+       if (regs->eflags & X86_EFLAGS_VM)
+               error_code |= 4;
+
+       if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
+                                       SIGSEGV) == NOTIFY_STOP)
+               return;
+#if 0
+       /* It's safe to allow irq's after cr2 has been saved */
+       if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
+               local_irq_enable();
+#endif
+
+       tsk = current;
+
+       info.si_code = SEGV_MAPERR;
+
+       /*
+        * We fault-in kernel-space virtual memory on-demand. The
+        * 'reference' page table is init_mm.pgd.
+        *
+        * NOTE! We MUST NOT take any locks for this case. We may
+        * be in an interrupt or a critical region, and should
+        * only copy the information from the master page table,
+        * nothing more.
+        *
+        * This verifies that the fault happens in kernel space
+        * (error_code & 4) == 0, and that the fault was not a
+        * protection error (error_code & 1) == 0.
+        */
+       if (unlikely(address >= TASK_SIZE)) { 
+               if (!(error_code & 5))
+                       goto vmalloc_fault;
+               /* 
+                * Don't take the mm semaphore here. If we fixup a prefetch
+                * fault we could otherwise deadlock.
+                */
+               goto bad_area_nosemaphore;
+       } 
+
+       mm = tsk->mm;
+
+       /*
+        * If we're in an interrupt, have no user context or are running in an
+        * atomic region then we must not take the fault..
+        */
+       if (in_atomic() || !mm)
+               goto bad_area_nosemaphore;
+
+       /* When running in the kernel we expect faults to occur only to
+        * addresses in user space.  All other faults represent errors in the
+        * kernel and should generate an OOPS.  Unfortunately, in the case of an
+        * erroneous fault occurring in a code path which already holds mmap_sem
+        * we will deadlock attempting to validate the fault against the
+        * address space.  Luckily the kernel only validly references user
+        * space from well defined areas of code, which are listed in the
+        * exceptions table.
+        *
+        * As the vast majority of faults will be valid we will only perform
+        * the source reference check when there is a possibility of a deadlock.
+        * Attempt to lock the address space, if we cannot we then validate the
+        * source.  If this is invalid we can skip the address space check,
+        * thus avoiding the deadlock.
+        */
+       if (!down_read_trylock(&mm->mmap_sem)) {
+               if ((error_code & 4) == 0 &&
+                   !search_exception_tables(regs->eip))
+                       goto bad_area_nosemaphore;
+               down_read(&mm->mmap_sem);
+       }
+
+       vma = find_vma(mm, address);
+       if (!vma)
+               goto bad_area;
+       if (vma->vm_start <= address)
+               goto good_area;
+       if (!(vma->vm_flags & VM_GROWSDOWN))
+               goto bad_area;
+       if (error_code & 4) {
+               /*
+                * accessing the stack below %esp is always a bug.
+                * The "+ 32" is there due to some instructions (like
+                * pusha) doing post-decrement on the stack and that
+                * doesn't show up until later..
+                */
+               if (address + 32 < regs->esp)
+                       goto bad_area;
+       }
+       if (expand_stack(vma, address))
+               goto bad_area;
+/*
+ * Ok, we have a good vm_area for this memory access, so
+ * we can handle it..
+ */
+good_area:
+       info.si_code = SEGV_ACCERR;
+       write = 0;
+       switch (error_code & 3) {
+               default:        /* 3: write, present */
+#ifdef TEST_VERIFY_AREA
+                       if (regs->cs == KERNEL_CS)
+                               printk("WP fault at %08lx\n", regs->eip);
+#endif
+                       /* fall through */
+               case 2:         /* write, not present */
+                       if (!(vma->vm_flags & VM_WRITE))
+                               goto bad_area;
+                       write++;
+                       break;
+               case 1:         /* read, present */
+                       goto bad_area;
+               case 0:         /* read, not present */
+                       if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
+                               goto bad_area;
+       }
+
+ survive:
+       /*
+        * If for any reason at all we couldn't handle the fault,
+        * make sure we exit gracefully rather than endlessly redo
+        * the fault.
+        */
+       switch (handle_mm_fault(mm, vma, address, write)) {
+               case VM_FAULT_MINOR:
+                       tsk->min_flt++;
+                       break;
+               case VM_FAULT_MAJOR:
+                       tsk->maj_flt++;
+                       break;
+               case VM_FAULT_SIGBUS:
+                       goto do_sigbus;
+               case VM_FAULT_OOM:
+                       goto out_of_memory;
+               default:
+                       BUG();
+       }
+
+       /*
+        * Did it hit the DOS screen memory VA from vm86 mode?
+        */
+       if (regs->eflags & VM_MASK) {
+               unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
+               if (bit < 32)
+                       tsk->thread.screen_bitmap |= 1 << bit;
+       }
+       up_read(&mm->mmap_sem);
+       return;
+
+/*
+ * Something tried to access memory that isn't in our memory map..
+ * Fix it, but check if it's kernel or user first..
+ */
+bad_area:
+       up_read(&mm->mmap_sem);
+
+bad_area_nosemaphore:
+       /* User mode accesses just cause a SIGSEGV */
+       if (error_code & 4) {
+               /* 
+                * Valid to do another page fault here because this one came 
+                * from user space.
+                */
+               if (is_prefetch(regs, address, error_code))
+                       return;
+
+               tsk->thread.cr2 = address;
+               /* Kernel addresses are always protection faults */
+               tsk->thread.error_code = error_code | (address >= TASK_SIZE);
+               tsk->thread.trap_no = 14;
+               info.si_signo = SIGSEGV;
+               info.si_errno = 0;
+               /* info.si_code has been set above */
+               info.si_addr = (void __user *)address;
+               force_sig_info(SIGSEGV, &info, tsk);
+               return;
+       }
+
+#ifdef CONFIG_X86_F00F_BUG
+       /*
+        * Pentium F0 0F C7 C8 bug workaround.
+        */
+       if (boot_cpu_data.f00f_bug) {
+               unsigned long nr;
+               
+               nr = (address - idt_descr.address) >> 3;
+
+               if (nr == 6) {
+                       do_invalid_op(regs, 0);
+                       return;
+               }
+       }
+#endif
+
+no_context:
+       /* Are we prepared to handle this kernel fault?  */
+       if (fixup_exception(regs))
+               return;
+
+       /* 
+        * Valid to do another page fault here, because if this fault
+        * had been triggered by is_prefetch fixup_exception would have 
+        * handled it.
+        */
+       if (is_prefetch(regs, address, error_code))
+               return;
+
+/*
+ * Oops. The kernel tried to access some bad page. We'll have to
+ * terminate things with extreme prejudice.
+ */
+
+       bust_spinlocks(1);
+
+#ifdef CONFIG_X86_PAE
+       if (error_code & 16) {
+               pte_t *pte = lookup_address(address);
+
+               if (pte && pte_present(*pte) && !pte_exec_kernel(*pte))
+                       printk(KERN_CRIT "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n", current->uid);
+       }
+#endif
+       if (address < PAGE_SIZE)
+               printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
+       else
+               printk(KERN_ALERT "Unable to handle kernel paging request");
+       printk(" at virtual address %08lx\n",address);
+       printk(KERN_ALERT " printing eip:\n");
+       printk("%08lx\n", regs->eip);
+       /* Top-level entry for the address, read from this CPU's cur_pgd. */
+       page = ((unsigned long *) per_cpu(cur_pgd, smp_processor_id()))
+           [address >> 22];
+       printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
+              machine_to_phys(page));
+       /*
+        * We must not directly access the pte in the highpte
+        * case, the page table might be allocated in highmem.
+        * And lets rather not kmap-atomic the pte, just in case
+        * it's allocated already.
+        */
+#ifndef CONFIG_HIGHPTE
+       if (page & 1) {
+               /* Entry is present: convert it to a physical address and index it. */
+               page &= PAGE_MASK;
+               address &= 0x003ff000;
+               page = machine_to_phys(page);
+               page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
+               printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
+                      machine_to_phys(page));
+       }
+#endif
+       die("Oops", regs, error_code);
+       bust_spinlocks(0);
+       do_exit(SIGKILL);
+
+/*
+ * We ran out of memory, or some other thing happened to us that made
+ * us unable to handle the page fault gracefully.
+ */
+out_of_memory:
+       up_read(&mm->mmap_sem);
+       if (tsk->pid == 1) {
+               yield();
+               down_read(&mm->mmap_sem);
+               goto survive;
+       }
+       printk("VM: killing process %s\n", tsk->comm);
+       if (error_code & 4)
+               do_exit(SIGKILL);
+       goto no_context;
+
+do_sigbus:
+       up_read(&mm->mmap_sem);
+
+       /* Kernel mode? Handle exceptions or die */
+       if (!(error_code & 4))
+               goto no_context;
+
+       /* User space => ok to do another page fault */
+       if (is_prefetch(regs, address, error_code))
+               return;
+
+       tsk->thread.cr2 = address;
+       tsk->thread.error_code = error_code;
+       tsk->thread.trap_no = 14;
+       info.si_signo = SIGBUS;
+       info.si_errno = 0;
+       info.si_code = BUS_ADRERR;
+       info.si_addr = (void __user *)address;
+       force_sig_info(SIGBUS, &info, tsk);
+       return;
+
+vmalloc_fault:
+       {
+               /*
+                * Synchronize this task's top level page-table
+                * with the 'reference' page table.
+                *
+                * Do _not_ use "tsk" here. We might be inside
+                * an interrupt in the middle of a task switch..
+                */
+               int index = pgd_index(address);
+               pgd_t *pgd, *pgd_k;
+               pud_t *pud, *pud_k;
+               pmd_t *pmd, *pmd_k;
+               pte_t *pte_k;
+
+               pgd = index + per_cpu(cur_pgd, smp_processor_id());
+               pgd_k = init_mm.pgd + index;
+
+               if (!pgd_present(*pgd_k))
+                       goto no_context;
+
+               /*
+                * set_pgd(pgd, *pgd_k); here would be useless on PAE
+                * and redundant with the set_pmd() on non-PAE. As would
+                * set_pud.
+                */
+
+               pud = pud_offset(pgd, address);
+               pud_k = pud_offset(pgd_k, address);
+               if (!pud_present(*pud_k))
+                       goto no_context;
+               
+               pmd = pmd_offset(pud, address);
+               pmd_k = pmd_offset(pud_k, address);
+               if (!pmd_present(*pmd_k))
+                       goto no_context;
+               set_pmd(pmd, *pmd_k);
+
+               pte_k = pte_offset_kernel(pmd_k, address);
+               if (!pte_present(*pte_k))
+                       goto no_context;
+               return;
+       }
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/highmem.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/highmem.c
@@ -0,0 +1,94 @@
+#include <linux/highmem.h>
+
+/*
+ * Return a kernel virtual address for @page.  Pages that are not highmem
+ * use page_address(); highmem pages go through kmap_high().
+ * Calls might_sleep() first.
+ */
+void *kmap(struct page *page)
+{
+       might_sleep();
+       return PageHighMem(page) ? kmap_high(page) : page_address(page);
+}
+
+/*
+ * Counterpart of kmap().  It is a BUG to call this from interrupt context.
+ * Only highmem pages need any work (kunmap_high()).
+ */
+void kunmap(struct page *page)
+{
+       BUG_ON(in_interrupt());
+
+       if (PageHighMem(page))
+               kunmap_high(page);
+}
+
+/*
+ * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
+ * no global lock is needed and because the kmap code must perform a global TLB
+ * invalidation when the kmap pool wraps.
+ *
+ * However when holding an atomic kmap it is not legal to sleep, so atomic
+ * kmaps are appropriate for short, tight code paths only.
+ */
+/*
+ * Map @page with protection @prot into this CPU's fixmap slot for @type and
+ * return the slot's virtual address.  Pages that are not highmem are
+ * returned via page_address() without touching any pte.  The preempt count
+ * is raised in both cases.
+ */
+void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
+{
+       enum fixed_addresses slot;
+       unsigned long fix_va;
+       pte_t *ptep;
+
+       /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
+       inc_preempt_count();
+       if (!PageHighMem(page))
+               return page_address(page);
+
+       /* One block of KM_TYPE_NR slots per CPU, indexed by type. */
+       slot = type + KM_TYPE_NR*smp_processor_id();
+       fix_va = __fix_to_virt(FIX_KMAP_BEGIN + slot);
+       ptep = kmap_pte - slot;
+#ifdef CONFIG_DEBUG_HIGHMEM
+       /* The slot must be empty; a live pte means a missing kunmap_atomic. */
+       BUG_ON(!pte_none(*ptep));
+#endif
+       set_pte(ptep, mk_pte(page, prot));
+       __flush_tlb_one(fix_va);
+
+       return (void *)fix_va;
+}
+
+/* Atomic kmap using the default protection, kmap_prot. */
+void *kmap_atomic(struct page *page, enum km_type type)
+{
+       return __kmap_atomic(page, type, kmap_prot);
+}
+
+/*
+ * Release an atomic kmap.  Without CONFIG_DEBUG_HIGHMEM this only drops the
+ * preempt count and checks for a pending reschedule.  With it, addresses at
+ * or above FIXADDR_START are verified against the expected slot for @type
+ * on this CPU, and the slot's pte is cleared and flushed.
+ */
+void kunmap_atomic(void *kvaddr, enum km_type type)
+{
+#ifdef CONFIG_DEBUG_HIGHMEM
+       unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
+       enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
+
+       /* FIXME: addresses below FIXADDR_START are not checked at all */
+       if (vaddr >= FIXADDR_START) {
+               BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx));
+
+               /*
+                * force other mappings to Oops if they'll try to access
+                * this pte without first remap it
+                */
+               pte_clear(&init_mm, vaddr, kmap_pte-idx);
+               __flush_tlb_one(vaddr);
+       }
+#endif
+
+       dec_preempt_count();
+       preempt_check_resched();
+}
+
+/*
+ * Translate an address obtained from kmap_atomic() back to its struct page.
+ * Addresses below FIXADDR_START are handled by virt_to_page(); anything
+ * else is looked up through the kmap pte of the matching fixmap slot.
+ */
+struct page *kmap_atomic_to_page(void *ptr)
+{
+       unsigned long vaddr = (unsigned long)ptr;
+       unsigned long fix_idx;
+
+       if (vaddr < FIXADDR_START)
+               return virt_to_page(ptr);
+
+       fix_idx = virt_to_fix(vaddr);
+       return pte_page(*(kmap_pte - (fix_idx - FIX_KMAP_BEGIN)));
+}
+
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/hypervisor.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/hypervisor.c
@@ -0,0 +1,363 @@
+/******************************************************************************
+ * mm/hypervisor.c
+ * 
+ * Update page tables via the hypervisor.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/balloon.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#define pte_offset_kernel pte_offset
+#define pud_t pgd_t
+#define pud_offset(d, va) d
+#elif defined(CONFIG_X86_64)
+#define pmd_val_ma(v) (v).pmd
+#else
+#ifdef CONFIG_X86_PAE
+# define pmd_val_ma(v) ((v).pmd)
+# define pud_val_ma(v) ((v).pgd.pgd)
+#else
+# define pmd_val_ma(v) ((v).pud.pgd.pgd)
+#endif
+#endif
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+/*
+ * Page-table entry writers.  Each one builds a single mmu_update_t request
+ * (u.ptr = machine address of the entry, u.val = new raw entry value) and
+ * submits it with HYPERVISOR_mmu_update().  A negative return is a BUG.
+ */
+
+/* Write @val into the level-1 (pte) entry at @ptr. */
+void xen_l1_entry_update(pte_t *ptr, pte_t val)
+{
+    mmu_update_t u;
+    u.ptr = virt_to_machine(ptr);
+    u.val = pte_val_ma(val);
+    BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* Write @val into the level-2 (pmd) entry at @ptr. */
+void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
+{
+    mmu_update_t u;
+    u.ptr = virt_to_machine(ptr);
+    u.val = pmd_val_ma(val);
+    BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+#ifdef CONFIG_X86_PAE
+/* PAE only: write @val into the level-3 (pud) entry at @ptr. */
+void xen_l3_entry_update(pud_t *ptr, pud_t val)
+{
+    mmu_update_t u;
+    u.ptr = virt_to_machine(ptr);
+    u.val = pud_val_ma(val);
+    BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+/* x86-64 only: write @val into the level-3 (pud) entry at @ptr. */
+void xen_l3_entry_update(pud_t *ptr, pud_t val)
+{
+    mmu_update_t u;
+    u.ptr = virt_to_machine(ptr);
+    u.val = val.pud;
+    BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* x86-64 only: write @val into the level-4 (pgd) entry at @ptr. */
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
+{
+    mmu_update_t u;
+    u.ptr = virt_to_machine(ptr);
+    u.val = val.pgd;
+    BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_XEN_SHADOW_MODE */
+
+/*
+ * Submit an MMU_MACHPHYS_UPDATE request pairing machine frame @mfn with
+ * frame number @pfn.  A negative hypercall return is a BUG.
+ * NOTE(review): presumably this updates the hypervisor's machine-to-physical
+ * table entry for @mfn -- confirm against the Xen interface headers.
+ */
+void xen_machphys_update(unsigned long mfn, unsigned long pfn)
+{
+    mmu_update_t u;
+    /* The request type is OR-ed into the low bits of the frame's address. */
+    u.ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+    u.val = pfn;
+    BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+/*
+ * Issue MMUEXT_NEW_BASEPTR for the page at @ptr.  @ptr is turned into a
+ * frame number (ptr >> PAGE_SHIFT) and then into a machine frame with
+ * pfn_to_mfn().  A negative hypercall return is a BUG.
+ */
+void xen_pt_switch(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_NEW_BASEPTR;
+    op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/*
+ * Issue MMUEXT_NEW_USER_BASEPTR for the page at @ptr; same address
+ * conversion as xen_pt_switch().  A negative hypercall return is a BUG.
+ */
+void xen_new_user_pt(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_NEW_USER_BASEPTR;
+    op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* Issue MMUEXT_TLB_FLUSH_LOCAL; a negative hypercall return is a BUG. */
+void xen_tlb_flush(void)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_TLB_FLUSH_LOCAL;        /* no other field is filled in */
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/*
+ * Issue MMUEXT_INVLPG_LOCAL for the page containing @ptr (the address is
+ * rounded down with PAGE_MASK).  A negative hypercall return is a BUG.
+ */
+void xen_invlpg(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_INVLPG_LOCAL;
+    op.linear_addr = ptr & PAGE_MASK;
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * SMP variants of the flush helpers.  The *_all forms take no CPU set; the
+ * *_mask forms pass the caller's cpumask bits in op.vcpumask and return
+ * without a hypercall when the mask is empty.  A negative hypercall return
+ * is a BUG in every case.
+ */
+
+/* Issue MMUEXT_TLB_FLUSH_ALL. */
+void xen_tlb_flush_all(void)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_TLB_FLUSH_ALL;
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* Issue MMUEXT_TLB_FLUSH_MULTI for the CPUs set in @mask. */
+void xen_tlb_flush_mask(cpumask_t *mask)
+{
+    struct mmuext_op op;
+    if ( cpus_empty(*mask) )
+        return;
+    op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+    op.vcpumask = mask->bits;
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* Issue MMUEXT_INVLPG_ALL for the page containing @ptr. */
+void xen_invlpg_all(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_INVLPG_ALL;
+    op.linear_addr = ptr & PAGE_MASK;
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* Issue MMUEXT_INVLPG_MULTI for the page containing @ptr on the CPUs in @mask. */
+void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
+{
+    struct mmuext_op op;
+    if ( cpus_empty(*mask) )
+        return;
+    op.cmd = MMUEXT_INVLPG_MULTI;
+    op.vcpumask = mask->bits;
+    op.linear_addr = ptr & PAGE_MASK;
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#endif /* CONFIG_SMP */
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+/*
+ * Pin/unpin helpers.  Each takes the address @ptr of a page-table page,
+ * converts it to a machine frame (pfn_to_mfn(ptr >> PAGE_SHIFT)) and issues
+ * one mmuext_op.  Every unpin variant uses the same MMUEXT_UNPIN_TABLE
+ * command; only the pin command encodes the table level.  A negative
+ * hypercall return is a BUG.
+ */
+
+/* Pin a top-level table: L4 on x86-64, L3 with PAE, otherwise L2. */
+void xen_pgd_pin(unsigned long ptr)
+{
+    struct mmuext_op op;
+#ifdef CONFIG_X86_64
+    op.cmd = MMUEXT_PIN_L4_TABLE;
+#elif defined(CONFIG_X86_PAE)
+    op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+    op.cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+    op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* Unpin a top-level table. */
+void xen_pgd_unpin(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_UNPIN_TABLE;
+    op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* Pin a pte page as an L1 table. */
+void xen_pte_pin(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_PIN_L1_TABLE;
+    op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* Unpin a pte page. */
+void xen_pte_unpin(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_UNPIN_TABLE;
+    op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#ifdef CONFIG_X86_64
+/* x86-64 only: pin a pud page as an L3 table. */
+void xen_pud_pin(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_PIN_L3_TABLE;
+    op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* x86-64 only: unpin a pud page. */
+void xen_pud_unpin(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_UNPIN_TABLE;
+    op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* x86-64 only: pin a pmd page as an L2 table. */
+void xen_pmd_pin(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_PIN_L2_TABLE;
+    op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+/* x86-64 only: unpin a pmd page. */
+void xen_pmd_unpin(unsigned long ptr)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_UNPIN_TABLE;
+    op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_XEN_SHADOW_MODE */
+
+/*
+ * Issue MMUEXT_SET_LDT with @ptr as the linear address and @len as the
+ * entry count (op.nr_ents).  Unlike the invlpg helpers, @ptr is passed
+ * through without masking.  A negative hypercall return is a BUG.
+ */
+void xen_set_ldt(unsigned long ptr, unsigned long len)
+{
+    struct mmuext_op op;
+    op.cmd = MMUEXT_SET_LDT;
+    op.linear_addr = ptr;
+    op.nr_ents = len;
+    BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_contig_memory(unsigned long vstart, unsigned int order)
+{
+    /*
+     * Ensure the 2^order pages starting at vstart are contiguous in machine
+     * memory: give every current frame back, obtain one extent of the
+     * requested order, and map it in place of the old frames.  This code
+     * could be cleaned up some, and the number of hypercalls reduced.
+     */
+    pgd_t         *pgd;
+    pud_t         *pud;
+    pmd_t         *pmd;
+    pte_t         *pte;
+    unsigned long  mfn, i, flags;
+    unsigned long  va, pfn;     /* per-iteration virtual address / frame number */
+
+    scrub_pages(vstart, 1 << order);
+
+    balloon_lock(flags);
+
+    /* 1. Zap current PTEs, giving away the underlying pages. */
+    for (i = 0; i < (1<<order); i++) {
+        va  = vstart + (i*PAGE_SIZE);
+        pfn = (__pa(vstart)>>PAGE_SHIFT)+i;
+
+        /* Read the backing machine frame before the mapping is cleared. */
+        pgd = pgd_offset_k(va);
+        pud = pud_offset(pgd, va);
+        pmd = pmd_offset(pud, va);
+        pte = pte_offset_kernel(pmd, va);
+        mfn = pte_mfn(*pte);
+
+        HYPERVISOR_update_va_mapping(va, __pte_ma(0), 0);
+        phys_to_machine_mapping[pfn] = INVALID_P2M_ENTRY;
+        BUG_ON(HYPERVISOR_dom_mem_op(
+            MEMOP_decrease_reservation, &mfn, 1, 0) != 1);
+    }
+
+    /* 2. Get a new contiguous memory extent; mfn receives its first frame. */
+    BUG_ON(HYPERVISOR_dom_mem_op(
+        MEMOP_increase_reservation, &mfn, 1, order) != 1);
+
+    /* 3. Map the new extent in place of old pages. */
+    for (i = 0; i < (1<<order); i++) {
+        va  = vstart + (i*PAGE_SIZE);
+        pfn = (__pa(vstart)>>PAGE_SHIFT)+i;
+
+        HYPERVISOR_update_va_mapping(
+            va, __pte_ma(((mfn+i)<<PAGE_SHIFT)|__PAGE_KERNEL), 0);
+        xen_machphys_update(mfn+i, pfn);
+        phys_to_machine_mapping[pfn] = mfn+i;
+    }
+
+    flush_tlb_all();
+
+    balloon_unlock(flags);
+}
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+
+/*
+ * Allocate a lowmem virtual region of at least @pages pages that has no
+ * machine memory behind it: the pages are allocated, their mappings are
+ * cleared, their phys-to-machine entries are invalidated, and the frames
+ * are handed to balloon_put_pages().
+ *
+ * Returns the start address of the region, or 0 if either the pages or the
+ * temporary frame array cannot be allocated.
+ */
+unsigned long allocate_empty_lowmem_region(unsigned long pages)
+{
+    pgd_t         *pgd;
+    pud_t         *pud; 
+    pmd_t         *pmd;
+    pte_t         *pte;
+    unsigned long *pfn_array;
+    unsigned long  vstart;
+    unsigned long  i;
+    unsigned int   order = get_order(pages*PAGE_SIZE);
+
+    vstart = __get_free_pages(GFP_KERNEL, order);
+    if ( vstart == 0 )
+        return 0UL;
+
+    scrub_pages(vstart, 1 << order);
+
+    pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
+    if ( pfn_array == NULL )
+    {
+        /*
+         * Nothing has been unmapped yet, so the pages can simply be given
+         * back; report failure the same way as the allocation above
+         * instead of taking the whole kernel down.
+         */
+        free_pages(vstart, order);
+        return 0UL;
+    }
+
+    for ( i = 0; i < (1<<order); i++ )
+    {
+        /* Remember the machine frame, then remove the mapping. */
+        pgd = pgd_offset_k(   (vstart + (i*PAGE_SIZE)));
+        pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE)));
+        pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE)));
+        pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE))); 
+        pfn_array[i] = pte_mfn(*pte);
+#ifdef CONFIG_X86_64
+        xen_l1_entry_update(pte, __pte(0));
+#else
+        HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0);
+#endif
+        phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
+            INVALID_P2M_ENTRY;
+    }
+
+    flush_tlb_all();
+
+    /* Pass the collected frames on, then drop the temporary array. */
+    balloon_put_pages(pfn_array, 1 << order);
+
+    vfree(pfn_array);
+
+    return vstart;
+}
+
+#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/init.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/init.c
@@ -0,0 +1,801 @@
+/*
+ *  linux/arch/i386/mm/init.c
+ *
+ *  Copyright (C) 1995  Linus Torvalds
+ *
+ *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/bootmem.h>
+#include <linux/slab.h>
+#include <linux/proc_fs.h>
+#include <linux/efi.h>
+
+#include <asm/processor.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/dma.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/apic.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+#include <asm-xen/hypervisor.h>
+
+unsigned int __VMALLOC_RESERVE = 128 << 20;    /* 128 << 20 bytes, i.e. 128MB */
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+unsigned long highstart_pfn, highend_pfn;      /* [start, end) pfn range walked by set_highmem_pages_init() */
+
+static int noinline do_test_wp_bit(void);      /* defined after its callers so that it cannot be inlined */
+
+/*
+ * Creates a middle page table and puts a pointer to it in the
+ * given global directory entry. This only returns the gd entry
+ * in non-PAE compilation mode, since the middle layer is folded.
+ *
+ * With CONFIG_X86_PAE the pmd page comes from bootmem and is made
+ * read-only with make_page_readonly() before set_pgd() installs it
+ * (presumably because Xen refuses writable page-table pages - TODO
+ * confirm).  BUG()s if the installed entry does not resolve back to
+ * the newly allocated table.
+ *
+ * Returns the pmd table that @pgd now refers to.
+ */
+static pmd_t * __init one_md_table_init(pgd_t *pgd)
+{
+       pud_t *pud;
+       pmd_t *pmd_table;
+
+#ifdef CONFIG_X86_PAE
+       pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+       make_page_readonly(pmd_table);  /* done before the table is hooked into the pgd */
+       set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+       pud = pud_offset(pgd, 0);
+       if (pmd_table != pmd_offset(pud, 0)) /* sanity check: the walk must land on the new page */
+               BUG();
+#else
+       pud = pud_offset(pgd, 0);
+       pmd_table = pmd_offset(pud, 0); /* nothing is allocated in this configuration */
+#endif
+
+       return pmd_table;
+}
+
+/*
+ * Return the page table that @pmd points at, creating it first if the
+ * pmd entry is still empty.
+ *
+ * A new table is taken from bootmem, made read-only and then installed
+ * with _PAGE_TABLE permissions.  BUG()s if the pmd does not resolve to
+ * the new table afterwards.
+ */
+static pte_t * __init one_page_table_init(pmd_t *pmd)
+{
+       pte_t *new_table;
+
+       /* Already populated: just hand back the existing table. */
+       if (!pmd_none(*pmd))
+               return pte_offset_kernel(pmd, 0);
+
+       new_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+       make_page_readonly(new_table);
+       set_pmd(pmd, __pmd(__pa(new_table) | _PAGE_TABLE));
+       if (new_table != pte_offset_kernel(pmd, 0))
+               BUG();
+
+       return new_table;
+}
+
+/*
+ * This function initializes a certain range of kernel virtual memory 
+ * with new bootmem page tables, everywhere page tables are missing in
+ * the given range.
+ *
+ * @start:    first virtual address of the range
+ * @end:      address at which to stop; the loops test "vaddr != end", so
+ *            0 means "until vaddr wraps past the top of the address
+ *            space" (this is how pagetable_init() calls it)
+ * @pgd_base: page directory to populate
+ *
+ * vaddr advances in PMD_SIZE steps, so the "!= end" test only terminates
+ * on a match if both addresses are PMD_SIZE aligned relative to each
+ * other; otherwise the index limits end the walk.
+ *
+ * XEN: no page table is created for addresses at or above
+ * HYPERVISOR_VIRT_START.
+ */
+
+/*
+ * NOTE: The pagetables are allocated contiguous on the physical space 
+ * so we can cache the place of the first one and move around without 
+ * checking the pgd every time.
+ */
+static void __init page_table_range_init (unsigned long start, unsigned long 
end, pgd_t *pgd_base)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       int pgd_idx, pmd_idx;
+       unsigned long vaddr;
+
+       vaddr = start;
+       pgd_idx = pgd_index(vaddr);
+       pmd_idx = pmd_index(vaddr);
+       pgd = pgd_base + pgd_idx;
+
+       for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
+               if (pgd_none(*pgd)) 
+                       one_md_table_init(pgd);
+               pud = pud_offset(pgd, vaddr);
+               pmd = pmd_offset(pud, vaddr);
+               for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, 
pmd_idx++) {
+                       if (vaddr < HYPERVISOR_VIRT_START && pmd_none(*pmd)) 
+                               one_page_table_init(pmd);
+
+                       vaddr += PMD_SIZE;
+               }
+               pmd_idx = 0;    /* later pgd entries are walked from their first pmd slot */
+       }
+}
+
+/*
+ * Return 1 when @addr lies in [PAGE_OFFSET, __init_end], the range this
+ * file treats as kernel text when choosing executable mappings, else 0.
+ */
+static inline int is_kernel_text(unsigned long addr)
+{
+       return (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) ? 1 : 0;
+}
+
+/*
+ * This maps the physical memory to kernel virtual address space, a total
+ * of max_low_pfn pages, by creating page tables starting from address
+ * PAGE_OFFSET.
+ *
+ * @pgd_base: page directory to populate.
+ *
+ * With PSE each pmd entry becomes one large mapping; it is executable
+ * if either its first or its last byte is kernel text.  Without PSE
+ * individual PTEs are written, and each page is classified as text or
+ * data by its own virtual address.
+ *
+ * XEN: in the 4K-page path only the initial RAM allocation
+ * (xen_start_info.nr_pages, clamped to max_low_pfn) is mapped, and PTEs
+ * that are already present are left untouched.  Nothing is mapped at or
+ * above HYPERVISOR_VIRT_START.
+ */
+static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
+{
+       unsigned long pfn;
+       pgd_t *pgd;
+       pmd_t *pmd;
+       pte_t *pte;
+       int pgd_idx, pmd_idx, pte_ofs;
+
+       unsigned long max_ram_pfn = xen_start_info.nr_pages;
+       if (max_ram_pfn > max_low_pfn)
+               max_ram_pfn = max_low_pfn;
+
+       pgd_idx = pgd_index(PAGE_OFFSET);
+       pgd = pgd_base + pgd_idx;
+       pfn = 0;
+       pmd_idx = pmd_index(PAGE_OFFSET);
+       pte_ofs = pte_index(PAGE_OFFSET);
+
+       for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+#ifdef CONFIG_XEN
+               /*
+                * Native linux hasn't PAE-paging enabled yet at this
+                * point.  When running as xen domain we are in PAE
+                * mode already, thus we can't simply hook an empty
+                * pmd.  That would kill the mappings we are currently
+                * using ...
+                */
+               pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
+#else
+               pmd = one_md_table_init(pgd);
+#endif
+               if (pfn >= max_low_pfn)
+                       continue;
+               pmd += pmd_idx;
+               for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn;
+                    pmd++, pmd_idx++) {
+                       /* Virtual address of the first page covered by this pmd. */
+                       unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
+                       if (address >= HYPERVISOR_VIRT_START)
+                               continue;
+
+                       /*
+                        * Map with big pages if possible, otherwise create
+                        * normal page tables.
+                        */
+                       if (cpu_has_pse) {
+                               /* Last byte covered by this large mapping. */
+                               unsigned int address2 =
+                                       (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE +
+                                       PAGE_OFFSET + PAGE_SIZE-1;
+
+                               if (is_kernel_text(address) ||
+                                   is_kernel_text(address2))
+                                       set_pmd(pmd, pfn_pmd(pfn,
+                                               PAGE_KERNEL_LARGE_EXEC));
+                               else
+                                       set_pmd(pmd, pfn_pmd(pfn,
+                                               PAGE_KERNEL_LARGE));
+                               pfn += PTRS_PER_PTE;
+                       } else {
+                               pte = one_page_table_init(pmd);
+
+                               pte += pte_ofs;
+                               /*
+                                * address advances together with pfn, so that
+                                * is_kernel_text() below judges the page that
+                                * is actually being mapped rather than the
+                                * first page under this pmd.
+                                */
+                               for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn;
+                                    pte++, pfn++, pte_ofs++,
+                                    address += PAGE_SIZE) {
+                                       /* XEN: Only map initial RAM allocation. */
+                                       if ((pfn >= max_ram_pfn) ||
+                                           pte_present(*pte))
+                                               continue;
+                                       if (is_kernel_text(address))
+                                               set_pte(pte, pfn_pte(pfn,
+                                                       PAGE_KERNEL_EXEC));
+                                       else
+                                               set_pte(pte, pfn_pte(pfn,
+                                                       PAGE_KERNEL));
+                               }
+                               pte_ofs = 0;
+                       }
+               }
+               pmd_idx = 0;
+       }
+}
+
+/*
+ * Return 1 for page frames 0x70000..0x7003F inclusive, 0 otherwise.
+ * one_highpage_init() keeps these frames reserved when its bad_ppro
+ * flag is set.
+ */
+static inline int page_kills_ppro(unsigned long pagenr)
+{
+       return (pagenr < 0x70000 || pagenr > 0x7003F) ? 0 : 1;
+}
+
+extern int is_available_memory(efi_memory_desc_t *);
+
+/*
+ * Return 1 if page frame @pagenr lies completely inside a usable RAM
+ * region of the firmware memory map, 0 otherwise.
+ *
+ * The EFI memory map is consulted when efi_enabled is set, the e820
+ * map otherwise.  Region starts are rounded up and region ends rounded
+ * down to page boundaries, so partially covered pages do not count.
+ */
+static inline int page_is_ram(unsigned long pagenr)
+{
+       unsigned long first, limit;
+       int idx;
+
+       if (!efi_enabled) {
+               for (idx = 0; idx < e820.nr_map; idx++) {
+                       /* not usable memory */
+                       if (e820.map[idx].type != E820_RAM)
+                               continue;
+                       /*
+                        *      !!!FIXME!!! Some BIOSen report areas as RAM that
+                        *      are not. Notably the 640->1Mb area. We need a sanity
+                        *      check here.
+                        */
+                       first = (e820.map[idx].addr+PAGE_SIZE-1) >> PAGE_SHIFT;
+                       limit = (e820.map[idx].addr+e820.map[idx].size) >> PAGE_SHIFT;
+                       if (pagenr >= first && pagenr < limit)
+                               return 1;
+               }
+               return 0;
+       }
+
+       for (idx = 0; idx < memmap.nr_map; idx++) {
+               efi_memory_desc_t *desc = &memmap.map[idx];
+
+               if (!is_available_memory(desc))
+                       continue;
+               first = (desc->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
+               limit = (desc->phys_addr +
+                        (desc->num_pages << EFI_PAGE_SHIFT)) >> PAGE_SHIFT;
+
+               if (pagenr >= first && pagenr < limit)
+                       return 1;
+       }
+       return 0;
+}
+
+#ifdef CONFIG_HIGHMEM
+pte_t *kmap_pte;
+pgprot_t kmap_prot;
+
+#define kmap_get_fixmap_pte(vaddr)                                     \
+       pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), vaddr), 
(vaddr)), (vaddr))
+
+/*
+ * Cache the PTE backing the first kmap fixmap slot (FIX_KMAP_BEGIN) in
+ * kmap_pte and set kmap_prot to PAGE_KERNEL.
+ */
+static void __init kmap_init(void)
+{
+       unsigned long first_slot = __fix_to_virt(FIX_KMAP_BEGIN);
+
+       kmap_prot = PAGE_KERNEL;
+
+       /* cache the first kmap pte */
+       kmap_pte = kmap_get_fixmap_pte(first_slot);
+}
+
+/*
+ * Create the page tables covering the LAST_PKMAP pages that start at
+ * PKMAP_BASE in @pgd_base, then record the PTE for PKMAP_BASE (looked
+ * up through swapper_pg_dir) in pkmap_page_table.
+ */
+static void __init permanent_kmaps_init(pgd_t *pgd_base)
+{
+       unsigned long base = PKMAP_BASE;
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       page_table_range_init(base, base + PAGE_SIZE*LAST_PKMAP, pgd_base);
+
+       /* Walk down to the PTE that maps the start of the pkmap area. */
+       pgd = swapper_pg_dir + pgd_index(base);
+       pud = pud_offset(pgd, base);
+       pmd = pmd_offset(pud, base);
+       pkmap_page_table = pte_offset_kernel(pmd, base);
+}
+
+/*
+ * Initialise one highmem page.
+ *
+ * A page that is not RAM, or that is in the range flagged by
+ * page_kills_ppro() while @bad_ppro is set, stays reserved.  Any other
+ * page is un-reserved, marked PG_highmem, given a reference count of
+ * one and counted in totalhigh_pages; it is released to the allocator
+ * only if @pfn lies inside the initial Xen allocation
+ * (xen_start_info.nr_pages).
+ */
+void __init one_highpage_init(struct page *page, int pfn, int bad_ppro)
+{
+       if (!page_is_ram(pfn) || (bad_ppro && page_kills_ppro(pfn))) {
+               SetPageReserved(page);
+               return;
+       }
+
+       ClearPageReserved(page);
+       set_bit(PG_highmem, &page->flags);
+       set_page_count(page, 1);
+       if (pfn < xen_start_info.nr_pages)
+               __free_page(page);
+       totalhigh_pages++;
+}
+
+#ifndef CONFIG_DISCONTIGMEM
+/*
+ * Run one_highpage_init() over every pfn in [highstart_pfn, highend_pfn)
+ * and then add the resulting totalhigh_pages to totalram_pages.
+ */
+static void __init set_highmem_pages_init(int bad_ppro)
+{
+       int pfn = highstart_pfn;
+
+       while (pfn < highend_pfn) {
+               one_highpage_init(pfn_to_page(pfn), pfn, bad_ppro);
+               pfn++;
+       }
+       totalram_pages += totalhigh_pages;
+}
+#else
+extern void set_highmem_pages_init(int);
+#endif /* !CONFIG_DISCONTIGMEM */
+
+#else
+#define kmap_init() do { } while (0)
+#define permanent_kmaps_init(pgd_base) do { } while (0)
+#define set_highmem_pages_init(bad_ppro) do { } while (0)
+#endif /* CONFIG_HIGHMEM */
+
+unsigned long long __PAGE_KERNEL = _PAGE_KERNEL;
+unsigned long long __PAGE_KERNEL_EXEC = _PAGE_KERNEL_EXEC;
+
+#ifndef CONFIG_DISCONTIGMEM
+#define remap_numa_kva() do {} while (0)
+#else
+extern void __init remap_numa_kva(void);
+#endif
+/*
+ * pagetable_init() - move the kernel onto swapper_pg_dir and finish
+ * building the kernel page tables.
+ *
+ * Steps, in order: optionally enable PSE/PGE; copy the boot page
+ * directory at xen_start_info.pt_base into swapper_pg_dir, pin the new
+ * one, switch to it, then unpin and free the old one; map low memory via
+ * kernel_physical_mapping_init(); create (empty) page tables for the
+ * fixmap and pkmap areas.
+ */
+static void __init pagetable_init (void)
+{
+       unsigned long vaddr;
+       pgd_t *pgd_base = swapper_pg_dir;
+       pgd_t *old_pgd = (pgd_t *)xen_start_info.pt_base;       /* page directory the domain was started on */
+
+#ifdef CONFIG_X86_PAE
+       int i;
+       /* Init entries of the first-level page table to the zero page */
+       for (i = 0; i < PTRS_PER_PGD; i++)
+               set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | 
_PAGE_PRESENT));
+#endif
+
+       /* Enable PSE if available */
+       if (cpu_has_pse) {
+               set_in_cr4(X86_CR4_PSE);
+       }
+
+       /* Enable PGE if available */
+       if (cpu_has_pge) {
+               set_in_cr4(X86_CR4_PGE);
+               __PAGE_KERNEL |= _PAGE_GLOBAL;  /* kernel mappings created from now on carry the global bit */
+               __PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
+       }
+
+       /*
+        * Switch to proper mm_init page directory. Initialise from the current
+        * page directory, write-protect the new page directory, then switch to
+        * it. We clean up by write-enabling and then freeing the old page dir.
+        *
+        * Only the first PTRS_PER_PGD_NO_HV entries are copied.  In the PAE
+        * variant the pmd page reached through pgd slot 3 at PAGE_OFFSET is
+        * duplicated as well, so that the old pmd page can be freed along
+        * with the old page directory.
+        */
+#ifndef CONFIG_X86_PAE
+       memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t));
+       make_page_readonly(pgd_base);
+       xen_pgd_pin(__pa(pgd_base));
+       load_cr3(pgd_base);
+       xen_pgd_unpin(__pa(old_pgd));   /* the old directory is unpinned only after the switch */
+       make_page_writable(old_pgd);
+       __flush_tlb_all();
+       free_bootmem(__pa(old_pgd), PAGE_SIZE);
+#else
+       {
+               pud_t *old_pud = pud_offset(old_pgd+3, PAGE_OFFSET);
+               pmd_t *old_pmd = pmd_offset(old_pud, PAGE_OFFSET);
+               pmd_t *new_pmd = alloc_bootmem_low_pages(PAGE_SIZE);
+
+               memcpy(new_pmd,  old_pmd, PAGE_SIZE);
+               memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t));
+               set_pgd(&pgd_base[3], __pgd(__pa(new_pmd) | _PAGE_PRESENT));    /* slot 3 now points at the copy */
+
+               make_page_readonly(new_pmd);
+               make_page_readonly(pgd_base);
+               xen_pgd_pin(__pa(pgd_base));
+               load_cr3(pgd_base);
+               xen_pgd_unpin(__pa(old_pgd));
+               make_page_writable(old_pgd);
+               make_page_writable(old_pmd);
+               __flush_tlb_all();
+
+               free_bootmem(__pa(old_pgd), PAGE_SIZE);
+               free_bootmem(__pa(old_pmd), PAGE_SIZE);
+       }
+#endif
+
+       init_mm.context.pinned = 1;     /* swapper_pg_dir was pinned above */
+       kernel_physical_mapping_init(pgd_base);
+       remap_numa_kva();
+
+       /*
+        * Fixed mappings, only the page table structure has to be
+        * created - mappings will be set by set_fixmap():
+        *
+        * The end address of 0 makes page_table_range_init() run until
+        * vaddr wraps around at the top of the address space.
+        */
+       vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
+       page_table_range_init(vaddr, 0, pgd_base);
+
+       permanent_kmaps_init(pgd_base);
+
+#if 0 /* def CONFIG_X86_PAE */
+       /*
+        * Add low memory identity-mappings - SMP needs it when
+        * starting up on an AP from real-mode. In the non-PAE
+        * case we already have these mappings through head.S.
+        * All user-space mappings are explicitly cleared after
+        * SMP startup.
+        */
+       set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]);
+#endif
+}
+
+#if defined(CONFIG_PM_DISK) || defined(CONFIG_SOFTWARE_SUSPEND)
+/*
+ * Swap suspend & friends need this for resume because things like the
+ * intel-agp driver might have split up a kernel 4MB mapping.
+ *
+ * swsusp_pg_dir is a page-sized, page-aligned buffer placed in the
+ * __nosavedata section.
+ */
+char __nosavedata swsusp_pg_dir[PAGE_SIZE]
+       __attribute__ ((aligned (PAGE_SIZE)));
+
+static inline void save_pg_dir(void)
+{
+       memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);       /* snapshot one page of swapper_pg_dir */
+}
+#else
+static inline void save_pg_dir(void)
+{       /* nothing to save when neither suspend option is configured */
+}
+#endif
+
+/*
+ * Remove the user-space part of swapper_pg_dir (the first
+ * USER_PTRS_PER_PGD entries) after saving a copy with save_pg_dir(),
+ * then flush the TLB on all CPUs.
+ */
+void zap_low_mappings (void)
+{
+       int slot;
+
+       save_pg_dir();
+
+       /*
+        * Zap initial low-memory mappings.
+        *
+        * Note that "pgd_clear()" doesn't do it for
+        * us, because pgd_clear() is a no-op on i386.
+        */
+       for (slot = 0; slot < USER_PTRS_PER_PGD; slot++) {
+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
+               set_pgd(swapper_pg_dir+slot, __pgd(1 + __pa(empty_zero_page)));
+#else
+               set_pgd(swapper_pg_dir+slot, __pgd(0));
+#endif
+       }
+       flush_tlb_all();
+}
+
+static int disable_nx __initdata = 0;
+u64 __supported_pte_mask = ~_PAGE_NX;
+
+/*
+ * noexec = on|off
+ *
+ * Control non executable mappings.
+ *
+ * on      Enable (only honoured when cpu_has_nx)
+ * off     Disable
+ *
+ * Only the leading characters of @str are compared; any other value
+ * leaves the current settings untouched.
+ */
+void __init noexec_setup(const char *str)
+{
+       if (strncmp(str, "on",2) == 0 && cpu_has_nx) {
+               disable_nx = 0;
+               __supported_pte_mask |= _PAGE_NX;
+               return;
+       }
+
+       if (strncmp(str,"off",3) == 0) {
+               __supported_pte_mask &= ~_PAGE_NX;
+               disable_nx = 1;
+       }
+}
+
+int nx_enabled = 0;
+#ifdef CONFIG_X86_PAE
+
+/*
+ * Turn on NX (execute disable) support if possible.
+ *
+ * Requires a PAE-capable CPU whose highest extended CPUID leaf is above
+ * 0x80000001, bit 20 set in the fourth output register of leaf
+ * 0x80000001, and NX not disabled via noexec_setup().  On success
+ * EFER_NX is set in MSR_EFER, nx_enabled becomes 1 and _PAGE_NX is
+ * added to __supported_pte_mask.
+ */
+static void __init set_nx(void)
+{
+       unsigned int regs[4], lo, hi;
+
+       if (!cpu_has_pae || !(cpuid_eax(0x80000000) > 0x80000001))
+               return;
+
+       cpuid(0x80000001, &regs[0], &regs[1], &regs[2], &regs[3]);
+       if (!(regs[3] & (1 << 20)) || disable_nx)
+               return;
+
+       rdmsr(MSR_EFER, lo, hi);
+       lo |= EFER_NX;
+       wrmsr(MSR_EFER, lo, hi);
+       nx_enabled = 1;
+       __supported_pte_mask |= _PAGE_NX;
+}
+
+/*
+ * Enables/disables executability of a given kernel page and
+ * returns the previous setting.
+ *
+ * When NX is not enabled nothing is changed and 1 is returned.
+ * BUG()s if @vaddr has no PTE.  The TLB is flushed after the update.
+ */
+int __init set_kernel_exec(unsigned long vaddr, int enable)
+{
+       pte_t *pte;
+       int was_exec;
+
+       if (!nx_enabled)
+               return 1;
+
+       pte = lookup_address(vaddr);
+       BUG_ON(!pte);
+
+       was_exec = pte_exec_kernel(*pte) ? 1 : 0;
+
+       /* The NX bit lives in the high word of the PTE. */
+       if (!enable)
+               pte->pte_high |= 1 << (_PAGE_BIT_NX - 32);
+       else
+               pte->pte_high &= ~(1 << (_PAGE_BIT_NX - 32));
+       __flush_tlb_all();
+
+       return was_exec;
+}
+
+#endif
+
+/*
+ * paging_init() sets up the page tables - note that the first 8MB are
+ * already mapped by head.S.
+ *
+ * This routine also unmaps the page at virtual kernel address 0, so
+ * that we can trap those pesky NULL-reference errors in the kernel.
+ *
+ * XEN: once the page tables are in place the real shared_info page is
+ * mapped through the FIX_SHARED_INFO fixmap slot and empty_zero_page is
+ * cleared.  With CONFIG_XEN_PHYSDEV_ACCESS the NR_FIX_ISAMAPS ISA
+ * fixmap slots are populated too: a domain with SIF_PRIVILEGED maps
+ * address i * PAGE_SIZE for slot i, any other domain maps
+ * empty_zero_page read-only instead.
+ */
+void __init paging_init(void)
+{
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+       int i;
+#endif
+
+#ifdef CONFIG_X86_PAE
+       set_nx();
+       if (nx_enabled)
+               printk("NX (Execute Disable) protection: active\n");
+#endif
+
+       pagetable_init();
+
+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
+       /*
+        * We will bail out later - printk doesn't work right now so
+        * the user would just see a hanging kernel.
+        * when running as xen domain we are already in PAE mode at
+        * this point.
+        */
+       if (cpu_has_pae)
+               set_in_cr4(X86_CR4_PAE);
+#endif
+       __flush_tlb_all();
+
+       kmap_init();
+
+       /* Switch to the real shared_info page, and clear the dummy page. */
+       set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info);
+       HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+       memset(empty_zero_page, 0, sizeof(empty_zero_page));
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+       /* Setup mapping of lower 1st MB */
+       for (i = 0; i < NR_FIX_ISAMAPS; i++)
+               if (xen_start_info.flags & SIF_PRIVILEGED)
+                       set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+               else    /* unprivileged: back the slot with the zero page, read-only */
+                       __set_fixmap(FIX_ISAMAP_BEGIN - i,
+                                    virt_to_machine(empty_zero_page),
+                                    PAGE_KERNEL_RO);
+#endif
+}
+
+/*
+ * Test if the WP bit works in supervisor mode. It isn't supported on 386's
+ * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
+ * used to involve black magic jumps to work around some nasty CPU bugs,
+ * but fortunately the switch to using exceptions got rid of all that.
+ *
+ * swapper_pg_dir is mapped read-only at the FIX_WP_TEST fixmap slot,
+ * do_test_wp_bit() attempts a write through that mapping, and the
+ * result is stored in boot_cpu_data.wp_works_ok.  The slot is cleared
+ * again afterwards.  With CONFIG_X86_WP_WORKS_OK a failed test panics.
+ */
+
+static void __init test_wp_bit(void)
+{
+       printk("Checking if this processor honours the WP bit even in 
supervisor mode... ");
+
+       /* Any page-aligned address will do, the test is non-destructive */
+       __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY);
+       boot_cpu_data.wp_works_ok = do_test_wp_bit();   /* non-zero when the write faulted */
+       clear_fixmap(FIX_WP_TEST);
+
+       if (!boot_cpu_data.wp_works_ok) {
+               printk("No.\n");
+#ifdef CONFIG_X86_WP_WORKS_OK
+               panic("This kernel doesn't support CPU's with broken WP. 
Recompile it for a 386!");
+#endif
+       } else {
+               printk("Ok.\n");
+       }
+}
+/*
+ * Set num_physpages to the highest page frame in use (highend_pfn with
+ * CONFIG_HIGHMEM, max_low_pfn otherwise) and, unless
+ * CONFIG_DISCONTIGMEM is set, copy it to max_mapnr.
+ */
+static void __init set_max_mapnr_init(void)
+{
+#ifdef CONFIG_HIGHMEM
+       num_physpages = highend_pfn;
+#else
+       num_physpages = max_low_pfn;
+#endif
+#ifndef CONFIG_DISCONTIGMEM
+       max_mapnr = num_physpages;
+#endif
+}
+
+static struct kcore_list kcore_mem, kcore_vmalloc; 
+/*
+ * mem_init() - hand boot memory to the page allocator and report it.
+ *
+ * Frees all bootmem, accounts the low-memory pages that lie beyond the
+ * initial Xen allocation, initialises highmem pages, registers the
+ * /proc/kcore regions, prints the memory summary, runs the WP-bit test
+ * if its result is still unknown and, on non-SMP builds, zaps the low
+ * mappings.
+ */
+void __init mem_init(void)
+{
+       extern int ppro_with_ram_bug(void);
+       int codesize, reservedpages, datasize, initsize;
+       int tmp;
+       int bad_ppro;
+       unsigned long pfn;
+
+#ifndef CONFIG_DISCONTIGMEM
+       if (!mem_map)
+               BUG();
+#endif
+       
+       bad_ppro = ppro_with_ram_bug();
+
+#ifdef CONFIG_HIGHMEM
+       /* check that fixmap and pkmap do not overlap */
+       if (PKMAP_BASE+LAST_PKMAP*PAGE_SIZE >= FIXADDR_START) {
+               printk(KERN_ERR "fixmap and kmap areas overlap - this will 
crash\n");
+               printk(KERN_ERR "pkstart: %lxh pkend: %lxh fixstart %lxh\n",
+                               PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, 
FIXADDR_START);
+               BUG();
+       }
+#endif
+ 
+       set_max_mapnr_init();
+
+#ifdef CONFIG_HIGHMEM
+       high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
+#else
+       high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
+#endif
+       printk("vmalloc area: %lx-%lx, maxmem %lx\n",
+              VMALLOC_START,VMALLOC_END,MAXMEM);
+       BUG_ON(VMALLOC_START > VMALLOC_END);
+       
+       /* this will put all low memory onto the freelists */
+       totalram_pages += free_all_bootmem();
+       /*
+        * XEN: init and count low-mem pages outside initial allocation.
+        * These pages are un-reserved and counted but not freed here.
+        */
+       for (pfn = xen_start_info.nr_pages; pfn < max_low_pfn; pfn++) {
+               ClearPageReserved(&mem_map[pfn]);
+               set_page_count(&mem_map[pfn], 1);
+               totalram_pages++;
+       }
+
+       reservedpages = 0;
+       for (tmp = 0; tmp < max_low_pfn; tmp++)
+               /*
+                * Only count reserved RAM pages
+                */
+               if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
+                       reservedpages++;
+
+       set_highmem_pages_init(bad_ppro);
+
+       codesize =  (unsigned long) &_etext - (unsigned long) &_text;
+       datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
+       initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
+
+       kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT); 
+       kclist_add(&kcore_vmalloc, (void *)VMALLOC_START, 
+                  VMALLOC_END-VMALLOC_START);
+
+       printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, %dk 
reserved, %dk data, %dk init, %ldk highmem)\n",
+               (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
+               num_physpages << (PAGE_SHIFT-10),
+               codesize >> 10,
+               reservedpages << (PAGE_SHIFT-10),
+               datasize >> 10,
+               initsize >> 10,
+               (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10))
+              );
+
+#ifdef CONFIG_X86_PAE
+       if (!cpu_has_pae)
+               panic("cannot execute a PAE-enabled kernel on a PAE-less CPU!");
+#endif
+       if (boot_cpu_data.wp_works_ok < 0)      /* negative means the test has not been run yet */
+               test_wp_bit();
+
+       /*
+        * Subtle. SMP is doing its boot stuff late (because it has to
+        * fork idle threads) - but it also needs low mappings for the
+        * protected-mode entry to work. We zap these entries only after
+        * the WP-bit has been tested.
+        */
+#ifndef CONFIG_SMP
+       zap_low_mappings();
+#endif
+}
+
+kmem_cache_t *pgd_cache;
+kmem_cache_t *pmd_cache;
+/*
+ * Create the slab caches used for page directories: "pmd" (only when
+ * PTRS_PER_PMD > 1) and "pgd".  Panics if either cache cannot be
+ * created.
+ */
+void __init pgtable_cache_init(void)
+{
+       if (PTRS_PER_PMD > 1) {
+               pmd_cache = kmem_cache_create("pmd",
+                                       PTRS_PER_PMD*sizeof(pmd_t),     /* object size */
+                                       PTRS_PER_PMD*sizeof(pmd_t),     /* alignment */
+                                       0,
+                                       pmd_ctor,
+                                       NULL);
+               if (!pmd_cache)
+                       panic("pgtable_cache_init(): cannot create pmd cache");
+       }
+       pgd_cache = kmem_cache_create("pgd",
+#if 0 /* How the heck _this_ works in native linux ??? */
+                               PTRS_PER_PGD*sizeof(pgd_t),
+                               PTRS_PER_PGD*sizeof(pgd_t),
+#else
+                               PAGE_SIZE,      /* this build sizes each pgd object as a whole page ... */
+                               PAGE_SIZE,      /* ... and aligns it to a page as well */
+#endif
+                               0,
+                               pgd_ctor,
+                               pgd_dtor);
+       if (!pgd_cache)
+               panic("pgtable_cache_init(): Cannot create pgd cache");
+}
+
+/*
+ * This function cannot be __init, since exceptions don't work in that
+ * section.  Put this after the callers, so that it cannot be inlined.
+ *
+ * Reads one byte from the FIX_WP_TEST fixmap address and writes the
+ * same byte back.  The write at label 1 has an exception-table entry
+ * redirecting a fault to label 2, which skips the instruction that
+ * clears the flag.
+ *
+ * Returns 1 if the write faulted (the read-only mapping was honoured),
+ * 0 if the write went through.
+ */
+static int noinline do_test_wp_bit(void)
+{
+       char tmp_reg;
+       int flag;
+
+       __asm__ __volatile__(
+               "       movb %0,%1      \n"     /* tmp_reg = byte at the test address */
+               "1:     movb %1,%0      \n"     /* write it back; may fault */
+               "       xorl %2,%2      \n"     /* reached only without a fault: flag = 0 */
+               "2:                     \n"
+               ".section __ex_table,\"a\"\n"
+               "       .align 4        \n"
+               "       .long 1b,2b     \n"     /* a fault at 1 resumes at 2 */
+               ".previous              \n"
+               :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)),
+                "=q" (tmp_reg),
+                "=r" (flag)
+               :"2" (1)        /* flag starts out as 1 */
+               :"memory");
+       
+       return flag;
+}
+
+/*
+ * Release the kernel's init sections back to the page allocator.
+ *
+ * Each page from __init_begin up to __init_end is un-reserved, given a
+ * reference count of one, filled with 0xcc and freed; totalram_pages
+ * is bumped once per page.
+ */
+void free_initmem(void)
+{
+       unsigned long va = (unsigned long)(&__init_begin);
+       unsigned long stop = (unsigned long)(&__init_end);
+
+       while (va < stop) {
+               ClearPageReserved(virt_to_page(va));
+               set_page_count(virt_to_page(va), 1);
+               memset((void *)va, 0xcc, PAGE_SIZE);
+               free_page(va);
+               totalram_pages++;
+               va += PAGE_SIZE;
+       }
+       printk (KERN_INFO "Freeing unused kernel memory: %dk freed\n",
+               (__init_end - __init_begin) >> 10);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+/*
+ * Free the pages of the initrd image occupying [start, end).
+ *
+ * Each page is un-reserved, given a reference count of one, freed and
+ * added to totalram_pages.  The size message is printed only for a
+ * non-empty range.
+ */
+void free_initrd_mem(unsigned long start, unsigned long end)
+{
+       unsigned long va;
+
+       if (start < end)
+               printk (KERN_INFO "Freeing initrd memory: %ldk freed\n",
+                       (end - start) >> 10);
+
+       for (va = start; va < end; va += PAGE_SIZE) {
+               ClearPageReserved(virt_to_page(va));
+               set_page_count(virt_to_page(va), 1);
+               free_page(va);
+               totalram_pages++;
+       }
+}
+#endif
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/ioremap.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/ioremap.c
@@ -0,0 +1,442 @@
+/*
+ * arch/i386/mm/ioremap.c
+ *
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PC's
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <asm/io.h>
+#include <asm/fixmap.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+
+#ifndef CONFIG_XEN_PHYSDEV_ACCESS
+
+/*
+ * Stub used when CONFIG_XEN_PHYSDEV_ACCESS is not set: no mapping is
+ * created and NULL is always returned.  The return type carries __iomem
+ * so that it matches the real definition in the #else branch.
+ */
+void __iomem * __ioremap(unsigned long phys_addr, unsigned long size,
+                unsigned long flags)
+{
+       return NULL;
+}
+
+/*
+ * Stub used when CONFIG_XEN_PHYSDEV_ACCESS is not set: always returns NULL.
+ * The return type carries __iomem so that it matches the real definition
+ * in the #else branch.
+ */
+void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
+{
+       return NULL;
+}
+
+/* Stub used when CONFIG_XEN_PHYSDEV_ACCESS is not set: does nothing. */
+void iounmap(volatile void __iomem *addr)
+{
+}
+
+/* Stub used when CONFIG_XEN_PHYSDEV_ACCESS is not set: always returns NULL. */
+void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
+{
+       return NULL;
+}
+
+/* Stub used when CONFIG_XEN_PHYSDEV_ACCESS is not set: does nothing. */
+void __init bt_iounmap(void *addr, unsigned long size)
+{
+}
+
+#else
+
+/*
+ * Does @address reside within a non-highmem page that is local to this virtual
+ * machine (i.e., not an I/O page, nor a memory page belonging to another VM).
+ * See the comment that accompanies pte_pfn() in pgtable-2level.h to understand
+ * why this works.
+ */
+static inline int is_local_lowmem(unsigned long address)
+{
+       extern unsigned long max_low_pfn;
+       unsigned long machine_frame = address >> PAGE_SHIFT;
+       unsigned long pseudo_frame = mfn_to_pfn(machine_frame);
+
+       /* The translated frame has to fall below max_low_pfn ... */
+       if (pseudo_frame >= max_low_pfn)
+               return 0;
+
+       /* ... and translating it back must yield the frame we started from. */
+       return pfn_to_mfn(pseudo_frame) == machine_frame;
+}
+
+/*
+ * Generic mapping function (not visible outside):
+ */
+
+/*
+ * Remap an arbitrary physical address space into the kernel virtual
+ * address space. Needed when the kernel wants to access high addresses
+ * directly.
+ *
+ * NOTE! We need to allow non-page-aligned mappings too: we will obviously
+ * have to convert them into an offset in a page-aligned mapping, but the
+ * caller shouldn't need to know that small detail.
+ */
+void __iomem * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flags)
+{
+       void __iomem * addr;
+       struct vm_struct * area;
+       unsigned long offset, last_addr;
+       /* Target domain passed to the remap; switched to DOMID_SELF below
+        * when the range turns out to be local lowmem. */
+       domid_t domid = DOMID_IO;
+
+       /* Don't allow wraparound or zero size */
+       last_addr = phys_addr + size - 1;
+       if (!size || last_addr < phys_addr)
+               return NULL;
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       /*
+        * Don't remap the low PCI/ISA area, it's always mapped..
+        */
+       if (phys_addr >= 0x0 && last_addr < 0x100000)
+               return isa_bus_to_virt(phys_addr);
+#endif
+
+       /*
+        * Don't allow anybody to remap normal RAM that we're using..
+        */
+       if (is_local_lowmem(phys_addr)) {
+               char *t_addr, *t_end;
+               struct page *page;
+
+               t_addr = bus_to_virt(phys_addr);
+               t_end = t_addr + (size - 1);
+          
+               /* Every page in the range must be reserved, else refuse. */
+               for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
+                       if(!PageReserved(page))
+                               return NULL;
+
+               domid = DOMID_SELF;
+       }
+
+       /*
+        * Mappings have to be page-aligned
+        */
+       offset = phys_addr & ~PAGE_MASK;
+       phys_addr &= PAGE_MASK;
+       size = PAGE_ALIGN(last_addr+1) - phys_addr;
+
+       /*
+        * Ok, go for it..
+        */
+       /* The caller's flags are stashed in the area flags, shifted left by
+        * 20; iounmap() reads them back with (p->flags >> 20). */
+       area = get_vm_area(size, VM_IOREMAP | (flags << 20));
+       if (!area)
+               return NULL;
+       area->phys_addr = phys_addr;
+       addr = (void __iomem *) area->addr;
+       if (direct_remap_area_pages(&init_mm, (unsigned long) addr, phys_addr,
+                                   size, __pgprot(_PAGE_PRESENT | _PAGE_RW |
+                                                  _PAGE_DIRTY | _PAGE_ACCESSED
+                                                  | flags), domid)) {
+               vunmap((void __force *) addr);
+               return NULL;
+       }
+       /* Re-apply the sub-page offset of the original request. */
+       return (void __iomem *) (offset + (char __iomem *)addr);
+}
+
+
+/**
+ * ioremap_nocache     -   map bus memory into CPU space
+ * @phys_addr: bus address of the memory
+ * @size:      size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address. 
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many 
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ * 
+ * Must be freed with iounmap.
+ */
+
+void __iomem *ioremap_nocache (unsigned long phys_addr, unsigned long size)
+{
+       unsigned long last_addr;
+       /* Create the mapping itself with _PAGE_PCD in the PTE flags. */
+       void __iomem *p = __ioremap(phys_addr, size, _PAGE_PCD);
+       if (!p) 
+               return p; 
+
+       /* Guaranteed to be >= phys_addr, as per __ioremap() */
+       last_addr = phys_addr + size - 1;
+
+       /*
+        * If the end of the range is local lowmem, also switch the
+        * corresponding lowmem pages to PAGE_KERNEL_NOCACHE.
+        */
+       if (is_local_lowmem(last_addr)) { 
+               struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
+               unsigned long npages;
+
+               phys_addr &= PAGE_MASK;
+
+               /* This might overflow and become zero.. */
+               /*
+                * NOTE(review): when last_addr is itself page-aligned,
+                * PAGE_ALIGN() leaves it unchanged, so the page holding the
+                * final byte looks uncounted in npages -- confirm.
+                */
+               last_addr = PAGE_ALIGN(last_addr);
+
+               /* .. but that's ok, because modulo-2**n arithmetic will make
+               * the page-aligned "last - first" come out right.
+               */
+               npages = (last_addr - phys_addr) >> PAGE_SHIFT;
+
+               /* On failure tear the new mapping down and report NULL. */
+               if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { 
+                       iounmap(p); 
+                       p = NULL;
+               }
+               global_flush_tlb();
+       }
+
+       return p;                                       
+}
+
+/*
+ * Undo a mapping created by __ioremap()/ioremap_nocache().
+ * @addr: address previously returned by one of those functions
+ *
+ * Addresses at or below high_memory, and (privileged guests only) addresses
+ * at or above the FIX_ISAMAP_BEGIN fixmap slot, are ignored.
+ */
+void iounmap(volatile void __iomem *addr)
+{
+       struct vm_struct *p;
+       if ((void __force *) addr <= high_memory) 
+               return; 
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
+               return;
+#endif
+       /* Look the area up by its page-aligned base address. */
+       p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr));
+       if (!p) { 
+               printk("__iounmap: bad address %p\n", addr);
+               return;
+       }
+
+       /*
+        * __ioremap() stored the caller's flags in bits 20 and up of the
+        * area flags.  If any were set and the range is local lowmem,
+        * put the lowmem pages back to PAGE_KERNEL.
+        */
+       if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
+               /* p->size includes the guard page, but cpa doesn't like that */
+               change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
+                                (p->size - PAGE_SIZE) >> PAGE_SHIFT,
+                                PAGE_KERNEL);                           
+               global_flush_tlb();
+       } 
+       kfree(p); 
+}
+
+/*
+ * Boot-time ioremap: map [phys_addr, phys_addr + size) through the
+ * FIX_BTMAP fixmap slots.
+ * @phys_addr: start of the range (need not be page-aligned)
+ * @size:      length of the range in bytes
+ *
+ * Returns the virtual address corresponding to @phys_addr, or NULL when
+ * the size is zero, the range wraps, or it needs more than NR_FIX_BTMAPS
+ * pages.
+ */
+void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
+{
+       unsigned long offset, last_addr;
+       unsigned int nrpages;
+       enum fixed_addresses idx;
+
+       /* Don't allow wraparound or zero size */
+       last_addr = phys_addr + size - 1;
+       if (!size || last_addr < phys_addr)
+               return NULL;
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       /*
+        * Don't remap the low PCI/ISA area, it's always mapped..
+        */
+       if (phys_addr >= 0x0 && last_addr < 0x100000)
+               return isa_bus_to_virt(phys_addr);
+#endif
+
+       /*
+        * Mappings have to be page-aligned
+        */
+       offset = phys_addr & ~PAGE_MASK;
+       phys_addr &= PAGE_MASK;
+       /*
+        * Round up from one past the last byte (as __ioremap() does).
+        * Rounding last_addr itself drops the final page whenever the
+        * range ends on the first byte of a page.
+        */
+       size = PAGE_ALIGN(last_addr + 1) - phys_addr;
+
+       /*
+        * Mappings have to fit in the FIX_BTMAP area.
+        */
+       nrpages = size >> PAGE_SHIFT;
+       if (nrpages > NR_FIX_BTMAPS)
+               return NULL;
+
+       /*
+        * Ok, go for it..
+        */
+       /* Consecutive pages use descending fixmap indices. */
+       idx = FIX_BTMAP_BEGIN;
+       while (nrpages > 0) {
+               set_fixmap(idx, phys_addr);
+               phys_addr += PAGE_SIZE;
+               --idx;
+               --nrpages;
+       }
+       return (void*) (offset + fix_to_virt(FIX_BTMAP_BEGIN));
+}
+
+/*
+ * Undo a bt_ioremap() mapping.
+ * @addr: address returned by bt_ioremap()
+ * @size: the size that was passed to bt_ioremap()
+ *
+ * Addresses below the FIX_BTMAP_BEGIN slot, and (privileged guests only)
+ * addresses at or above the FIX_ISAMAP_BEGIN slot, are ignored.
+ */
+void __init bt_iounmap(void *addr, unsigned long size)
+{
+       unsigned long virt_addr;
+       unsigned long offset;
+       unsigned int nrpages;
+       enum fixed_addresses idx;
+
+       virt_addr = (unsigned long)addr;
+       if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
+               return;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
+               return;
+#endif
+       offset = virt_addr & ~PAGE_MASK;
+       /*
+        * Pages spanned by [offset, offset + size).  Rounding
+        * "offset + size - 1" instead yields 0 pages for a one-byte mapping
+        * at offset 0, and is one page short whenever the last byte is the
+        * first byte of a page.
+        */
+       nrpages = PAGE_ALIGN(offset + size) >> PAGE_SHIFT;
+
+       /* Clear the same descending run of slots bt_ioremap() filled. */
+       idx = FIX_BTMAP_BEGIN;
+       while (nrpages > 0) {
+               clear_fixmap(idx);
+               --idx;
+               --nrpages;
+       }
+}
+
+#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
+
+/*
+ * These hacky macros avoid phys->machine translations.
+ *
+ * __direct_pte(x)                 wrap a raw value in a pte_t initializer
+ * __direct_mk_pte(page_nr, prot)  frame number shifted by PAGE_SHIFT, OR'ed
+ *                                 with the protection bits
+ * direct_mk_pte_phys(addr, prot)  same, starting from a byte address
+ */
+#define __direct_pte(x) ((pte_t) { (x) } )
+#define __direct_mk_pte(page_nr,pgprot) \
+  __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
+#define direct_mk_pte_phys(physpage, pgprot) \
+  __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot)
+
+/*
+ * Fill in the ->ptr field of consecutive mmu_update_t entries with the
+ * machine address of each pte covering [address, address + size), clipped
+ * to the end of the current pmd.
+ * @pte:     first pte of the range
+ * @address: virtual address of the range start
+ * @size:    bytes remaining in the overall request
+ * @v:       in/out cursor into the update queue; advanced once per pte
+ *
+ * Only ->ptr is written here; ->val is left untouched.
+ */
+static inline void direct_remap_area_pte(pte_t *pte, 
+                                        unsigned long address, 
+                                        unsigned long size,
+                                        mmu_update_t **v)
+{
+       unsigned long end;
+
+       /* Work with the offset inside this pmd and clip to PMD_SIZE. */
+       address &= ~PMD_MASK;
+       end = address + size;
+       if (end > PMD_SIZE)
+               end = PMD_SIZE;
+       if (address >= end)
+               BUG();
+
+       do {
+               (*v)->ptr = virt_to_machine(pte);
+               (*v)++;
+               address += PAGE_SIZE;
+               pte++;
+       } while (address && (address < end));
+}
+
+/*
+ * Walk the pmd entries covering [address, address + size), clipped to the
+ * end of the current pgd entry, allocating pte pages as needed and letting
+ * direct_remap_area_pte() record the pte pointers in the update queue.
+ * @mm:      address space being modified
+ * @pmd:     first pmd entry of the range
+ * @address: virtual address of the range start
+ * @size:    bytes remaining in the overall request
+ * @v:       in/out cursor into the update queue
+ *
+ * Returns 0 on success or -ENOMEM if a pte page could not be allocated.
+ */
+static inline int direct_remap_area_pmd(struct mm_struct *mm,
+                                       pmd_t *pmd, 
+                                       unsigned long address, 
+                                       unsigned long size,
+                                       mmu_update_t **v)
+{
+       unsigned long end;
+
+       /* Work with the offset inside this pgd entry and clip to PGDIR_SIZE. */
+       address &= ~PGDIR_MASK;
+       end = address + size;
+       if (end > PGDIR_SIZE)
+               end = PGDIR_SIZE;
+       if (address >= end)
+               BUG();
+       do {
+               /* Kernel mappings (init_mm) use the kernel pte allocator. */
+               pte_t *pte = (mm == &init_mm) ? 
+                       pte_alloc_kernel(mm, pmd, address) :
+                       pte_alloc_map(mm, pmd, address);
+               if (!pte)
+                       return -ENOMEM;
+               direct_remap_area_pte(pte, address, end - address, v);
+               /* NOTE(review): pte_unmap() is also applied to the
+                * pte_alloc_kernel() result -- confirm that is harmless
+                * for every configuration this file is built with. */
+               pte_unmap(pte);
+               address = (address + PMD_SIZE) & PMD_MASK;
+               pmd++;
+       } while (address && (address < end));
+       return 0;
+}
+ 
+/*
+ * Fill in the ->ptr fields of the update queue @v for every page in
+ * [address, address + size), allocating intermediate page tables as needed.
+ * @mm:      address space being modified
+ * @address: virtual start address
+ * @size:    length in bytes (must be non-zero and must not wrap)
+ * @v:       update queue; one entry is consumed per page
+ *
+ * Runs under mm->page_table_lock.  Returns 0 on success or -ENOMEM when a
+ * pud, pmd or pte page cannot be allocated; in that case the remaining
+ * entries of @v have not been filled in and must not be used.
+ */
+int __direct_remap_area_pages(struct mm_struct *mm,
+                             unsigned long address, 
+                             unsigned long size, 
+                             mmu_update_t *v)
+{
+       pgd_t * dir;
+       unsigned long end = address + size;
+       int error;
+
+       dir = pgd_offset(mm, address);
+       if (address >= end)
+               BUG();
+       spin_lock(&mm->page_table_lock);
+       do {
+               pud_t *pud;
+               pmd_t *pmd;
+
+               error = -ENOMEM;
+               pud = pud_alloc(mm, dir, address);
+               if (!pud)
+                       break;
+               pmd = pmd_alloc(mm, pud, address);
+               if (!pmd)
+                       break;
+               /* Propagate a pte allocation failure instead of dropping it. */
+               error = direct_remap_area_pmd(mm, pmd, address, end - address, &v);
+               if (error)
+                       break;
+               address = (address + PGDIR_SIZE) & PGDIR_MASK;
+               dir++;
+
+       } while (address && (address < end));
+       spin_unlock(&mm->page_table_lock);
+       return error;
+}
+
+
+/*
+ * Map @size bytes of machine memory starting at @machine_addr into @mm at
+ * virtual address @address, with protection @prot, on behalf of @domid.
+ *
+ * PTE updates are queued in batches of up to MAX_DIRECTMAP_MMU_QUEUE
+ * entries: ->val is filled in here, ->ptr by __direct_remap_area_pages(),
+ * and each batch is submitted with HYPERVISOR_mmu_update().
+ *
+ * Returns 0 on success, the error from __direct_remap_area_pages() if page
+ * tables could not be set up, or -EFAULT if the hypercall fails.
+ */
+int direct_remap_area_pages(struct mm_struct *mm,
+                           unsigned long address, 
+                           unsigned long machine_addr,
+                           unsigned long size, 
+                           pgprot_t prot,
+                           domid_t  domid)
+{
+       /* Same type as size, so the loop comparison cannot overflow. */
+       unsigned long i;
+       int rc;
+       unsigned long start_address;
+#define MAX_DIRECTMAP_MMU_QUEUE 130
+       mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u;
+
+       start_address = address;
+
+       flush_cache_all();
+
+       for (i = 0; i < size; i += PAGE_SIZE) {
+               if ((v - u) == MAX_DIRECTMAP_MMU_QUEUE) {
+                       /* Fill in the PTE pointers. */
+                       rc = __direct_remap_area_pages(mm,
+                                                      start_address, 
+                                                      address-start_address, 
+                                                      u);
+                       /* Never hand unfilled ->ptr entries to Xen. */
+                       if (rc)
+                               return rc;
+ 
+                       if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
+                               return -EFAULT;
+                       v = u;
+                       start_address = address;
+               }
+
+               /*
+                * Fill in the machine address: PTE ptr is done later by
+                * __direct_remap_area_pages(). 
+                */
+               v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot);
+
+               machine_addr += PAGE_SIZE;
+               address += PAGE_SIZE; 
+               v++;
+       }
+
+       /* Flush whatever is left in the final, partial batch. */
+       if (v != u) {
+               /* get the ptep's filled in */
+               rc = __direct_remap_area_pages(mm,
+                                              start_address, 
+                                              address-start_address, 
+                                              u);
+               if (rc)
+                       return rc;
+               if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
+                       return -EFAULT;
+       }
+
+       flush_tlb_all();
+
+       return 0;
+}
+
+EXPORT_SYMBOL(direct_remap_area_pages);
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/Makefile
@@ -0,0 +1,24 @@
+#
+# Makefile for the linux i386-specific parts of the memory manager.
+#
+
+# CONFIG_XENARCH with its double quotes stripped.
+XENARCH        := $(subst ",,$(CONFIG_XENARCH))
+
+CFLAGS += -Iarch/$(XENARCH)/mm
+
+# obj-y:   objects built from sources in this directory.
+# c-obj-*: objects whose .c file is symlinked in from arch/i386/mm by the
+#          rule below.
+obj-y  := init.o pgtable.o fault.o ioremap.o hypervisor.o
+c-obj-y        := extable.o mmap.o pageattr.o
+
+c-obj-$(CONFIG_DISCONTIGMEM)   += discontig.o
+c-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_HIGHMEM) += highmem.o
+c-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
+
+c-link :=
+
+# Create each listed source as a symlink to the file of the same name in
+# arch/i386/mm.
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+       @ln -fsn $(srctree)/arch/i386/mm/$(notdir $@) $@
+
+obj-y  += $(c-obj-y)
+
+# The symlinked sources are removed again on clean.
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/pgtable.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/mm/pgtable.c
@@ -0,0 +1,551 @@
+/*
+ *  linux/arch/i386/mm/pgtable.c
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/smp.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <linux/spinlock.h>
+
+#include <asm/system.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/fixmap.h>
+#include <asm/e820.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+
+#include <asm-xen/foreign_page.h>
+
+/*
+ * Print a memory summary: free areas, free swap, and per-category page
+ * counts gathered by scanning every page spanned by every node.
+ */
+void show_mem(void)
+{
+       int total = 0, reserved = 0;
+       int shared = 0, cached = 0;
+       int highmem = 0;
+       pg_data_t *node;
+       unsigned long idx;
+
+       printk("Mem-info:\n");
+       show_free_areas();
+       printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
+
+       for_each_pgdat(node) {
+               for (idx = 0; idx < node->node_spanned_pages; idx++) {
+                       struct page *pg = &node->node_mem_map[idx];
+
+                       total++;
+                       if (PageHighMem(pg))
+                               highmem++;
+
+                       /* A page is counted in at most one class below. */
+                       if (PageReserved(pg)) {
+                               reserved++;
+                               continue;
+                       }
+                       if (PageSwapCache(pg)) {
+                               cached++;
+                               continue;
+                       }
+                       if (page_count(pg))
+                               shared += page_count(pg) - 1;
+               }
+       }
+
+       printk("%d pages of RAM\n", total);
+       printk("%d pages of HIGHMEM\n",highmem);
+       printk("%d reserved pages\n",reserved);
+       printk("%d pages shared\n",shared);
+       printk("%d pages swap cached\n",cached);
+}
+
+/*
+ * Associate a virtual page frame with a given physical page frame 
+ * and protection flags for that frame.
+ */ 
+static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       /*
+        * Walk swapper_pg_dir down to the pte for vaddr.  Every level must
+        * already be populated; nothing is allocated here.
+        */
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               BUG();
+               return;
+       }
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               BUG();
+               return;
+       }
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               BUG();
+               return;
+       }
+       pte = pte_offset_kernel(pmd, vaddr);
+       /* <pfn,flags> stored as-is, to permit clearing entries */
+       set_pte(pte, pfn_pte(pfn, flags));
+
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+/*
+ * Same page-table walk as set_pte_pfn(), but the entry is built with
+ * pfn_pte_ma() instead of pfn_pte().
+ * NOTE(review): presumably @pfn is a machine frame number here and no
+ * phys->machine translation is applied -- confirm against pfn_pte_ma().
+ */ 
+static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
+                          pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       /* Every level must already be populated; nothing is allocated. */
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               BUG();
+               return;
+       }
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               BUG();
+               return;
+       }
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               BUG();
+               return;
+       }
+       pte = pte_offset_kernel(pmd, vaddr);
+       /* <pfn,flags> stored as-is, to permit clearing entries */
+       set_pte(pte, pfn_pte_ma(pfn, flags));
+
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+/*
+ * Associate a large virtual page frame with a given physical page frame 
+ * and protection flags for that frame. pfn is for the base of the page,
+ * vaddr is what the page gets mapped to - both must be properly aligned. 
+ * The pmd must already be instantiated. Assumes PAE mode.
+ */ 
+void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+
+       /*
+        * Each failed precondition below logs a message and returns without
+        * changing anything; the BUG() calls are commented out.
+        */
+       if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
+               printk ("set_pmd_pfn: vaddr misaligned\n");
+               return; /* BUG(); */
+       }
+       if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
+               printk ("set_pmd_pfn: pfn misaligned\n");
+               return; /* BUG(); */
+       }
+       pgd = swapper_pg_dir + pgd_index(vaddr);
+       if (pgd_none(*pgd)) {
+               printk ("set_pmd_pfn: pgd_none\n");
+               return; /* BUG(); */
+       }
+       pud = pud_offset(pgd, vaddr);
+       pmd = pmd_offset(pud, vaddr);
+       set_pmd(pmd, pfn_pmd(pfn, flags));
+       /*
+        * It's enough to flush this one mapping.
+        * (PGE mappings get flushed as well)
+        */
+       __flush_tlb_one(vaddr);
+}
+
+/*
+ * Install a mapping for fixmap slot @idx.
+ * @idx:   fixmap index; must be below __end_of_fixed_addresses
+ * @phys:  address to map (only the frame number, phys >> PAGE_SHIFT, is used)
+ * @flags: protection bits for the pte
+ *
+ * FIX_WP_TEST, FIX_VSYSCALL and (if configured) FIX_F00F_IDT go through
+ * set_pte_pfn(); every other slot goes through set_pte_pfn_ma().
+ */
+void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
+{
+       unsigned long address = __fix_to_virt(idx);
+
+       if (idx >= __end_of_fixed_addresses) {
+               BUG();
+               return;
+       }
+       switch (idx) {
+       case FIX_WP_TEST:
+       case FIX_VSYSCALL:
+#ifdef CONFIG_X86_F00F_BUG
+       case FIX_F00F_IDT:
+#endif
+               set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+               break;
+       default:
+               set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
+               break;
+       }
+}
+
+/*
+ * Allocate one zeroed page for a kernel pte table and make it read-only
+ * via make_page_readonly().  Returns NULL if the allocation fails.
+ */
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+       pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+       if (pte)
+               make_page_readonly(pte);
+       return pte;
+}
+
+/*
+ * Allocate one zeroed page for a user pte table.  Returns the struct page,
+ * or NULL if the allocation fails.
+ *
+ * Without CONFIG_HIGHPTE the page is marked foreign with _pte_free() as its
+ * handler and its count is set to 1; with CONFIG_HIGHPTE neither is done.
+ */
+struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+       struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+       pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+       pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+       if (pte) {
+               SetPageForeign(pte, _pte_free);
+               set_page_count(pte, 1);
+       }
+#endif
+
+       return pte;
+}
+
+/*
+ * Release a pte page that pte_alloc_one() registered as foreign.
+ * @pte: the page being released
+ */
+void _pte_free(struct page *pte)
+{
+       unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
+
+       /* If the lowmem mapping is still read-only, make it writable again. */
+       if (!pte_write(*virt_to_ptep(va)))
+               HYPERVISOR_update_va_mapping(
+                       va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0);
+
+       /* Drop the foreign marking and reset the count before freeing. */
+       ClearPageForeign(pte);
+       set_page_count(pte, 1);
+
+       __free_page(pte);
+}
+
+/* Constructor for pmd objects: zero all PTRS_PER_PMD entries. */
+void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
+{
+       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * The locking scheme was chosen on the basis of manfred's
+ * recommendations and having no core impact whatsoever.
+ * -- wli
+ */
+DEFINE_SPINLOCK(pgd_lock);
+struct page *pgd_list;
+
+/*
+ * Push the page backing @pgd onto the front of pgd_list.
+ *
+ * The list is threaded through struct page: ->index holds the next page,
+ * ->private holds the address of the pointer that points at this page
+ * (either &pgd_list or the previous page's ->index).
+ */
+static inline void pgd_list_add(pgd_t *pgd)
+{
+       struct page *page = virt_to_page(pgd);
+       page->index = (unsigned long)pgd_list;
+       if (pgd_list)
+               pgd_list->private = (unsigned long)&page->index;
+       pgd_list = page;
+       page->private = (unsigned long)&pgd_list;
+}
+
+/*
+ * Unlink the page backing @pgd from pgd_list: point the predecessor's link
+ * (stored in ->private) at the successor (stored in ->index) and fix up the
+ * successor's back-pointer.
+ */
+static inline void pgd_list_del(pgd_t *pgd)
+{
+       struct page *next, **pprev, *page = virt_to_page(pgd);
+       next = (struct page *)page->index;
+       pprev = (struct page **)page->private;
+       *pprev = next;
+       if (next)
+               next->private = (unsigned long)pprev;
+}
+
+/*
+ * Constructor for pgd objects: copy the kernel part of swapper_pg_dir into
+ * the new pgd.  When the kernel pmd is not shared, the pgd is also added to
+ * pgd_list under pgd_lock and its user entries are zeroed.
+ */
+void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+       unsigned long flags;
+
+       /* The lock is only taken (and later released) in the unshared case. */
+       if (!HAVE_SHARED_KERNEL_PMD)
+               spin_lock_irqsave(&pgd_lock, flags);
+
+       memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
+                       swapper_pg_dir + USER_PTRS_PER_PGD,
+                       (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
+
+       /* Shared kernel pmd: no lock was taken, nothing more to do. */
+       if (HAVE_SHARED_KERNEL_PMD)
+               return;
+
+       pgd_list_add(pgd);
+       spin_unlock_irqrestore(&pgd_lock, flags);
+       memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+}
+
+/*
+ * Destructor for pgd objects: when the kernel pmd is not shared, take the
+ * pgd off pgd_list under pgd_lock.  Otherwise there is nothing to do.
+ */
+void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+       if (!HAVE_SHARED_KERNEL_PMD) {
+               /* can be called from interrupt context */
+               unsigned long irq_state;
+
+               spin_lock_irqsave(&pgd_lock, irq_state);
+               pgd_list_del(pgd);
+               spin_unlock_irqrestore(&pgd_lock, irq_state);
+       }
+}
+
+/*
+ * Allocate a pgd from pgd_cache.  When PTRS_PER_PMD > 1, additionally
+ * allocate one pmd per user pgd entry and, if the kernel pmd is not shared,
+ * a private read-only copy of the kernel pmd.
+ *
+ * Returns the pgd, or NULL if any allocation fails.
+ */
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+       int i = 0;
+       pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
+
+       /* Folded pmd level (or allocation failure): nothing more to set up. */
+       if (PTRS_PER_PMD == 1 || !pgd)
+               return pgd;
+
+       if (!HAVE_SHARED_KERNEL_PMD) {
+               /* alloc and copy kernel pmd */
+               unsigned long flags;
+               pgd_t *copy_pgd = pgd_offset_k(PAGE_OFFSET);
+               pud_t *copy_pud = pud_offset(copy_pgd, PAGE_OFFSET);
+               pmd_t *copy_pmd = pmd_offset(copy_pud, PAGE_OFFSET);
+               pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+               if (0 == pmd)
+                       goto out_oom;
+
+               /* The copy is taken under pgd_lock. */
+               spin_lock_irqsave(&pgd_lock, flags);
+               memcpy(pmd, copy_pmd, PAGE_SIZE);
+               spin_unlock_irqrestore(&pgd_lock, flags);
+               make_page_readonly(pmd);
+               /* "1 +" sets the low bit of the entry (presumably
+                * _PAGE_PRESENT -- confirm). */
+               set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
+       }
+
+       /* alloc user pmds */
+       for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+               pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+               if (!pmd)
+                       goto out_oom;
+               set_pgd(&pgd[i], __pgd(1 + __pa(pmd)));
+       }
+       return pgd;
+
+out_oom:
+       /*
+        * Free the user pmds installed so far (entries i-1 down to 0).
+        * NOTE(review): a private kernel pmd allocated above is not freed
+        * (nor made writable again) on this path -- looks like a leak when
+        * a user pmd allocation fails; confirm.
+        */
+       for (i--; i >= 0; i--)
+               kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+       kmem_cache_free(pgd_cache, pgd);
+       return NULL;
+}
+
+/*
+ * Free a pgd obtained from pgd_alloc().
+ *
+ * If the pgd's mapping is read-only it is unpinned and made writable
+ * first.  When PTRS_PER_PMD > 1 the per-entry user pmds are freed, as is
+ * the private kernel pmd when the kernel pmd is not shared.
+ */
+void pgd_free(pgd_t *pgd)
+{
+       int i;
+       pte_t *ptep = virt_to_ptep(pgd);
+
+       if (!pte_write(*ptep)) {
+               xen_pgd_unpin(__pa(pgd));
+               HYPERVISOR_update_va_mapping(
+                       (unsigned long)pgd,
+                       pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
+                       0);
+       }
+
+       /* in the PAE case user pgd entries are overwritten before usage */
+       if (PTRS_PER_PMD > 1) {
+               for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+                       /* "-1" strips the low bit set by pgd_alloc(). */
+                       pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
+                       kmem_cache_free(pmd_cache, pmd);
+               }
+               if (!HAVE_SHARED_KERNEL_PMD) {
+                       pmd_t *pmd = (void *)__va(pgd_val(pgd[USER_PTRS_PER_PGD])-1);
+                       /* Zeroed before it goes back to pmd_cache, matching
+                        * what pmd_ctor() leaves behind. */
+                       make_page_writable(pmd);
+                       memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+                       kmem_cache_free(pmd_cache, pmd);
+               }
+       }
+       /* in the non-PAE case, free_pgtables() clears user pgd entries */
+       kmem_cache_free(pgd_cache, pgd);
+}
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+/* Write-protect the pte that maps @va (looked up with virt_to_ptep()). */
+void make_lowmem_page_readonly(void *va)
+{
+       pte_t *pte = virt_to_ptep(va);
+       set_pte(pte, pte_wrprotect(*pte));
+}
+
+/* Make the pte that maps @va writable (looked up with virt_to_ptep()). */
+void make_lowmem_page_writable(void *va)
+{
+       pte_t *pte = virt_to_ptep(va);
+       set_pte(pte, pte_mkwrite(*pte));
+}
+
+/*
+ * Write-protect the mapping of @va.  If @va lies at or above high_memory,
+ * the page's lowmem alias is write-protected too, unless (with
+ * CONFIG_HIGHMEM) the page is at or above highstart_pfn.
+ */
+void make_page_readonly(void *va)
+{
+       pte_t *pte = virt_to_ptep(va);
+       set_pte(pte, pte_wrprotect(*pte));
+       if ( (unsigned long)va >= (unsigned long)high_memory )
+       {
+               unsigned long phys;
+               /* Recover the physical address from the pte contents. */
+               phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK);
+#ifdef CONFIG_HIGHMEM
+               if ( (phys >> PAGE_SHIFT) < highstart_pfn )
+#endif
+                       make_lowmem_page_readonly(phys_to_virt(phys));
+       }
+}
+
+/*
+ * Make the mapping of @va writable.  If @va lies at or above high_memory,
+ * the page's lowmem alias is made writable too, unless (with
+ * CONFIG_HIGHMEM) the page is at or above highstart_pfn.
+ */
+void make_page_writable(void *va)
+{
+       pte_t *pte = virt_to_ptep(va);
+       set_pte(pte, pte_mkwrite(*pte));
+       if ( (unsigned long)va >= (unsigned long)high_memory )
+       {
+               unsigned long phys;
+               /* Recover the physical address from the pte contents. */
+               phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK);
+#ifdef CONFIG_HIGHMEM
+               if ( (phys >> PAGE_SHIFT) < highstart_pfn )
+#endif
+                       make_lowmem_page_writable(phys_to_virt(phys));
+       }
+}
+
+/* Apply make_page_readonly() to @nr consecutive pages starting at @va. */
+void make_pages_readonly(void *va, unsigned int nr)
+{
+       unsigned long page_va = (unsigned long)va;
+       unsigned int left;
+
+       for (left = nr; left != 0; left--, page_va += PAGE_SIZE)
+               make_page_readonly((void *)page_va);
+}
+
+/* Apply make_page_writable() to @nr consecutive pages starting at @va. */
+void make_pages_writable(void *va, unsigned int nr)
+{
+       unsigned long page_va = (unsigned long)va;
+       unsigned int left;
+
+       for (left = nr; left != 0; left--, page_va += PAGE_SIZE)
+               make_page_writable((void *)page_va);
+}
+#endif /* CONFIG_XEN_SHADOW_MODE */
+
+LIST_HEAD(mm_unpinned);
+DEFINE_SPINLOCK(mm_unpinned_lock);
+
+/*
+ * Re-map the lowmem page containing @pt with protection @flags using
+ * HYPERVISOR_update_va_mapping().  Highmem pages are skipped.
+ */
+static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
+{
+       struct page *page = virt_to_page(pt);
+       unsigned long pfn = page_to_pfn(page);
+
+       if (PageHighMem(page))
+               return;
+       HYPERVISOR_update_va_mapping(
+               (unsigned long)__va(pfn << PAGE_SHIFT),
+               pfn_pte(pfn, flags), 0);
+}
+
+/*
+ * Visit every page-table page reachable from the user part of @mm's pgd
+ * (entries 0 .. USER_PTRS_PER_PGD-1) and apply mm_walk_set_prot() with
+ * @flags to it.  pud and pmd pages are only touched when that level is not
+ * folded; the pgd page itself is not touched here.
+ */
+static void mm_walk(struct mm_struct *mm, pgprot_t flags)
+{
+       pgd_t       *pgd;
+       pud_t       *pud;
+       pmd_t       *pmd;
+       pte_t       *pte;
+       int          g,u,m;
+
+       pgd = mm->pgd;
+       for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
+               if (pgd_none(*pgd))
+                       continue;
+               pud = pud_offset(pgd, 0);
+               if (PTRS_PER_PUD > 1) /* not folded */
+                       mm_walk_set_prot(pud,flags);
+               for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+                       if (pud_none(*pud))
+                               continue;
+                       pmd = pmd_offset(pud, 0);
+                       if (PTRS_PER_PMD > 1) /* not folded */
+                               mm_walk_set_prot(pmd,flags);
+                       for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+                               if (pmd_none(*pmd))
+                                       continue;
+                               /* One call per pte page, not per pte. */
+                               pte = pte_offset_kernel(pmd,0);
+                               mm_walk_set_prot(pte,flags);
+                       }
+               }
+       }
+}
+
+/*
+ * Pin @mm's page tables, under mm->page_table_lock:
+ *   1. remap all user page-table pages read-only (mm_walk),
+ *   2. remap the pgd page read-only, with a TLB flush,
+ *   3. pin the pgd with xen_pgd_pin(),
+ *   4. mark the context pinned and take it off the mm_unpinned list.
+ */
+void mm_pin(struct mm_struct *mm)
+{
+    spin_lock(&mm->page_table_lock);
+
+    mm_walk(mm, PAGE_KERNEL_RO);
+    HYPERVISOR_update_va_mapping(
+        (unsigned long)mm->pgd,
+        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
+        UVMF_TLB_FLUSH);
+    xen_pgd_pin(__pa(mm->pgd));
+    mm->context.pinned = 1;
+    /* The mm_unpinned list has its own lock, nested inside. */
+    spin_lock(&mm_unpinned_lock);
+    list_del(&mm->context.unpinned);
+    spin_unlock(&mm_unpinned_lock);
+
+    spin_unlock(&mm->page_table_lock);
+}
+
+/*
+ * Reverse of mm_pin(), under mm->page_table_lock:
+ *   1. unpin the pgd with xen_pgd_unpin(),
+ *   2. remap the pgd page writable,
+ *   3. remap all user page-table pages writable (mm_walk),
+ *   4. flush the TLB,
+ *   5. mark the context unpinned and put it on the mm_unpinned list.
+ */
+void mm_unpin(struct mm_struct *mm)
+{
+    spin_lock(&mm->page_table_lock);
+
+    xen_pgd_unpin(__pa(mm->pgd));
+    HYPERVISOR_update_va_mapping(
+        (unsigned long)mm->pgd,
+        pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0);
+    mm_walk(mm, PAGE_KERNEL);
+    xen_tlb_flush();
+    mm->context.pinned = 0;
+    /* The mm_unpinned list has its own lock, nested inside. */
+    spin_lock(&mm_unpinned_lock);
+    list_add(&mm->context.unpinned, &mm_unpinned);
+    spin_unlock(&mm_unpinned_lock);
+
+    spin_unlock(&mm->page_table_lock);
+}
+
+/*
+ * Pin every mm currently on the mm_unpinned list.  mm_pin() unlinks the mm
+ * it is given, so the list shrinks by one entry per iteration.
+ */
+void mm_pin_all(void)
+{
+    struct mm_struct *mm;
+
+    for (;;) {
+        if (list_empty(&mm_unpinned))
+            break;
+        mm = list_entry(mm_unpinned.next, struct mm_struct,
+                        context.unpinned);
+        mm_pin(mm);
+    }
+}
+
+/*
+ * Architecture hook for exit_mmap: switch the current task away from @mm
+ * if it is still the active mm, then unpin @mm if it is pinned and only one
+ * reference to it remains.
+ */
+void _arch_exit_mmap(struct mm_struct *mm)
+{
+    struct task_struct *tsk = current;
+
+    task_lock(tsk);
+
+    /*
+     * We aggressively remove defunct pgd from cr3. We execute unmap_vmas()
+     * *much* faster this way, as no tlb flushes means bigger wrpt batches.
+     */
+    if ( tsk->active_mm == mm )
+    {
+        /* Move to init_mm, taking a reference on it ... */
+        tsk->active_mm = &init_mm;
+        atomic_inc(&init_mm.mm_count);
+
+        switch_mm(mm, &init_mm, tsk);
+
+        /* ... and drop the one active_mm held on mm; it must not be the
+         * last reference. */
+        atomic_dec(&mm->mm_count);
+        BUG_ON(atomic_read(&mm->mm_count) == 0);
+    }
+
+    task_unlock(tsk);
+
+    if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
+        mm_unpin(mm);
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/pci/irq.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/pci/irq.c
@@ -0,0 +1,1120 @@
+/*
+ *     Low-Level PCI Support for PC -- Routing of Interrupts
+ *
+ *     (c) 1999--2000 Martin Mares <mj@xxxxxx>
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/dmi.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/io_apic.h>
+#include <asm/hw_irq.h>
+#include <linux/acpi.h>
+
+#include "pci.h"
+
+#define PIRQ_SIGNATURE (('$' << 0) + ('P' << 8) + ('I' << 16) + ('R' << 24)) /* "$PIR", first character in the low byte; compared against rt->signature */
+#define PIRQ_VERSION 0x0100 /* only routing tables whose version equals this are accepted */
+
+static int broken_hp_bios_irq9; /* set by fix_broken_hp_bios_irq9(), tested in pcibios_lookup_irq() */
+static int acer_tm360_irqrouting; /* set by fix_acer_tm360_irqrouting(), tested in pcibios_lookup_irq() */
+
+static struct irq_routing_table *pirq_table; /* routing table in use; pcibios_lookup_irq() bails out while this is NULL */
+
+static int pirq_enable_irq(struct pci_dev *dev); /* forward declaration */
+
+/*
+ * Never use: 0, 1, 2 (timer, keyboard, and cascade)
+ * Avoid using: 13, 14 and 15 (FP error and IDE).
+ * Penalize: 3, 4, 6, 7, 12 (known ISA uses: serial, floppy, parallel and mouse)
+ */
+unsigned int pcibios_irq_mask = 0xfff8;
+
+/* Per-IRQ cost used when choosing an IRQ to assign; lower is preferred. */
+static int pirq_penalty[16] = {
+       1000000,        /* IRQ  0 */
+       1000000,        /* IRQ  1 */
+       1000000,        /* IRQ  2 */
+       1000,           /* IRQ  3 */
+       1000,           /* IRQ  4 */
+       0,              /* IRQ  5 */
+       1000,           /* IRQ  6 */
+       1000,           /* IRQ  7 */
+       0,              /* IRQ  8 */
+       0,              /* IRQ  9 */
+       0,              /* IRQ 10 */
+       0,              /* IRQ 11 */
+       1000,           /* IRQ 12 */
+       100000,         /* IRQ 13 */
+       100000,         /* IRQ 14 */
+       100000          /* IRQ 15 */
+};
+
+/*
+ * One interrupt router: a printable name plus the callbacks used to read
+ * and to program the IRQ behind a given link ("pirq") value.
+ */
+struct irq_router {
+       char *name;
+       u16 vendor;
+       u16 device;
+       /* Returns the IRQ currently routed to @pirq (0 is treated as "none"). */
+       int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq);
+       /* Routes @pirq to IRQ @new; a non-zero return is treated as success. */
+       int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new);
+};
+
+/*
+ * Per-vendor probe hook.  probe() fills in @r and returns non-zero when
+ * it recognises @device, 0 otherwise.
+ */
+struct irq_router_handler {
+       u16 vendor;
+       int (*probe)(struct irq_router *r,
+                    struct pci_dev *router,
+                    u16 device);
+};
+
+/* Hook for enabling a device's IRQ; left NULL here. */
+int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL;
+
+/*
+ *  Search 0xf0000 -- 0xfffff, in 16-byte steps, for the PCI IRQ Routing
+ *  Table.  A candidate is accepted when its signature and version match,
+ *  its size is a multiple of 16 that covers at least the fixed header,
+ *  and all of its bytes sum to zero (mod 256).
+ *
+ *  The scan is only compiled in for CONFIG_XEN_PRIVILEGED_GUEST; in every
+ *  other configuration the function returns NULL.
+ */
+static struct irq_routing_table * __init pirq_find_routing_table(void)
+{
+       u8 *addr;
+       struct irq_routing_table *rt;
+       int i;
+       u8 sum;
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+       for (addr = (u8 *) isa_bus_to_virt(0xf0000);
+            addr < (u8 *) isa_bus_to_virt(0x100000);
+            addr += 16) {
+               rt = (struct irq_routing_table *) addr;
+
+               if (rt->signature == PIRQ_SIGNATURE &&
+                   rt->version == PIRQ_VERSION &&
+                   !(rt->size % 16) &&
+                   rt->size >= sizeof(struct irq_routing_table)) {
+                       /* Header looks plausible: verify the checksum. */
+                       sum = 0;
+                       i = 0;
+                       while (i < rt->size) {
+                               sum += addr[i];
+                               i++;
+                       }
+                       if (sum == 0) {
+                               DBG("PCI: Interrupt Routing Table found at 0x%p\n", rt);
+                               return rt;
+                       }
+               }
+       }
+#endif
+
+       return NULL;
+}
+
+/*
+ *  If we have a IRQ routing table, use it to search for peer host
+ *  bridges.  It's a gross hack, but since there are no other known
+ *  ways how to get a list of buses, we have to go this way.
+ *
+ *  Every bus number mentioned by a routing-table slot is recorded; each
+ *  recorded bus other than 0 that pci_find_bus() does not already know is
+ *  then scanned.
+ */
+static void __init pirq_peer_trick(void)
+{
+       struct irq_routing_table *rt = pirq_table;
+       size_t nr_slots = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
+       u8 busmap[256];
+       struct irq_info *e;
+       int i;
+
+       memset(busmap, 0, sizeof(busmap));
+
+       /* Pass 1: note which buses the table talks about. */
+       for (i = 0; i < nr_slots; i++) {
+               e = &rt->slots[i];
+#ifdef DEBUG
+               {
+                       int j;
+                       DBG("%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot);
+                       for (j = 0; j < 4; j++)
+                               DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap);
+                       DBG("\n");
+               }
+#endif
+               busmap[e->bus] = 1;
+       }
+
+       /* Pass 2: scan the ones that have not been discovered yet. */
+       for (i = 1; i < 256; i++) {
+               if (busmap[i] && !pci_find_bus(0, i) &&
+                   pci_scan_bus(i, &pci_root_ops, NULL))
+                       printk(KERN_INFO "PCI: Discovered primary peer bus %02x [IRQ]\n", i);
+       }
+       pcibios_last_bus = -1;
+}
+
+/*
+ *  Code for querying and setting of IRQ routes on various interrupt routers.
+ */
+
+/*
+ * Mark @irq (0-15) as level-triggered by setting its bit in the byte at
+ * I/O port 0x4d0 (IRQ 0-7) or 0x4d1 (IRQ 8-15).  Each IRQ is handled at
+ * most once; repeat calls and IRQs >= 16 are ignored.
+ */
+void eisa_set_level_irq(unsigned int irq)
+{
+       static u16 eisa_irq_mask;       /* IRQs already processed */
+       unsigned char bit;
+       unsigned char cur;
+       unsigned int ioport;
+
+       if (irq >= 16)
+               return;
+       if (eisa_irq_mask & (1 << irq))
+               return;
+
+       bit = 1 << (irq & 7);
+       ioport = 0x4d0 + (irq >> 3);
+
+       eisa_irq_mask |= (1 << irq);
+       printk("PCI: setting IRQ %u as level-triggered\n", irq);
+
+       cur = inb(ioport);
+       if (cur & bit)
+               return;         /* bit already set, nothing to write */
+
+       DBG(" -> edge");
+       outb(cur | bit, ioport);
+}
+
+/*
+ * Common IRQ routing practice: nybbles in config space,
+ * offset by some magic constant.
+ *
+ * Returns nybble number @nr counting from config byte @offset: even
+ * numbers select the low four bits of a byte, odd numbers the high four.
+ */
+static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr)
+{
+       u8 byte;
+
+       pci_read_config_byte(router, offset + (nr >> 1), &byte);
+       if (nr & 1)
+               return byte >> 4;
+       return byte & 0xf;
+}
+
+/*
+ * Replace nybble number @nr, counting from config byte @offset, with
+ * @val, leaving the other half of that byte untouched.  Even numbers
+ * select the low four bits of a byte, odd numbers the high four.
+ *
+ * Only the low four bits of @val are used.  Without the mask a value
+ * above 15 written to an even nybble would spill into, and corrupt, the
+ * neighbouring odd nybble, which belongs to a different link.
+ */
+static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val)
+{
+       u8 x;
+       unsigned reg = offset + (nr >> 1);
+
+       val &= 0x0f;
+
+       pci_read_config_byte(router, reg, &x);
+       x = (nr & 1) ? ((x & 0x0f) | (val << 4)) : ((x & 0xf0) | val);
+       pci_write_config_byte(router, reg, x);
+}
+
+/*
+ * ALI pirq entries are damn ugly, and completely undocumented.
+ * This has been figured out from pirq tables, and it's not a pretty
+ * picture.
+ *
+ * The nybble for link @pirq (1-based, starting at config offset 0x48)
+ * holds a router-specific code that is translated to an IRQ number here.
+ */
+static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+       /* router code -> IRQ number */
+       static unsigned char irqmap[16] = {
+               0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15
+       };
+       unsigned int code = read_config_nybble(router, 0x48, pirq-1);
+
+       return irqmap[code];
+}
+
+/*
+ * Route ALI link @pirq (1-based, nybbles starting at config offset 0x48)
+ * to @irq.  The IRQ number is first translated to the router's own code;
+ * IRQs with no code (table entry 0) cannot be routed.
+ *
+ * Returns 1 on success, 0 when @irq cannot be routed.  An @irq outside
+ * 0..15 is rejected up front: it used to index past the end of irqmap[].
+ */
+static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+       /* IRQ number -> router code (0 = not routable) */
+       static unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
+       unsigned int val;
+
+       /* The unsigned compare also catches negative values. */
+       if ((unsigned int)irq >= sizeof(irqmap))
+               return 0;
+
+       val = irqmap[irq];
+       if (val) {
+               write_config_nybble(router, 0x48, pirq-1, val);
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * The Intel PIIX4 pirq rules are fairly simple: "pirq" is
+ * just a pointer to the config space.
+ *
+ * The byte found there is the IRQ; values of 16 and above are
+ * reported as 0.
+ */
+static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+       u8 val;
+
+       pci_read_config_byte(router, pirq, &val);
+       if (val >= 16)
+               return 0;
+       return val;
+}
+
+/* Store @irq in the config byte whose offset is the link value @pirq. */
+static int pirq_piix_set(struct pci_dev *router,
+                         struct pci_dev *dev,
+                         int pirq, int irq)
+{
+       pci_write_config_byte(router, pirq, irq);
+
+       return 1;
+}
+
+/*
+ * The VIA pirq rules are nibble-based, like ALI,
+ * but without the ugly irq number munging.
+ * However, PIRQD is in the upper instead of lower 4 bits.
+ *
+ * Nybbles are counted from config offset 0x55; link 4 is redirected to
+ * nybble 5, every other link uses the nybble with its own number.
+ */
+static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+       int nr = pirq;
+
+       if (pirq == 4)
+               nr = 5;
+       return read_config_nybble(router, 0x55, nr);
+}
+
+/*
+ * Write @irq into the VIA nybble for link @pirq (counted from config
+ * offset 0x55; link 4 is redirected to nybble 5).  Always returns 1.
+ */
+static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+       int nr = pirq;
+
+       if (pirq == 4)
+               nr = 5;
+       write_config_nybble(router, 0x55, nr, irq);
+       return 1;
+}
+
+/*
+ * ITE 8330G pirq rules are nibble-based
+ * FIXME: pirqmap may be { 1, 0, 3, 2 },
+ *       2+3 are both mapped to irq 9 on my system
+ *
+ * Link values 1..4 are translated through pirqmap[] to a nybble number
+ * counted from config offset 0x43.  The link value comes straight from
+ * the BIOS routing table, so anything outside 1..4 is rejected (returns
+ * 0, i.e. "no IRQ") instead of indexing past the 4-entry pirqmap[].
+ */
+static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+       static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+       if (pirq < 1 || pirq > 4)
+               return 0;
+       return read_config_nybble(router,0x43, pirqmap[pirq-1]);
+}
+
+/*
+ * Route ITE link @pirq to @irq.  Link values 1..4 are translated through
+ * pirqmap[] to a nybble number counted from config offset 0x43.
+ *
+ * Returns 1 on success.  A link value outside 1..4 returns 0 (nothing is
+ * written) instead of indexing past the 4-entry pirqmap[].
+ */
+static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+       static unsigned char pirqmap[4] = { 1, 0, 2, 3 };
+
+       if (pirq < 1 || pirq > 4)
+               return 0;
+       write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
+       return 1;
+}
+
+/*
+ * OPTI: high four bits are nibble pointer..
+ * I wonder what the low bits do?
+ *
+ * The nybble is counted from config offset 0xb8.
+ */
+static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+       int nr = pirq >> 4;
+
+       return read_config_nybble(router, 0xb8, nr);
+}
+
+/*
+ * Write @irq into the OPTI nybble selected by the high four bits of
+ * @pirq, counted from config offset 0xb8.  Always returns 1.
+ */
+static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+       int nr = pirq >> 4;
+
+       write_config_nybble(router, 0xb8, nr, irq);
+       return 1;
+}
+
+/*
+ * Cyrix: nibble offset 0x5C
+ * 0x5C bits 7:4 is INTB bits 3:0 is INTA
+ * 0x5D bits 7:4 is INTD bits 3:0 is INTC
+ *
+ * The nybble number is (pirq - 1) with its lowest bit flipped.
+ */
+static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+       int nr = (pirq-1)^1;
+
+       return read_config_nybble(router, 0x5C, nr);
+}
+
+/*
+ * Write @irq into the Cyrix nybble for link @pirq: nybble number
+ * (pirq - 1) with its lowest bit flipped, counted from config offset
+ * 0x5C.  Always returns 1.
+ */
+static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+       int nr = (pirq-1)^1;
+
+       write_config_nybble(router, 0x5C, nr, irq);
+       return 1;
+}
+
+/*
+ *     PIRQ routing for SiS 85C503 router used in several SiS chipsets.
+ *     We have to deal with the following issues here:
+ *     - vendors have different ideas about the meaning of link values
+ *     - some onboard devices (integrated in the chipset) have special
+ *       links and are thus routed differently (i.e. not via PCI INTA-INTD)
+ *     - different revision of the router have a different layout for
+ *       the routing registers, particularly for the onchip devices
+ *
+ *     For all routing registers the common thing is we have one byte
+ *     per routeable link which is defined as:
+ *              bit 7      IRQ mapping enabled (0) or disabled (1)
+ *              bits [6:4] reserved (sometimes used for onchip devices)
+ *              bits [3:0] IRQ to map to
+ *                  allowed: 3-7, 9-12, 14-15
+ *                  reserved: 0, 1, 2, 8, 13
+ *
+ *     The config-space registers located at 0x41/0x42/0x43/0x44 are
+ *     always used to route the normal PCI INT A/B/C/D respectively.
+ *     Apparently there are systems implementing PCI routing table using
+ *     link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D.
+ *     We try our best to handle both link mappings.
+ *     
+ *     Currently (2003-05-21) it appears most SiS chipsets follow the
+ *     definition of routing registers from the SiS-5595 southbridge.
+ *     According to the SiS 5595 datasheets the revision id's of the
+ *     router (ISA-bridge) should be 0x01 or 0xb0.
+ *
+ *     Furthermore we've also seen lspci dumps with revision 0x00 and 0xb1.
+ *     Looks like these are used in a number of SiS 5xx/6xx/7xx chipsets.
+ *     They seem to work with the current routing code. However there is
+ *     some concern because of the two USB-OHCI HCs (original SiS 5595
+ *     had only one). YMMV.
+ *
+ *     Onchip routing for router rev-id 0x01/0xb0 and probably 0x00/0xb1:
+ *
+ *     0x61:   IDEIRQ:
+ *             bits [6:5] must be written 01
+ *             bit 4 channel-select primary (0), secondary (1)
+ *
+ *     0x62:   USBIRQ:
+ *             bit 6 OHCI function disabled (0), enabled (1)
+ *     
+ *     0x6a:   ACPI/SCI IRQ: bits 4-6 reserved
+ *
+ *     0x7e:   Data Acq. Module IRQ - bits 4-6 reserved
+ *
+ *     We support USBIRQ (in addition to INTA-INTD) and keep the
+ *     IDE, ACPI and DAQ routing untouched as set by the BIOS.
+ *
+ *     Currently the only reported exception is the new SiS 65x chipset
+ *     which includes the SiS 69x southbridge. Here we have the 85C503
+ *     router revision 0x04 and there are changes in the register layout
+ *     mostly related to the different USB HCs with USB 2.0 support.
+ *
+ *     Onchip routing for router rev-id 0x04 (trial-and-error observation)
+ *
+ *     0x60/0x61/0x62/0x63:    1xEHCI and 3xOHCI (companion) USB-HCs
+ *                             bit 6-4 are probably unused, not like 5595
+ */
+
+#define PIRQ_SIS_IRQ_MASK      0x0f
+#define PIRQ_SIS_IRQ_DISABLE   0x80
+#define PIRQ_SIS_USB_ENABLE    0x40
+
+/*
+ * Read the IRQ routed to SiS link @pirq.  Link values 0x01-0x04 are
+ * mapped onto config registers 0x41-0x44; any other value is used as the
+ * register offset directly.  Returns 0 when the register has the
+ * "disabled" bit set, otherwise the IRQ held in its low four bits.
+ */
+static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+       int reg = (pirq >= 0x01 && pirq <= 0x04) ? pirq + 0x40 : pirq;
+       u8 val;
+
+       pci_read_config_byte(router, reg, &val);
+       if (val & PIRQ_SIS_IRQ_DISABLE)
+               return 0;
+       return val & PIRQ_SIS_IRQ_MASK;
+}
+
+/*
+ * Route SiS link @pirq to @irq.  Link values 0x01-0x04 are mapped onto
+ * config registers 0x41-0x44; any other value is used as the register
+ * offset directly.  The IRQ field and the "disabled" bit are rewritten,
+ * the remaining bits of the register are preserved.  An @irq of 0 sets
+ * the "disabled" bit instead of an IRQ number.  Always returns 1.
+ */
+static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+       int reg = (pirq >= 0x01 && pirq <= 0x04) ? pirq + 0x40 : pirq;
+       u8 val;
+
+       pci_read_config_byte(router, reg, &val);
+       val &= ~(PIRQ_SIS_IRQ_MASK | PIRQ_SIS_IRQ_DISABLE);
+       if (irq)
+               val |= irq;
+       else
+               val |= PIRQ_SIS_IRQ_DISABLE;
+       pci_write_config_byte(router, reg, val);
+       return 1;
+}
+
+
+/*
+ * VLSI: nibble offset 0x74 - educated guess due to routing table and
+ *       config space of VLSI 82C534 PCI-bridge/router (1004:0102)
+ *       Tested on HP OmniBook 800 covering PIRQ 1, 2, 4, 8 for onboard
+ *       devices, PIRQ 3 for non-pci(!) soundchip and (untested) PIRQ 6
+ *       for the busbridge to the docking station.
+ */
+
+/*
+ * Read the IRQ for VLSI link @pirq from the nybbles starting at config
+ * offset 0x74 (link 1 is nybble 0).  Links above 8 are logged and
+ * reported as 0.
+ */
+static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+       if (pirq <= 8)
+               return read_config_nybble(router, 0x74, pirq-1);
+
+       printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+       return 0;
+}
+
+/*
+ * Write @irq into the nybble for VLSI link @pirq (nybbles start at
+ * config offset 0x74, link 1 is nybble 0).  Returns 1 on success; links
+ * above 8 are logged and 0 is returned without writing.
+ */
+static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+       if (pirq <= 8) {
+               write_config_nybble(router, 0x74, pirq-1, irq);
+               return 1;
+       }
+
+       printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq);
+       return 0;
+}
+
+/*
+ * ServerWorks: PCI interrupts mapped to system IRQ lines through Index
+ * and Redirect I/O registers (0x0c00 and 0x0c01).  The Index register
+ * format is (PCIIRQ## | 0x10), e.g.: PCIIRQ10=0x1a.  The Redirect
+ * register is a straight binary coding of desired PIC IRQ (low nibble).
+ *
+ * The 'link' value in the PIRQ table is already in the correct format
+ * for the Index register.  There are some special index values:
+ * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1,
+ * and 0x03 for SMBus.
+ */
+/*
+ * Select link @pirq through the index port (0xc00) and return the low
+ * nybble read back from the redirect port (0xc01).
+ */
+static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+       unsigned char redirect;
+
+       outb_p(pirq, 0xc00);
+       redirect = inb(0xc01);
+       return redirect & 0xf;
+}
+
+/*
+ * Select link @pirq through the index port (0xc00), then write @irq to
+ * the redirect port (0xc01).  Always returns 1.
+ */
+static int pirq_serverworks_set(struct pci_dev *router,
+                                struct pci_dev *dev,
+                                int pirq, int irq)
+{
+       outb_p(pirq, 0xc00);    /* index */
+       outb_p(irq, 0xc01);     /* redirect */
+
+       return 1;
+}
+
+/* Support for AMD756 PCI IRQ Routing
+ * Jhon H. Caicedo <jhcaiced@xxxxxxxxxxx>
+ * Jun/21/2001 0.2.0 Release, fixed to use "nybble" functions... (jhcaiced)
+ * Jun/19/2001 Alpha Release 0.1.0 (jhcaiced)
+ * The AMD756 pirq rules are nibble-based
+ * offset 0x56 0-3 PIRQA  4-7  PIRQB
+ * offset 0x57 0-3 PIRQC  4-7  PIRQD
+ */
+/*
+ * Read the IRQ for AMD756 link @pirq from the nybbles starting at config
+ * offset 0x56 (link 1 is nybble 0).  Links above 4 yield 0.  The result
+ * is logged on every call.
+ */
+static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
+{
+       u8 irq = (pirq <= 4) ? read_config_nybble(router, 0x56, pirq - 1) : 0;
+
+       printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n",
+               dev->vendor, dev->device, pirq, irq);
+       return irq;
+}
+
+/*
+ * Log the request, then write @irq into the nybble for AMD756 link @pirq
+ * (nybbles start at config offset 0x56, link 1 is nybble 0).  Links above
+ * 4 are silently left alone.  Always returns 1.
+ */
+static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+       printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n",
+               dev->vendor, dev->device, pirq, irq);
+
+       if (pirq > 4)
+               return 1;
+
+       write_config_nybble(router, 0x56, pirq - 1, irq);
+       return 1;
+}
+
+#ifdef CONFIG_PCI_BIOS
+
+/*
+ * "set" callback used when routing goes through the PCI BIOS: looks up
+ * the interrupt pin and bridge for @dev and hands them, with @irq, to
+ * pcibios_set_irq_routing().  @router and @pirq are not used.
+ */
+static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
+{
+       struct pci_dev *bridge;
+       int pin;
+
+       pin = pci_get_interrupt_pin(dev, &bridge);
+       return pcibios_set_irq_routing(bridge, pin, irq);
+}
+
+#endif
+
+/*
+ * Probe hook for Intel routers.  Declines (returns 0) whenever a 440GX
+ * host bridge is present or @device is not in the list below; otherwise
+ * installs the PIIX accessors in @r and returns 1.
+ */
+static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+       static struct pci_device_id pirq_440gx[] = {
+               { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) },
+               { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) },
+               { },
+       };
+
+       /* 440GX has a proprietary PIRQ router -- don't use it */
+       if (pci_dev_present(pirq_440gx))
+               return 0;
+
+       switch (device) {
+       case PCI_DEVICE_ID_INTEL_82371FB_0:
+       case PCI_DEVICE_ID_INTEL_82371SB_0:
+       case PCI_DEVICE_ID_INTEL_82371AB_0:
+       case PCI_DEVICE_ID_INTEL_82371MX:
+       case PCI_DEVICE_ID_INTEL_82443MX_0:
+       case PCI_DEVICE_ID_INTEL_82801AA_0:
+       case PCI_DEVICE_ID_INTEL_82801AB_0:
+       case PCI_DEVICE_ID_INTEL_82801BA_0:
+       case PCI_DEVICE_ID_INTEL_82801BA_10:
+       case PCI_DEVICE_ID_INTEL_82801CA_0:
+       case PCI_DEVICE_ID_INTEL_82801CA_12:
+       case PCI_DEVICE_ID_INTEL_82801DB_0:
+       case PCI_DEVICE_ID_INTEL_82801E_0:
+       case PCI_DEVICE_ID_INTEL_82801EB_0:
+       case PCI_DEVICE_ID_INTEL_ESB_1:
+       case PCI_DEVICE_ID_INTEL_ICH6_0:
+       case PCI_DEVICE_ID_INTEL_ICH6_1:
+       case PCI_DEVICE_ID_INTEL_ICH7_0:
+       case PCI_DEVICE_ID_INTEL_ICH7_1:
+       case PCI_DEVICE_ID_INTEL_ICH7_30:
+       case PCI_DEVICE_ID_INTEL_ICH7_31:
+       case PCI_DEVICE_ID_INTEL_ESB2_0:
+               break;
+       default:
+               return 0;
+       }
+
+       r->name = "PIIX/ICH";
+       r->get = pirq_piix_get;
+       r->set = pirq_piix_set;
+       return 1;
+}
+
+/*
+ * Probe hook for VIA routers: installs the VIA accessors in @r and
+ * returns 1 for the device IDs listed below, 0 for anything else.
+ */
+static __init int via_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+       /* FIXME: We should move some of the quirk fixup stuff here */
+       switch (device) {
+       case PCI_DEVICE_ID_VIA_82C586_0:
+       case PCI_DEVICE_ID_VIA_82C596:
+       case PCI_DEVICE_ID_VIA_82C686:
+       case PCI_DEVICE_ID_VIA_8231:
+               /* FIXME: add new ones for 8233/5 */
+               break;
+       default:
+               return 0;
+       }
+
+       r->name = "VIA";
+       r->get = pirq_via_get;
+       r->set = pirq_via_set;
+       return 1;
+}
+
+/*
+ * Probe hook for VLSI routers: only the 82C534 is recognised.  Returns 1
+ * after installing the VLSI accessors in @r, 0 otherwise.
+ */
+static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+       if (device != PCI_DEVICE_ID_VLSI_82C534)
+               return 0;
+
+       r->name = "VLSI 82C534";
+       r->get = pirq_vlsi_get;
+       r->set = pirq_vlsi_set;
+       return 1;
+}
+
+
+/*
+ * Probe hook for ServerWorks routers: OSB4 and CSB5 are recognised.
+ * Returns 1 after installing the ServerWorks accessors in @r, 0 otherwise.
+ */
+static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+       if (device != PCI_DEVICE_ID_SERVERWORKS_OSB4 &&
+           device != PCI_DEVICE_ID_SERVERWORKS_CSB5)
+               return 0;
+
+       r->name = "ServerWorks";
+       r->get = pirq_serverworks_get;
+       r->set = pirq_serverworks_set;
+       return 1;
+}
+
+/*
+ * Probe hook for SiS routers: only the 503 is recognised.  Returns 1
+ * after installing the SiS accessors in @r, 0 otherwise.
+ */
+static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+       switch (device) {
+       case PCI_DEVICE_ID_SI_503:
+               r->name = "SIS";
+               r->get = pirq_sis_get;
+               r->set = pirq_sis_set;
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * Probe hook for Cyrix routers: only the 5520 is recognised.  Returns 1
+ * after installing the Cyrix accessors in @r, 0 otherwise.
+ */
+static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+       if (device != PCI_DEVICE_ID_CYRIX_5520)
+               return 0;
+
+       r->name = "NatSemi";
+       r->get = pirq_cyrix_get;
+       r->set = pirq_cyrix_set;
+       return 1;
+}
+
+/*
+ * Probe hook for OPTI routers: only the 82C700 is recognised.  Returns 1
+ * after installing the OPTI accessors in @r, 0 otherwise.
+ */
+static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+       if (device != PCI_DEVICE_ID_OPTI_82C700)
+               return 0;
+
+       r->name = "OPTI";
+       r->get = pirq_opti_get;
+       r->set = pirq_opti_set;
+       return 1;
+}
+
+/*
+ * Probe hook for ITE routers: only the IT8330G is recognised.  Returns 1
+ * after installing the ITE accessors in @r, 0 otherwise.
+ */
+static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+       if (device != PCI_DEVICE_ID_ITE_IT8330G_0)
+               return 0;
+
+       r->name = "ITE";
+       r->get = pirq_ite_get;
+       r->set = pirq_ite_set;
+       return 1;
+}
+
+/*
+ * Probe hook for ALI routers: M1533 and M1563 are recognised.  Logs a
+ * message, installs the ALI accessors in @r and returns 1 for those;
+ * returns 0 for anything else.
+ */
+static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+       if (device != PCI_DEVICE_ID_AL_M1533 &&
+           device != PCI_DEVICE_ID_AL_M1563)
+               return 0;
+
+       printk("PCI: Using ALI IRQ Router\n");
+       r->name = "ALI";
+       r->get = pirq_ali_get;
+       r->set = pirq_ali_set;
+       return 1;
+}
+
+/*
+ * Probe hook for AMD routers.  The three recognised Viper devices differ
+ * only in the name reported; all of them use the AMD756 accessors.
+ * Returns 1 on a match, 0 otherwise.
+ */
+static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device)
+{
+       if (device == PCI_DEVICE_ID_AMD_VIPER_740B)
+               r->name = "AMD756";
+       else if (device == PCI_DEVICE_ID_AMD_VIPER_7413)
+               r->name = "AMD766";
+       else if (device == PCI_DEVICE_ID_AMD_VIPER_7443)
+               r->name = "AMD768";
+       else
+               return 0;
+
+       r->get = pirq_amd756_get;
+       r->set = pirq_amd756_set;
+       return 1;
+}
+               
+/*
+ * Vendor -> probe hook table walked by pirq_find_router().  The entry
+ * with vendor 0 terminates the walk.
+ */
+static __initdata struct irq_router_handler pirq_routers[] = {
+       { .vendor = PCI_VENDOR_ID_INTEL,        .probe = intel_router_probe },
+       { .vendor = PCI_VENDOR_ID_AL,           .probe = ali_router_probe },
+       { .vendor = PCI_VENDOR_ID_ITE,          .probe = ite_router_probe },
+       { .vendor = PCI_VENDOR_ID_VIA,          .probe = via_router_probe },
+       { .vendor = PCI_VENDOR_ID_OPTI,         .probe = opti_router_probe },
+       { .vendor = PCI_VENDOR_ID_SI,           .probe = sis_router_probe },
+       { .vendor = PCI_VENDOR_ID_CYRIX,        .probe = cyrix_router_probe },
+       { .vendor = PCI_VENDOR_ID_VLSI,         .probe = vlsi_router_probe },
+       { .vendor = PCI_VENDOR_ID_SERVERWORKS,  .probe = serverworks_router_probe },
+       { .vendor = PCI_VENDOR_ID_AMD,          .probe = amd_router_probe },
+       /* Someone with docs needs to add the ATI Radeon IGP */
+       { .vendor = 0,                          .probe = NULL }
+};
+
+/* The router selected by pirq_find_router(), and the PCI device behind it. */
+static struct irq_router pirq_router;
+static struct pci_dev *pirq_router_dev;
+
+
+/*
+ *     FIXME: should we have an option to say "generic for
+ *     chipset" ?
+ */
+ 
+/*
+ * Fill in @r with the accessors for the interrupt router described by
+ * pirq_table.
+ *
+ * With CONFIG_PCI_BIOS, a table whose signature is 0 selects BIOS routing:
+ * only r->set and r->name are assigned in that case.  Otherwise @r starts
+ * out as "default" with no accessors, the router device is looked up at
+ * the bus/devfn named in the table, and the vendor handlers in
+ * pirq_routers[] are probed -- first against the vendor/device IDs from
+ * the table, then against the IDs of the device actually found.  If no
+ * handler accepts, @r stays "default" with NULL get/set.
+ */
+static void __init pirq_find_router(struct irq_router *r)
+{
+       struct irq_routing_table *rt = pirq_table;
+       struct irq_router_handler *h;
+
+#ifdef CONFIG_PCI_BIOS
+       if (!rt->signature) {
+               printk(KERN_INFO "PCI: Using BIOS for IRQ routing\n");
+               r->set = pirq_bios_set;
+               r->name = "BIOS";
+               return;
+       }
+#endif
+
+       /* Default unless a driver reloads it */
+       r->name = "default";
+       r->get = NULL;
+       r->set = NULL;
+
+       DBG("PCI: Attempting to find IRQ router for %04x:%04x\n",
+           rt->rtr_vendor, rt->rtr_device);
+
+       pirq_router_dev = pci_find_slot(rt->rtr_bus, rt->rtr_devfn);
+       if (!pirq_router_dev) {
+               DBG("PCI: Interrupt router not found at %02x:%02x\n", rt->rtr_bus, rt->rtr_devfn);
+               return;
+       }
+
+       for( h = pirq_routers; h->vendor; h++) {
+               /* First look for a router match */
+               if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device))
+                       break;
+               /* Fall back to a device match */
+               if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device))
+                       break;
+       }
+       /* Report the router that was filled in, i.e. @r, not a global. */
+       printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n",
+               r->name,
+               pirq_router_dev->vendor,
+               pirq_router_dev->device,
+               pci_name(pirq_router_dev));
+}
+
+/*
+ * Find the routing-table slot for @dev: the first entry whose bus number
+ * and PCI slot number match the device's.  Returns NULL when the table
+ * has no such entry.
+ */
+static struct irq_info *pirq_get_info(struct pci_dev *dev)
+{
+       struct irq_routing_table *rt = pirq_table;
+       int remaining = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info);
+       struct irq_info *slot = rt->slots;
+
+       while (remaining--) {
+               if (slot->bus == dev->bus->number &&
+                   PCI_SLOT(slot->devfn) == PCI_SLOT(dev->devfn))
+                       return slot;
+               slot++;
+       }
+       return NULL;
+}
+
+/*
+ * Work out the legacy IRQ for @dev from the PCI IRQ routing table and,
+ * when @assign is non-zero and nothing is routed yet, pick one and
+ * program it into the router.
+ *
+ * The link value ("pirq") and allowed-IRQ bitmap for the device's
+ * interrupt pin come from its routing-table slot.  The IRQ is then
+ * obtained, in order of preference, from:
+ *   - a hardcoded link value (0xf0 | irq),
+ *   - the router's get() callback,
+ *   - programming a candidate IRQ through the router's set() callback
+ *     (never for VGA-class devices),
+ *   - a guess, when the bitmap allows exactly the candidate IRQ.
+ * On success every device sharing the same link value gets the IRQ too.
+ *
+ * Returns 1 when an IRQ was determined, 0 otherwise.
+ *
+ * Router callbacks may be NULL ("default" router, or BIOS routing which
+ * only provides set()), and dev->irq may lie outside 0..15, so both are
+ * checked before use.
+ */
+static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
+{
+       u8 pin;
+       struct irq_info *info;
+       int i, pirq, newirq;
+       int irq = 0;
+       u32 mask;
+       struct irq_router *r = &pirq_router;
+       struct pci_dev *dev2 = NULL;
+       char *msg = NULL;
+
+       /* Find IRQ pin */
+       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+       if (!pin) {
+               DBG(" -> no interrupt pin\n");
+               return 0;
+       }
+       pin = pin - 1;
+
+       /* Find IRQ routing entry */
+
+       if (!pirq_table)
+               return 0;
+
+       DBG("IRQ for %s[%c]", pci_name(dev), 'A' + pin);
+       info = pirq_get_info(dev);
+       if (!info) {
+               DBG(" -> not found in routing table\n");
+               return 0;
+       }
+       pirq = info->irq[pin].link;
+       mask = info->irq[pin].bitmap;
+       if (!pirq) {
+               DBG(" -> not routed\n");
+               return 0;
+       }
+       DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs);
+       mask &= pcibios_irq_mask;
+
+       /* Work around broken HP Pavilion Notebooks which assign USB to
+          IRQ 9 even though it is actually wired to IRQ 11 */
+
+       if (broken_hp_bios_irq9 && pirq == 0x59 && dev->irq == 9) {
+               dev->irq = 11;
+               pci_write_config_byte(dev, PCI_INTERRUPT_LINE, 11);
+               /* set() is NULL when no router handler was found */
+               if (r->set)
+                       r->set(pirq_router_dev, dev, pirq, 11);
+       }
+
+       /*
+        * same for Acer Travelmate 360, but with CB and irq 11 -> 10.
+        * The fixup reads the IRQ back through get(), which is NULL for
+        * the "default" and BIOS routers; skip it entirely in that case.
+        */
+       if (acer_tm360_irqrouting && dev->irq == 11 &&
+           dev->vendor == PCI_VENDOR_ID_O2 && r->get) {
+               pirq = 0x68;
+               mask = 0x400;
+               dev->irq = r->get(pirq_router_dev, dev, pirq);
+               pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
+       }
+
+       /*
+        * Find the best IRQ to assign: use the one
+        * reported by the device if possible.
+        *
+        * An IRQ outside 0..15 can never be in the 16-bit mask; test the
+        * range first so the shift below stays well-defined.
+        */
+       newirq = dev->irq;
+       if (newirq < 0 || newirq >= 16 || !((1 << newirq) & mask)) {
+               if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0;
+               else printk(KERN_WARNING "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n", newirq, pci_name(dev));
+       }
+       if (!newirq && assign) {
+               for (i = 0; i < 16; i++) {
+                       if (!(mask & (1 << i)))
+                               continue;
+                       if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, SA_SHIRQ))
+                               newirq = i;
+               }
+       }
+       DBG(" -> newirq=%d", newirq);
+
+       /* Check if it is hardcoded */
+       if ((pirq & 0xf0) == 0xf0) {
+               irq = pirq & 0xf;
+               DBG(" -> hardcoded IRQ %d\n", irq);
+               msg = "Hardcoded";
+       } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
+       ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) {
+               DBG(" -> got IRQ %d\n", irq);
+               msg = "Found";
+       } else if (newirq > 0 && newirq < 16 && r->set &&
+                  (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
+               /*
+                * Only a legacy IRQ (1..15) may be programmed: the routers
+                * cannot express anything else, and irq indexes
+                * pirq_penalty[] further down.
+                */
+               DBG(" -> assigning IRQ %d", newirq);
+               if (r->set(pirq_router_dev, dev, pirq, newirq)) {
+                       eisa_set_level_irq(newirq);
+                       DBG(" ... OK\n");
+                       msg = "Assigned";
+                       irq = newirq;
+               }
+       }
+
+       if (!irq) {
+               DBG(" ... failed\n");
+               if (newirq > 0 && newirq < 16 && mask == (1 << newirq)) {
+                       msg = "Guessed";
+                       irq = newirq;
+               } else
+                       return 0;
+       }
+       printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev));
+
+       /* Update IRQ for all devices with the same pirq value */
+       while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
+               pci_read_config_byte(dev2, PCI_INTERRUPT_PIN, &pin);
+               if (!pin)
+                       continue;
+               pin--;
+               info = pirq_get_info(dev2);
+               if (!info)
+                       continue;
+               if (info->irq[pin].link == pirq) {
+                       /* We refuse to override the dev->irq information. Give a warning! */
+                       if ( dev2->irq && dev2->irq != irq && \
+                       (!(pci_probe & PCI_USE_PIRQ_MASK) || \
+                       ((1 << dev2->irq) & mask)) ) {
+#ifndef CONFIG_PCI_MSI
+                               printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n",
+                                      pci_name(dev2), dev2->irq, irq);
+#endif
+                               continue;
+                       }
+                       dev2->irq = irq;
+                       pirq_penalty[irq]++;
+                       if (dev != dev2)
+                               printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2));
+               }
+       }
+       return 1;
+}
+/*
+ * Boot-time pass over every PCI device, in two loops.
+ *
+ * Loop 1: an IRQ number of 16 or more is reset to 0, then the penalty
+ * counter of the device's IRQ is bumped (after first clearing a penalty
+ * in the range [100, 100000)).
+ *
+ * Loop 2: with CONFIG_X86_IO_APIC and io_apic_assign_pci_irqs set, the
+ * IRQ is recomputed via IO_APIC_get_PCI_irq_vector(), retrying through
+ * the parent bridge when the device's own bus yields nothing.  Finally,
+ * a device that has an interrupt pin but still no IRQ is handed to
+ * pcibios_lookup_irq() with assign == 0.
+ */
+static void __init pcibios_fixup_irqs(void)
+{
+       struct pci_dev *dev = NULL;
+       u8 pin;
+
+       DBG("PCI: IRQ fixup\n");
+       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+               /*
+                * If the BIOS has set an out of range IRQ number, just ignore it.
+                * Also keep track of which IRQ's are already in use.
+                */
+               if (dev->irq >= 16) {
+                       DBG("%s: ignoring bogus IRQ %d\n", pci_name(dev), 
dev->irq);
+                       dev->irq = 0;
+               }
+               /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */
+               if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 
100000)
+                       pirq_penalty[dev->irq] = 0;
+               pirq_penalty[dev->irq]++;
+       }
+
+       dev = NULL;
+       while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
+               pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+#ifdef CONFIG_X86_IO_APIC
+               /*
+                * Recalculate IRQ numbers if we use the I/O APIC.
+                */
+               if (io_apic_assign_pci_irqs)
+               {
+                       int irq;
+
+                       if (pin) {
+                               pin--;          /* interrupt pins are numbered starting from 1.
+                                                * NOTE(review): pin is modified in place here (and again in the
+                                                * bridge fallback below), yet it is tested once more after the
+                                                * #endif; a device on the first pin therefore reaches that test
+                                                * with pin == 0 and skips pcibios_lookup_irq() -- confirm that
+                                                * this is intended. */
+                               irq = 
IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin);
+       /*
+        * Busses behind bridges are typically not listed in the MP-table.
+        * In this case we have to look up the IRQ based on the parent bus,
+        * parent slot, and pin number. The SMP code detects such bridged
+        * busses itself so we should get into this branch reliably.
+        */
+                               if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
+                                       struct pci_dev * bridge = 
dev->bus->self;
+
+                                       pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+                                       irq = 
IO_APIC_get_PCI_irq_vector(bridge->bus->number, 
+                                                       
PCI_SLOT(bridge->devfn), pin);
+                                       if (irq >= 0)
+                                               printk(KERN_WARNING "PCI: using 
PPB %s[%c] to get irq %d\n",
+                                                       pci_name(bridge), 'A' + 
pin, irq);
+                               }
+                               if (irq >= 0) {
+                                       if (use_pci_vector() &&
+                                               !platform_legacy_irq(irq))
+                                               irq = IO_APIC_VECTOR(irq);
+
+                                       printk(KERN_INFO "PCI->APIC IRQ 
transform: %s[%c] -> IRQ %d\n",
+                                               pci_name(dev), 'A' + pin, irq);
+                                       dev->irq = irq;
+                               }
+                       }
+               }
+#endif
+               /*
+                * Still no IRQ? Try to lookup one...
+                */
+               if (pin && !dev->irq)
+                       pcibios_lookup_irq(dev, 0);
+       }
+}
+
+/*
+ * DMI callback for broken HP Pavilion notebooks, which assign USB to
+ * IRQ 9 even though it is actually wired to IRQ 11.  Latches the
+ * broken_hp_bios_irq9 flag and reports the match the first time only.
+ */
+static int __init fix_broken_hp_bios_irq9(struct dmi_system_id *d)
+{
+       /* Quirk already latched by an earlier match: nothing to do. */
+       if (broken_hp_bios_irq9)
+               return 0;
+
+       broken_hp_bios_irq9 = 1;
+       printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
+       return 0;
+}
+
+/*
+ * DMI callback for broken Acer TravelMate 360 notebooks, which assign
+ * Cardbus to IRQ 11 even though it is actually wired to IRQ 10.  Latches
+ * the acer_tm360_irqrouting flag and reports the match the first time only.
+ */
+static int __init fix_acer_tm360_irqrouting(struct dmi_system_id *d)
+{
+       /* Quirk already latched by an earlier match: nothing to do. */
+       if (acer_tm360_irqrouting)
+               return 0;
+
+       acer_tm360_irqrouting = 1;
+       printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident);
+       return 0;
+}
+
+/*
+ * DMI quirk table handed to dmi_check_system() by pcibios_irq_init().
+ * Each entry pairs a set of DMI string matches with the callback that
+ * latches the corresponding IRQ-routing workaround flag.  The empty
+ * entry terminates the table.
+ */
+static struct dmi_system_id __initdata pciirq_dmi_table[] = {
+       {
+               .callback = fix_broken_hp_bios_irq9,
+               .ident = "HP Pavilion N5400 Series Laptop",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+                       DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"),
+                       DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook 
Model GE"),
+                       DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"),
+               },
+       },
+       {
+               .callback = fix_acer_tm360_irqrouting,
+               .ident = "Acer TravelMate 36x Laptop",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
+               },
+       },
+       { }
+};
+
+/*
+ * Initcall: set up PCI IRQ routing.
+ *
+ * Runs the DMI quirk table, locates a PIRQ routing table (falling back
+ * to the PCI BIOS when CONFIG_PCI_BIOS is set and PCI_BIOS_IRQ_SCAN is
+ * requested), installs pirq_enable_irq() as the pcibios_enable_irq hook
+ * and finally runs pcibios_fixup_irqs().  Always returns 0.
+ */
+static int __init pcibios_irq_init(void)
+{
+       DBG("PCI: IRQ init\n");
+
+       /*
+        * Nothing to do if an enable hook is already installed or there
+        * are no raw PCI config-space ops.
+        */
+       if (pcibios_enable_irq || raw_pci_ops == NULL)
+               return 0;
+
+       dmi_check_system(pciirq_dmi_table);
+
+       pirq_table = pirq_find_routing_table();
+
+#ifdef CONFIG_PCI_BIOS
+       if (!pirq_table && (pci_probe & PCI_BIOS_IRQ_SCAN))
+               pirq_table = pcibios_get_irq_routing_table();
+#endif
+       if (pirq_table) {
+               pirq_peer_trick();
+               pirq_find_router(&pirq_router);
+               /*
+                * If the table carries an exclusive_irqs mask, penalise
+                * each of IRQs 0-15 whose bit is clear in that mask.
+                */
+               if (pirq_table->exclusive_irqs) {
+                       int i;
+                       for (i=0; i<16; i++)
+                               if (!(pirq_table->exclusive_irqs & (1 << i)))
+                                       pirq_penalty[i] += 100;
+               }
+               /*
+                * If we're using the I/O APIC, avoid using the PCI IRQ
+                * routing table.
+                */
+               if (io_apic_assign_pci_irqs)
+                       pirq_table = NULL;
+       }
+
+       pcibios_enable_irq = pirq_enable_irq;
+
+       pcibios_fixup_irqs();
+       return 0;
+}
+
+subsys_initcall(pcibios_irq_init);
+
+
+/*
+ * Bump the penalty of an IRQ that an ISAPnP device lists as one of its
+ * possible IRQs, so that we try to avoid assigning it to PCI devices.
+ * IRQ numbers of 16 and above are ignored.
+ */
+static void pirq_penalize_isa_irq(int irq)
+{
+       if (irq >= 16)
+               return;
+
+       pirq_penalty[irq] += 100;
+}
+
+/*
+ * Penalise an ISA IRQ.  With CONFIG_ACPI_PCI and ACPI IRQ routing not
+ * disabled (acpi_noirq clear) the request goes to ACPI; in every other
+ * case the local PIRQ penalty table is updated.
+ */
+void pcibios_penalize_isa_irq(int irq)
+{
+#ifdef CONFIG_ACPI_PCI
+       if (!acpi_noirq) {
+               acpi_penalize_isa_irq(irq);
+               return;
+       }
+#endif
+       pirq_penalize_isa_irq(irq);
+}
+
+/*
+ * pcibios_enable_irq hook (installed by pcibios_irq_init()).
+ *
+ * If the device has an interrupt pin, pcibios_lookup_irq() returns 0 and
+ * dev->irq is still unset, try to obtain an IRQ from the I/O APIC,
+ * walking up through parent bridges when the device's own bus yields
+ * nothing.  On failure a warning is printed, except for IDE storage
+ * devices that pass the class test below.  Always returns 0.
+ */
+static int pirq_enable_irq(struct pci_dev *dev)
+{
+       u8 pin;
+       struct pci_dev *temp_dev;
+
+       pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
+       if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
+               char *msg = "";
+
+               pin--;          /* interrupt pins are numbered starting from 1 */
+
+               if (io_apic_assign_pci_irqs) {
+                       int irq;
+
+                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, 
PCI_SLOT(dev->devfn), pin);
+                       /*
+                        * Busses behind bridges are typically not listed in
+                        * the MP-table.  In this case we have to look up the
+                        * IRQ based on the parent bus, parent slot, and pin
+                        * number.  The SMP code detects such bridged busses
+                        * itself so we should get into this branch reliably.
+                        */
+                       /* 'dev' is used as the walk cursor; restored below. */
+                       temp_dev = dev;
+                       while (irq < 0 && dev->bus->parent) { /* go back to the bridge */
+                               struct pci_dev * bridge = dev->bus->self;
+
+                               /* Swizzle the pin by the slot number for the parent bus. */
+                               pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+                               irq = 
IO_APIC_get_PCI_irq_vector(bridge->bus->number, 
+                                               PCI_SLOT(bridge->devfn), pin);
+                               if (irq >= 0)
+                                       printk(KERN_WARNING "PCI: using PPB 
%s[%c] to get irq %d\n",
+                                               pci_name(bridge), 'A' + pin, 
irq);
+                               dev = bridge;
+                       }
+                       dev = temp_dev;
+                       if (irq >= 0) {
+#ifdef CONFIG_PCI_MSI
+                               if (!platform_legacy_irq(irq))
+                                       irq = IO_APIC_VECTOR(irq);
+#endif
+                               /* NB: 'pin' here is the value after any bridge swizzling. */
+                               printk(KERN_INFO "PCI->APIC IRQ transform: 
%s[%c] -> IRQ %d\n",
+                                       pci_name(dev), 'A' + pin, irq);
+                               dev->irq = irq;
+                               return 0;
+                       } else
+                               msg = " Probably buggy MP table.";
+               } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
+                       msg = "";
+               else
+                       msg = " Please try using pci=biosirq.";
+
+               /*
+                * With IDE legacy devices the IRQ lookup failure is not a
+                * problem..
+                * NOTE(review): the 0x5 mask presumably tests the per-channel
+                * native-mode bits of the programming interface - confirm.
+                */
+               if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 
0x5))
+                       return 0;
+
+               printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of 
device %s.%s\n",
+                      'A' + pin, pci_name(dev), msg);
+       }
+       return 0;
+}
+
+/*
+ * Count vectors by walking upward from 'last' in steps of 8.
+ *
+ * The count starts at 'nr_released' and is incremented for every vector
+ * visited, except the system-call vector, which is skipped.  When the
+ * walk reaches FIRST_SYSTEM_VECTOR it restarts from FIRST_DEVICE_VECTOR
+ * at the current offset (which is then incremented), and it stops once
+ * the offset is a multiple of 8.
+ *
+ * NOTE(review): presumably the result is the number of device vectors
+ * still available for allocation - confirm against the caller.
+ */
+int pci_vector_resources(int last, int nr_released)
+{
+       int count = nr_released;
+
+       int next = last;
+       int offset = (last % 8);
+
+       while (next < FIRST_SYSTEM_VECTOR) {
+               next += 8;
+#ifdef CONFIG_X86_64
+               if (next == IA32_SYSCALL_VECTOR)
+                       continue;
+#else
+               if (next == SYSCALL_VECTOR)
+                       continue;
+#endif
+               count++;
+               if (next >= FIRST_SYSTEM_VECTOR) {
+                       /* Ran off the top: wrap to the next offset, if any. */
+                       if (offset%8) {
+                               next = FIRST_DEVICE_VECTOR + offset;
+                               offset++;
+                               continue;
+                       }
+                       /* The vector just counted is not below the limit: undo. */
+                       count--;
+               }
+       }
+
+       return count;
+}
Index: linux-2.6.12-xen0-arch/arch/i386/mach-xen/pci/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/arch/i386/mach-xen/pci/Makefile
@@ -0,0 +1,30 @@
+# PCI support for the Xen sub-architecture.
+#
+# Objects collected in c-obj-y have no source file in this directory: the
+# pattern rule near the end creates each .c as a symlink to the file of the
+# same name in arch/i386/pci/.  Objects in l-pci-y are added to obj-y
+# without such a symlink rule.
+
+# Adds arch/i386/pci to the include search path.
+CFLAGS += -Iarch/i386/pci
+
+c-obj-y                                := i386.o
+
+c-obj-$(CONFIG_PCI_BIOS)               += pcbios.o
+c-obj-$(CONFIG_PCI_MMCONFIG)   += mmconfig.o
+c-obj-$(CONFIG_PCI_DIRECT)     += direct.o
+
+c-pci-y                                := fixup.o
+c-pci-$(CONFIG_ACPI_PCI)       += acpi.o
+c-pci-y                                += legacy.o
+# Make sure irq.o gets linked in after legacy.o
+l-pci-y                                += irq.o
+
+# The := assignments below replace (rather than extend) the lists above
+# when the corresponding option is set to y.
+c-pci-$(CONFIG_X86_VISWS)      := visws.o fixup.o
+pci-$(CONFIG_X86_VISWS)                :=
+c-pci-$(CONFIG_X86_NUMAQ)      := numa.o
+pci-$(CONFIG_X86_NUMAQ)                := irq.o
+
+obj-y                          += $(pci-y)
+c-obj-y                                += $(c-pci-y) common.o
+
+c-link :=
+
+# Create each listed .c file as a symlink into arch/i386/pci/.
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+       @ln -fsn $(srctree)/arch/i386/pci/$(notdir $@) $@
+
+obj-y  += $(c-obj-y) $(l-pci-y)
+
+# Have 'make clean' remove the generated symlinks.
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
Index: linux-2.6.12-xen0-arch/arch/i386/Makefile
===================================================================
--- linux-2.6.12-xen0-arch.orig/arch/i386/Makefile
+++ linux-2.6.12-xen0-arch/arch/i386/Makefile
@@ -71,6 +71,10 @@ CFLAGS += $(cflags-y)
 # Default subarch .c files
 mcore-y  := mach-default
 
+head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
+boot   := arch/i386/boot
+pci    := arch/i386/pci/
+
 # Voyager subarch support
 mflags-$(CONFIG_X86_VOYAGER)   := -Iinclude/asm-i386/mach-voyager
 mcore-$(CONFIG_X86_VOYAGER)    := mach-voyager
@@ -91,6 +95,16 @@ mcore-$(CONFIG_X86_BIGSMP)   := mach-defau
 mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit
 mcore-$(CONFIG_X86_SUMMIT)  := mach-default
 
+#Xen subarch support
+mflags-$(CONFIG_X86_XEN)       := -Iinclude/asm-i386/mach-xen
+mcore-$(CONFIG_X86_XEN)                := mach-xen
+core-$(CONFIG_X86_XEN)         += arch/i386/mach-xen/
+head-$(CONFIG_X86_XEN)         := arch/i386/mach-xen/kernel/head.o 
arch/i386/mach-xen/kernel/init_task.o
+ifdef CONFIG_X86_XEN
+boot                           := arch/i386/mach-xen/boot
+pci                            := arch/i386/mach-xen/pci/
+endif
+
 # generic subarchitecture
 mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
 mcore-$(CONFIG_X86_GENERICARCH) := mach-default
@@ -104,15 +118,15 @@ core-$(CONFIG_X86_ES7000) := arch/i386/m
 # default subarch .h files
 mflags-y += -Iinclude/asm-i386/mach-default
 
-head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
-
 libs-y                                         += arch/i386/lib/
+ifndef CONFIG_X86_XEN
 core-y                                 += arch/i386/kernel/ \
                                           arch/i386/mm/ \
                                           arch/i386/$(mcore-y)/ \
                                           arch/i386/crypto/
 drivers-$(CONFIG_MATH_EMULATION)       += arch/i386/math-emu/
-drivers-$(CONFIG_PCI)                  += arch/i386/pci/
+endif
+drivers-$(CONFIG_PCI)                  += $(pci)
 # must be linked after kernel/
 drivers-$(CONFIG_OPROFILE)             += arch/i386/oprofile/
 drivers-$(CONFIG_PM)                   += arch/i386/power/
@@ -120,11 +134,16 @@ drivers-$(CONFIG_PM)                      += 
arch/i386/powe
 CFLAGS += $(mflags-y)
 AFLAGS += $(mflags-y)
 
-boot := arch/i386/boot
-
 .PHONY: zImage bzImage compressed zlilo bzlilo \
        zdisk bzdisk fdimage fdimage144 fdimage288 install kernel_install
 
+ifdef CONFIG_X86_XEN
+.PHONY: vmlinuz
+
+vmlinuz: vmlinux
+       $(Q)$(MAKE) $(build)=$(boot) vmlinuz
+endif
+
 all: bzImage
 
 # KBUILD_IMAGE specify target image being built
@@ -171,4 +190,7 @@ define archhelp
   echo  '  fdimage      - Create a boot floppy image'
 endef
 
+debug:
+       @echo $(core-y)
+
 CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf
Index: linux-2.6.12-xen0-arch/drivers/xen/core/ctrl_if.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/drivers/xen/core/ctrl_if.c
@@ -0,0 +1,569 @@
+/******************************************************************************
+ * ctrl_if.c
+ * 
+ * Management functions for special interface to the domain controller.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <asm-xen/ctrl_if.h>
+#include <asm-xen/evtchn.h>
+
+#if 0
+#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+                           __FILE__ , __LINE__ , ## _a )
+#else
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+/*
+ * Extra ring macros to sync a consumer index up to the public producer index. 
+ * Generally UNSAFE, but we use it for recovery and shutdown in some cases.
+ */
+#define RING_DROP_PENDING_REQUESTS(_r)                                  \
+    do {                                                                \
+        (_r)->req_cons = (_r)->sring->req_prod;                         \
+    } while (0)
+#define RING_DROP_PENDING_RESPONSES(_r)                                 \
+    do {                                                                \
+        (_r)->rsp_cons = (_r)->sring->rsp_prod;                         \
+    } while (0)
+
+/*
+ * Only used by initial domain which must create its own control-interface
+ * event channel. This value is picked up by the user-space domain controller
+ * via an ioctl.
+ */
+int initdom_ctrlif_domcontroller_port = -1;
+
+static int        ctrl_if_evtchn;
+static int        ctrl_if_irq;
+static spinlock_t ctrl_if_lock;
+
+static struct irqaction ctrl_if_irq_action;
+
+static ctrl_front_ring_t ctrl_if_tx_ring;
+static ctrl_back_ring_t  ctrl_if_rx_ring;
+
+/* Incoming message requests. */
+    /* Primary message type -> message handler. */
+static ctrl_msg_handler_t ctrl_if_rxmsg_handler[256];
+    /*
+     * Primary message type -> callback in process context?
+     * Accessed as a bitmap indexed by message type (set_bit/clear_bit/
+     * test_bit).  NOTE(review): the array is sized in units of
+     * sizeof(unsigned long) rather than bits-per-long, so it is larger
+     * than the 256 bits required - harmless, but confirm before resizing.
+     */
+static unsigned long ctrl_if_rxmsg_blocking_context[256/sizeof(unsigned long)];
+    /*
+     * Is it late enough during bootstrap to defer handlers to process
+     * context?  (Deferred handlers are run via schedule_work().)
+     */
+static int safe_to_schedule_task;
+    /* Queue up messages to be handled in process context. */
+static ctrl_msg_t ctrl_if_rxmsg_deferred[CONTROL_RING_SIZE];
+static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_prod;
+static CONTROL_RING_IDX ctrl_if_rxmsg_deferred_cons;
+
+/* Incoming message responses: message identifier -> message handler/id. */
+static struct {
+    ctrl_msg_handler_t fn;
+    unsigned long      id;
+} ctrl_if_txmsg_id_mapping[CONTROL_RING_SIZE];
+
+/* For received messages that must be deferred to process context. */
+static void __ctrl_if_rxmsg_deferred(void *unused);
+static DECLARE_WORK(ctrl_if_rxmsg_deferred_work,
+                    __ctrl_if_rxmsg_deferred,
+                    NULL);
+
+/* Deferred callbacks for people waiting for space in the transmit ring. */
+static DECLARE_TASK_QUEUE(ctrl_if_tx_tq);
+
+static DECLARE_WAIT_QUEUE_HEAD(ctrl_if_tx_wait);
+static void __ctrl_if_tx_tasklet(unsigned long data);
+static DECLARE_TASKLET(ctrl_if_tx_tasklet, __ctrl_if_tx_tasklet, 0);
+
+static void __ctrl_if_rx_tasklet(unsigned long data);
+static DECLARE_TASKLET(ctrl_if_rx_tasklet, __ctrl_if_rx_tasklet, 0);
+
+#define get_ctrl_if() ((control_if_t *)((char *)HYPERVISOR_shared_info + 2048))
+
+/* Notify the domain controller over the control-interface event channel. */
+static void ctrl_if_notify_controller(void)
+{
+    notify_via_evtchn(ctrl_if_evtchn);
+}
+
+/*
+ * Handler installed for every message type that has no registered
+ * receiver: sends the request straight back as a response with a
+ * zero-length payload.  'id' is unused.
+ */
+static void ctrl_if_rxmsg_default_handler(ctrl_msg_t *msg, unsigned long id)
+{
+    msg->length = 0;
+    ctrl_if_send_response(msg);
+}
+
+/*
+ * Tasklet: consume the responses that have been produced on the
+ * transmit ring.  For each response carrying a callback id, the
+ * registered completion handler is run and its id-mapping slot freed.
+ * If the ring was full on entry and no longer is, waiting senders are
+ * woken and the space-callback task queue is run.  'data' is unused.
+ */
+static void __ctrl_if_tx_tasklet(unsigned long data)
+{
+    ctrl_msg_t *msg;
+    int         was_full = RING_FULL(&ctrl_if_tx_ring);
+    RING_IDX    i, rp;
+
+    i  = ctrl_if_tx_ring.rsp_cons;
+    rp = ctrl_if_tx_ring.sring->rsp_prod;
+    rmb(); /* Ensure we see all responses up to 'rp'. */
+
+    for ( ; i != rp; i++ )
+    {
+        msg = RING_GET_RESPONSE(&ctrl_if_tx_ring, i);
+        
+        DPRINTK("Rx-Rsp %u/%u :: %d/%d\n", i-1,
+                ctrl_if_tx_ring.sring->rsp_prod,
+                msg->type, msg->subtype);
+
+        /*
+         * Execute the callback handler, if one was specified.
+         * (0xFF is the id the send path uses for "no handler".)
+         */
+        if ( msg->id != 0xFF )
+        {
+            (*ctrl_if_txmsg_id_mapping[msg->id].fn)(
+                msg, ctrl_if_txmsg_id_mapping[msg->id].id);
+            smp_mb(); /* Execute, /then/ free. */
+            ctrl_if_txmsg_id_mapping[msg->id].fn = NULL;
+        }
+    }
+
+    /*
+     * Step over messages in the ring /after/ finishing reading them. As soon 
+     * as the index is updated then the message may get blown away.
+     */
+    smp_mb();
+    ctrl_if_tx_ring.rsp_cons = i;
+            
+    if ( was_full && !RING_FULL(&ctrl_if_tx_ring) )
+    {
+        wake_up(&ctrl_if_tx_wait);
+        run_task_queue(&ctrl_if_tx_tq);
+    }
+}
+
+/*
+ * Work function (queued with schedule_work() by the receive tasklet):
+ * runs the registered handler for every message queued in
+ * ctrl_if_rxmsg_deferred up to the producer index sampled on entry.
+ * 'unused' is ignored.
+ */
+static void __ctrl_if_rxmsg_deferred(void *unused)
+{
+    ctrl_msg_t *msg;
+    CONTROL_RING_IDX dp;
+
+    dp = ctrl_if_rxmsg_deferred_prod;
+    rmb(); /* Ensure we see all deferred requests up to 'dp'. */
+
+    while ( ctrl_if_rxmsg_deferred_cons != dp )
+    {
+        /* Advance the consumer index before invoking the handler. */
+        msg = &ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(
+            ctrl_if_rxmsg_deferred_cons++)];
+        (*ctrl_if_rxmsg_handler[msg->type])(msg, 0);
+    }
+}
+
+/*
+ * Tasklet: drain requests from the receive ring.
+ *
+ * Each request is copied into a local ctrl_msg_t before dispatch.
+ * Message types flagged in ctrl_if_rxmsg_blocking_context are appended
+ * to the deferred queue and handled later from the work function; all
+ * other types are handled immediately from this tasklet.  'data' is
+ * unused.
+ */
+static void __ctrl_if_rx_tasklet(unsigned long data)
+{
+    ctrl_msg_t    msg, *pmsg;
+    CONTROL_RING_IDX dp;
+    RING_IDX rp, i;
+
+    i  = ctrl_if_rx_ring.req_cons;
+    rp = ctrl_if_rx_ring.sring->req_prod;
+    dp = ctrl_if_rxmsg_deferred_prod;
+    rmb(); /* Ensure we see all requests up to 'rp'. */
+ 
+    for ( ; i != rp; i++) 
+    {
+        pmsg = RING_GET_REQUEST(&ctrl_if_rx_ring, i);
+        /* Copy the fixed header (everything before the payload) first. */
+        memcpy(&msg, pmsg, offsetof(ctrl_msg_t, msg));
+
+        DPRINTK("Rx-Req %u/%u :: %d/%d\n", i-1,
+                ctrl_if_rx_ring.sring->req_prod,
+                msg.type, msg.subtype);
+
+        /* Clamp the length so the payload copy cannot overrun 'msg'. */
+        if ( msg.length > sizeof(msg.msg) )
+            msg.length = sizeof(msg.msg);
+        
+        if ( msg.length != 0 )
+            memcpy(msg.msg, pmsg->msg, msg.length);
+
+        if ( test_bit(msg.type, 
+                      (unsigned long *)&ctrl_if_rxmsg_blocking_context) )
+            memcpy(&ctrl_if_rxmsg_deferred[MASK_CONTROL_IDX(dp++)],
+                   &msg, offsetof(ctrl_msg_t, msg) + msg.length);
+        else
+            (*ctrl_if_rxmsg_handler[msg.type])(&msg, 0);
+    }
+
+    ctrl_if_rx_ring.req_cons = i;
+
+    /* Publish newly deferred messages and kick the work function. */
+    if ( dp != ctrl_if_rxmsg_deferred_prod )
+    {
+        wmb();
+        ctrl_if_rxmsg_deferred_prod = dp;
+        schedule_work(&ctrl_if_rxmsg_deferred_work);
+    }
+}
+
+/*
+ * Interrupt handler for the control-interface event channel: schedules
+ * the transmit and/or receive tasklet according to which ring has
+ * unconsumed entries.  Always returns IRQ_HANDLED.
+ */
+static irqreturn_t ctrl_if_interrupt(int irq, void *dev_id,
+                                     struct pt_regs *regs)
+{
+    if ( RING_HAS_UNCONSUMED_RESPONSES(&ctrl_if_tx_ring) )
+        tasklet_schedule(&ctrl_if_tx_tasklet);
+
+    if ( RING_HAS_UNCONSUMED_REQUESTS(&ctrl_if_rx_ring) )
+        tasklet_schedule(&ctrl_if_rx_tasklet);
+
+    return IRQ_HANDLED;
+}
+
+/*
+ * Queue a request on the transmit ring without blocking.
+ *
+ * @msg: message to send; its 'id' field is overwritten.
+ * @hnd: optional completion handler run when the response arrives
+ *       (NULL for none).
+ * @id:  opaque value passed back to @hnd.
+ *
+ * Returns 0 on success, or -EAGAIN if the transmit ring is full.
+ */
+int
+ctrl_if_send_message_noblock(
+    ctrl_msg_t *msg, 
+    ctrl_msg_handler_t hnd,
+    unsigned long id)
+{
+    unsigned long flags;
+    ctrl_msg_t   *dmsg;
+    int           i;
+
+    spin_lock_irqsave(&ctrl_if_lock, flags);
+
+    if ( RING_FULL(&ctrl_if_tx_ring) )
+    {
+        spin_unlock_irqrestore(&ctrl_if_lock, flags);
+        return -EAGAIN;
+    }
+
+    /* 0xFF marks "no completion handler" for the response path. */
+    msg->id = 0xFF;
+    if ( hnd != NULL )
+    {
+        /*
+         * Claim the first free id-mapping slot.
+         * NOTE(review): the search has no explicit bound; presumably a
+         * free slot is guaranteed because the ring was just found to be
+         * non-full - confirm.
+         */
+        for ( i = 0; ctrl_if_txmsg_id_mapping[i].fn != NULL; i++ )
+            continue;
+        ctrl_if_txmsg_id_mapping[i].fn = hnd;
+        ctrl_if_txmsg_id_mapping[i].id = id;
+        msg->id = i;
+    }
+
+    DPRINTK("Tx-Req %u/%u :: %d/%d\n", 
+            ctrl_if_tx_ring.req_prod_pvt, 
+            ctrl_if_tx_ring.rsp_cons,
+            msg->type, msg->subtype);
+
+    dmsg = RING_GET_REQUEST(&ctrl_if_tx_ring, 
+            ctrl_if_tx_ring.req_prod_pvt);
+    memcpy(dmsg, msg, sizeof(*msg));
+    ctrl_if_tx_ring.req_prod_pvt++;
+    RING_PUSH_REQUESTS(&ctrl_if_tx_ring);
+
+    spin_unlock_irqrestore(&ctrl_if_lock, flags);
+
+    ctrl_if_notify_controller();
+
+    return 0;
+}
+
+/*
+ * Queue a request on the transmit ring, sleeping in @wait_state on
+ * ctrl_if_tx_wait while the ring is full.
+ *
+ * @msg, @hnd, @id: as for ctrl_if_send_message_noblock().
+ * @wait_state:     task state to sleep in while waiting for ring space.
+ *
+ * Returns the result of ctrl_if_send_message_noblock() once it is not
+ * -EAGAIN, or -ERESTARTSYS if a signal is pending while waiting in
+ * TASK_INTERRUPTIBLE.
+ */
+int
+ctrl_if_send_message_block(
+    ctrl_msg_t *msg, 
+    ctrl_msg_handler_t hnd, 
+    unsigned long id,
+    long wait_state)
+{
+    DECLARE_WAITQUEUE(wait, current);
+    int rc;
+
+    /* Fast path. */
+    if ( (rc = ctrl_if_send_message_noblock(msg, hnd, id)) != -EAGAIN )
+        return rc;
+
+    add_wait_queue(&ctrl_if_tx_wait, &wait);
+
+    for ( ; ; )
+    {
+        /* Set the sleep state before re-trying the send. */
+        set_current_state(wait_state);
+
+        if ( (rc = ctrl_if_send_message_noblock(msg, hnd, id)) != -EAGAIN )
+            break;
+
+        rc = -ERESTARTSYS;
+        if ( signal_pending(current) && (wait_state == TASK_INTERRUPTIBLE) )
+            break;
+
+        schedule();
+    }
+
+    set_current_state(TASK_RUNNING);
+    remove_wait_queue(&ctrl_if_tx_wait, &wait);
+
+    return rc;
+}
+
+/* Allow a response-callback handler to find context of a blocked requester. */
+struct rsp_wait {
+    ctrl_msg_t         *msg;  /* Buffer for the response message.            */
+    struct task_struct *task; /* The task that is blocked on the response.   */
+    int                 done; /* Indicate to 'task' that response is rcv'ed. */
+};
+
+/*
+ * Completion handler used by ctrl_if_send_message_and_get_response():
+ * 'id' is a pointer to the requester's struct rsp_wait.  Copies the
+ * response into the requester's buffer, flags completion and wakes the
+ * blocked task.
+ */
+static void __ctrl_if_get_response(ctrl_msg_t *msg, unsigned long id)
+{
+    struct rsp_wait    *wait = (struct rsp_wait *)id;
+    /* Read 'task' before setting 'done'. */
+    struct task_struct *task = wait->task;
+
+    memcpy(wait->msg, msg, sizeof(*msg));
+    wmb(); /* Response data must be written before 'done' is set. */
+    wait->done = 1;
+
+    wake_up_process(task);
+}
+
+/*
+ * Send a request and block until its response has been received.
+ *
+ * @msg:        request to send.
+ * @rmsg:       buffer that receives the response.
+ * @wait_state: task state used while waiting for transmit-ring space;
+ *              the wait for the response itself is always
+ *              TASK_UNINTERRUPTIBLE.
+ *
+ * Returns 0 once the response is in @rmsg, or the non-zero error from
+ * ctrl_if_send_message_block() if the request could not be queued.
+ */
+int
+ctrl_if_send_message_and_get_response(
+    ctrl_msg_t *msg, 
+    ctrl_msg_t *rmsg,
+    long wait_state)
+{
+    struct rsp_wait wait;
+    int rc;
+
+    wait.msg  = rmsg;
+    wait.done = 0;
+    wait.task = current;
+
+    if ( (rc = ctrl_if_send_message_block(msg, __ctrl_if_get_response,
+                                          (unsigned long)&wait,
+                                          wait_state)) != 0 )
+        return rc;
+
+    for ( ; ; )
+    {
+        /* NB. Can't easily support TASK_INTERRUPTIBLE here. */
+        set_current_state(TASK_UNINTERRUPTIBLE);
+        if ( wait.done )
+            break;
+        schedule();
+    }
+
+    set_current_state(TASK_RUNNING);
+    return 0;
+}
+
+/*
+ * Register @task to be run when space becomes available in the
+ * transmit ring.
+ *
+ * Returns 0 without queueing anything if the ring is not currently
+ * full.  Otherwise queues @task on ctrl_if_tx_tq and returns the
+ * re-checked "ring full" status (non-zero if still full).
+ */
+int
+ctrl_if_enqueue_space_callback(
+    struct tq_struct *task)
+{
+    /* Fast path. */
+    if ( !RING_FULL(&ctrl_if_tx_ring) )
+        return 0;
+
+    (void)queue_task(task, &ctrl_if_tx_tq);
+
+    /*
+     * We may race execution of the task queue, so return re-checked status. If
+     * the task is not executed despite the ring being non-full then we will
+     * certainly return 'not full'.
+     */
+    smp_mb();
+    return RING_FULL(&ctrl_if_tx_ring);
+}
+
+/*
+ * Place @msg on the receive ring as a response and notify the
+ * controller.
+ */
+void
+ctrl_if_send_response(
+    ctrl_msg_t *msg)
+{
+    unsigned long flags;
+    ctrl_msg_t   *dmsg;
+
+    /*
+     * NB. The response may be the original request message, modified
+     * in-place. In this situation we may have src==dst, so no copying is
+     * required.
+     */
+    spin_lock_irqsave(&ctrl_if_lock, flags);
+
+    DPRINTK("Tx-Rsp %u :: %d/%d\n", 
+            ctrl_if_rx_ring.rsp_prod_pvt, 
+            msg->type, msg->subtype);
+
+    dmsg = RING_GET_RESPONSE(&ctrl_if_rx_ring, 
+            ctrl_if_rx_ring.rsp_prod_pvt);
+    if ( dmsg != msg )
+        memcpy(dmsg, msg, sizeof(*msg));
+
+    ctrl_if_rx_ring.rsp_prod_pvt++;
+    RING_PUSH_RESPONSES(&ctrl_if_rx_ring);
+
+    spin_unlock_irqrestore(&ctrl_if_lock, flags);
+
+    ctrl_if_notify_controller();
+}
+
+/*
+ * Register @hnd as the receiver for control messages of @type.
+ *
+ * @flags: if equal to CALLBACK_IN_BLOCKING_CONTEXT the handler is run
+ *         from the deferred work function instead of the receive
+ *         tasklet; this triggers BUG() if requested before
+ *         safe_to_schedule_task has been set.
+ *
+ * Returns non-zero on success, 0 if a receiver other than the default
+ * handler is already registered for @type.
+ */
+int
+ctrl_if_register_receiver(
+    u8 type, 
+    ctrl_msg_handler_t hnd, 
+    unsigned int flags)
+{
+    unsigned long _flags;
+    int inuse;
+
+    spin_lock_irqsave(&ctrl_if_lock, _flags);
+
+    inuse = (ctrl_if_rxmsg_handler[type] != ctrl_if_rxmsg_default_handler);
+
+    if ( inuse )
+    {
+        printk(KERN_INFO "Receiver %p already established for control "
+               "messages of type %d.\n", ctrl_if_rxmsg_handler[type], type);
+    }
+    else
+    {
+        ctrl_if_rxmsg_handler[type] = hnd;
+        clear_bit(type, (unsigned long *)&ctrl_if_rxmsg_blocking_context);
+        if ( flags == CALLBACK_IN_BLOCKING_CONTEXT )
+        {
+            set_bit(type, (unsigned long *)&ctrl_if_rxmsg_blocking_context);
+            if ( !safe_to_schedule_task )
+                BUG();
+        }
+    }
+
+    spin_unlock_irqrestore(&ctrl_if_lock, _flags);
+
+    return !inuse;
+}
+
+/*
+ * Remove @hnd as the receiver for messages of @type, restoring the
+ * default handler.  If @hnd is not the currently registered receiver a
+ * message is logged and the registration is left unchanged.
+ */
+void 
+ctrl_if_unregister_receiver(
+    u8 type,
+    ctrl_msg_handler_t hnd)
+{
+    unsigned long flags;
+
+    spin_lock_irqsave(&ctrl_if_lock, flags);
+
+    if ( ctrl_if_rxmsg_handler[type] != hnd )
+        printk(KERN_INFO "Receiver %p is not registered for control "
+               "messages of type %d.\n", hnd, type);
+    else
+        ctrl_if_rxmsg_handler[type] = ctrl_if_rxmsg_default_handler;
+
+    spin_unlock_irqrestore(&ctrl_if_lock, flags);
+
+    /* Ensure that @hnd will not be executed after this function returns. */
+    tasklet_unlock_wait(&ctrl_if_rx_tasklet);
+}
+
+/*
+ * Detach the control interface from its interrupt: removes the irq
+ * action and unbinds the event channel from the irq.  Undone by
+ * ctrl_if_resume().
+ */
+void ctrl_if_suspend(void)
+{
+    teardown_irq(ctrl_if_irq, &ctrl_if_irq_action);
+    unbind_evtchn_from_irq(ctrl_if_evtchn);
+}
+
+/*
+ * (Re)connect the control interface: in the initial domain create the
+ * event channel to the domain controller, re-attach both rings to the
+ * shared control-interface page, and bind the event channel to an irq
+ * handled by ctrl_if_interrupt().
+ */
+void ctrl_if_resume(void)
+{
+    control_if_t *ctrl_if = get_ctrl_if();
+
+    if ( xen_start_info.flags & SIF_INITDOMAIN )
+    {
+        /*
+         * The initial domain must create its own domain-controller link.
+         * The controller is probably not running at this point, but will
+         * pick up its end of the event channel from
+         * initdom_ctrlif_domcontroller_port (exposed to user space via an
+         * ioctl).
+         */
+        evtchn_op_t op;
+       extern void bind_evtchn_to_cpu(unsigned port, unsigned cpu);
+
+        op.cmd = EVTCHNOP_bind_interdomain;
+        op.u.bind_interdomain.dom1 = DOMID_SELF;
+        op.u.bind_interdomain.dom2 = DOMID_SELF;
+        op.u.bind_interdomain.port1 = 0;
+        op.u.bind_interdomain.port2 = 0;
+        if ( HYPERVISOR_event_channel_op(&op) != 0 )
+            BUG();
+        /* port1 is our end; port2 is left for the controller. */
+        xen_start_info.domain_controller_evtchn = op.u.bind_interdomain.port1;
+        initdom_ctrlif_domcontroller_port   = op.u.bind_interdomain.port2;
+       bind_evtchn_to_cpu(op.u.bind_interdomain.port1, 0);
+    }
+
+    /* Sync up with shared indexes. */
+    FRONT_RING_ATTACH(&ctrl_if_tx_ring, &ctrl_if->tx_ring, CONTROL_RING_MEM);
+    BACK_RING_ATTACH(&ctrl_if_rx_ring, &ctrl_if->rx_ring, CONTROL_RING_MEM);
+
+    ctrl_if_evtchn = xen_start_info.domain_controller_evtchn;
+    ctrl_if_irq    = bind_evtchn_to_irq(ctrl_if_evtchn);
+
+    memset(&ctrl_if_irq_action, 0, sizeof(ctrl_if_irq_action));
+    ctrl_if_irq_action.handler = ctrl_if_interrupt;
+    ctrl_if_irq_action.name    = "ctrl-if";
+    /* NOTE(review): the setup_irq() result is deliberately discarded. */
+    (void)setup_irq(ctrl_if_irq, &ctrl_if_irq_action);
+}
+
+/*
+ * One-time initialisation: point every message type at the default
+ * handler, attach the rings, initialise the lock and then bring the
+ * interface up via ctrl_if_resume().
+ */
+void __init ctrl_if_init(void)
+{
+    control_if_t *ctrl_if = get_ctrl_if();
+    int i;
+
+    for ( i = 0; i < 256; i++ )
+        ctrl_if_rxmsg_handler[i] = ctrl_if_rxmsg_default_handler;
+
+    /* ctrl_if_resume() below attaches the rings again. */
+    FRONT_RING_ATTACH(&ctrl_if_tx_ring, &ctrl_if->tx_ring, CONTROL_RING_MEM);
+    BACK_RING_ATTACH(&ctrl_if_rx_ring, &ctrl_if->rx_ring, CONTROL_RING_MEM);
+    
+    spin_lock_init(&ctrl_if_lock);
+
+    ctrl_if_resume();
+}
+
+
+/*
+ * Initcall that sets safe_to_schedule_task, after which receivers may be
+ * registered with CALLBACK_IN_BLOCKING_CONTEXT.  Always returns 0.
+ */
+static int __init ctrl_if_late_setup(void)
+{
+    safe_to_schedule_task = 1;
+    return 0;
+}
+__initcall(ctrl_if_late_setup);
+
+
+/*
+ * !! The following are DANGEROUS FUNCTIONS !!
+ * Use with care [for example, see xencons_force_flush()].
+ */
+
+/*
+ * Returns non-zero when the shared request-producer index of the
+ * transmit ring equals our response-consumer index.
+ */
+int ctrl_if_transmitter_empty(void)
+{
+    return (ctrl_if_tx_ring.sring->req_prod == ctrl_if_tx_ring.rsp_cons);
+    
+}
+
+/*
+ * Skip all pending responses on the transmit ring by moving the
+ * consumer index up to the shared producer index.  No completion
+ * handlers are run for the skipped responses.
+ */
+void ctrl_if_discard_responses(void)
+{
+    RING_DROP_PENDING_RESPONSES(&ctrl_if_tx_ring);
+}
+
+EXPORT_SYMBOL(ctrl_if_send_message_noblock);
+EXPORT_SYMBOL(ctrl_if_send_message_block);
+EXPORT_SYMBOL(ctrl_if_send_message_and_get_response);
+EXPORT_SYMBOL(ctrl_if_enqueue_space_callback);
+EXPORT_SYMBOL(ctrl_if_send_response);
+EXPORT_SYMBOL(ctrl_if_register_receiver);
+EXPORT_SYMBOL(ctrl_if_unregister_receiver);
Index: linux-2.6.12-xen0-arch/drivers/xen/core/devmem.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/drivers/xen/core/devmem.c
@@ -0,0 +1,158 @@
+/*
+ *  Originally from linux/drivers/char/mem.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ *  Added devfs support. 
+ *    Jan-11-1998, C. Scott Ananian <cananian@xxxxxxxxxxxxxxxxxxxx>
+ *  Shared /dev/zero mmaping support, Feb 2000, Kanoj Sarcar <kanoj@xxxxxxx>
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mman.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/raw.h>
+#include <linux/tty.h>
+#include <linux/capability.h>
+#include <linux/smp_lock.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/ptrace.h>
+#include <linux/device.h>
+#include <asm/pgalloc.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+/*
+ * Decide whether a mapping of this file must be uncached: returns 1
+ * only when the file was opened with O_SYNC, otherwise 0.  'addr' is
+ * not consulted.
+ */
+static inline int uncached_access(struct file *file, unsigned long addr)
+{
+        /* Xen sets correct MTRR type on non-RAM for us. */
+        return (file->f_flags & O_SYNC) ? 1 : 0;
+}
+
+/*
+ * This function reads the *physical* memory. The f_pos points directly to
+ * the memory location.
+ *
+ * The range is mapped with ioremap() and copied to the user buffer.  If
+ * the mapping fails the buffer is zero-filled and the read is reported as
+ * successful (see below).  In both cases *ppos is advanced by the number
+ * of bytes returned.
+ *
+ * Returns the number of bytes read, or -EFAULT if the user buffer faults.
+ */
+static ssize_t read_mem(struct file * file, char __user * buf,
+                       size_t count, loff_t *ppos)
+{
+       unsigned long p = *ppos;
+       ssize_t read = -EFAULT;
+       void *v;
+
+       if ((v = ioremap(p, count)) == NULL) {
+               /*
+                * Some programs (e.g., dmidecode) groove off into weird RAM
+                * areas where no tables can possibly exist (because Xen will
+                * have stomped on them!). These programs get rather upset if
+                * we let them know that Xen failed their access, so we fake
+                * out a read of all zeroes. :-)
+                */
+               if (clear_user(buf, count))
+                       return -EFAULT;
+               /* Keep the file position consistent with the normal path. */
+               *ppos += count;
+               return count;
+       }
+       if (copy_to_user(buf, v, count))
+               goto out;
+
+       read = count;
+       *ppos += read;
+out:
+       iounmap(v);
+       return read;
+}
+
+/*
+ * Write to *physical* memory; *ppos is the physical address to write to.
+ *
+ * The range is mapped with ioremap() and filled from the user buffer.
+ * Returns the number of bytes written (advancing *ppos by the same
+ * amount), or -EFAULT if the mapping fails or the user buffer faults.
+ */
+static ssize_t write_mem(struct file * file, const char __user * buf, 
+                        size_t count, loff_t *ppos)
+{
+       unsigned long p = *ppos;
+       ssize_t written = -EFAULT;
+       void *v;
+
+       if ((v = ioremap(p, count)) == NULL)
+               return -EFAULT;
+       /* Data flows from the user buffer into the kernel mapping. */
+       if (copy_from_user(v, buf, count))
+               goto out;
+
+       written = count;
+       *ppos += written;
+out:
+       iounmap(v);
+       return written;
+}
+
+/*
+ * mmap handler: map the physical range starting at the vma's page offset
+ * into the caller's address space using io_remap_page_range().
+ * Returns 0 on success or -EAGAIN if the remap fails.
+ */
+static int mmap_mem(struct file * file, struct vm_area_struct * vma)
+{
+       unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+       int uncached;
+
+       /* Uncached only when the file was opened with O_SYNC. */
+       uncached = uncached_access(file, offset);
+       if (uncached)
+               vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+       /* Don't try to swap out physical pages.. */
+       vma->vm_flags |= VM_RESERVED;
+
+       /*
+        * Don't dump addresses that are not real memory to a core file.
+        */
+       if (uncached)
+               vma->vm_flags |= VM_IO;
+
+       if (io_remap_page_range(vma, vma->vm_start, offset, 
+                               vma->vm_end-vma->vm_start, vma->vm_page_prot))
+               return -EAGAIN;
+
+       return 0;
+}
+
+/*
+ * Seek on a memory device.
+ *
+ * The full 32/64 bits of the offset are meaningful, so "negative"
+ * positions are legal and cannot be treated as errors; the syscall
+ * return is therefore forced to be seen as successful for absolute
+ * (orig == 0) and relative (orig == 1) seeks.
+ *
+ * Any other origin, including seeking relative to the "end of file",
+ * has no meaning here and returns -EINVAL.
+ */
+static loff_t memory_lseek(struct file * file, loff_t offset, int orig)
+{
+       struct inode *inode = file->f_dentry->d_inode;
+       loff_t ret = -EINVAL;
+
+       down(&inode->i_sem);
+       if (orig == 0 || orig == 1) {
+               if (orig == 0)
+                       file->f_pos = offset;
+               else
+                       file->f_pos += offset;
+               ret = file->f_pos;
+               force_successful_syscall_return();
+       }
+       up(&inode->i_sem);
+       return ret;
+}
+
+/*
+ * Open handler: only callers holding CAP_SYS_RAWIO may open the device.
+ * Returns 0 when permitted, -EPERM otherwise.
+ */
+static int open_mem(struct inode * inode, struct file * filp)
+{
+       if (!capable(CAP_SYS_RAWIO))
+               return -EPERM;
+
+       return 0;
+}
+
+/* File operations built from the physical-memory handlers in this file. */
+struct file_operations mem_fops = {
+       .llseek         = memory_lseek,
+       .read           = read_mem,
+       .write          = write_mem,
+       .mmap           = mmap_mem,
+       .open           = open_mem,
+};
Index: linux-2.6.12-xen0-arch/drivers/xen/core/evtchn.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/drivers/xen/core/evtchn.c
@@ -0,0 +1,698 @@
+/******************************************************************************
+ * evtchn.c
+ * 
+ * Communication via Xen event channels.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <linux/kernel_stat.h>
+#include <linux/version.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/ptrace.h>
+#include <asm-xen/synch_bitops.h>
+#include <asm-xen/xen-public/event_channel.h>
+#include <asm-xen/xen-public/physdev.h>
+#include <asm-xen/ctrl_if.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/evtchn.h>
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+EXPORT_SYMBOL(force_evtchn_callback);
+EXPORT_SYMBOL(evtchn_do_upcall);
+EXPORT_SYMBOL(bind_evtchn_to_irq);
+EXPORT_SYMBOL(unbind_evtchn_from_irq);
+#endif
+
+/*
+ * This lock protects updates to the following mapping and reference-count
+ * arrays. The lock does not need to be acquired to read the mapping tables.
+ */
+static spinlock_t irq_mapping_update_lock;
+
+/* IRQ <-> event-channel mappings. */
+static int evtchn_to_irq[NR_EVENT_CHANNELS];
+static int irq_to_evtchn[NR_IRQS];
+
+/* IRQ <-> VIRQ mapping. */
+DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]);
+
+/* evtchn <-> IPI mapping. */
+#ifndef NR_IPIS
+#define NR_IPIS 1 
+#endif
+DEFINE_PER_CPU(int, ipi_to_evtchn[NR_IPIS]);
+
+/* Reference counts for bindings to IRQs. */
+static int irq_bindcount[NR_IRQS];
+
+/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */
+static unsigned long pirq_needs_unmask_notify[NR_PIRQS/sizeof(unsigned long)];
+
+#ifdef CONFIG_SMP
+
+static u8  cpu_evtchn[NR_EVENT_CHANNELS];
+static u32 cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/32];
+
+#define active_evtchns(cpu,sh,idx)              \
+    ((sh)->evtchn_pending[idx] &                \
+     cpu_evtchn_mask[cpu][idx] &                \
+     ~(sh)->evtchn_mask[idx])
+
+/*
+ * Move event channel 'chn' to 'cpu' in the per-CPU delivery masks: clear its
+ * bit in the mask of the CPU currently recorded in cpu_evtchn[chn], set it in
+ * the mask of 'cpu', then record 'cpu' as the new owner.
+ */
+void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+    clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]);
+    set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]);
+    cpu_evtchn[chn] = cpu;
+}
+
+#else
+
+#define active_evtchns(cpu,sh,idx)              \
+    ((sh)->evtchn_pending[idx] &                \
+     ~(sh)->evtchn_mask[idx])
+
+void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
+{
+    /* Non-SMP build: no per-CPU event-channel masks exist, so nothing to do. */
+}
+#endif
+
+/* Upcall to generic IRQ layer. */
+#ifdef CONFIG_X86
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,9)
+extern fastcall unsigned int do_IRQ(struct pt_regs *regs);
+#else
+extern asmlinkage unsigned int do_IRQ(struct pt_regs *regs);
+#endif
+#if defined (__i386__)
+#define IRQ_REG orig_eax
+#elif defined (__x86_64__)
+#define IRQ_REG orig_rax
+#endif
+#define do_IRQ(irq, regs) do {         \
+    (regs)->IRQ_REG = (irq);           \
+    do_IRQ((regs));                    \
+} while (0)
+#endif
+
+#define VALID_EVTCHN(_chn) ((_chn) >= 0)
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void force_evtchn_callback(void)
+{
+    /* Issued only to enter Xen; the result is deliberately discarded. */
+    (void)HYPERVISOR_xen_version(0);
+}
+
+/* NB. Interrupts are disabled on entry. */
+/*
+ * Event-channel upcall entry point.  Scans the two-level pending bitmap in
+ * the shared-info page and dispatches each active port either to the generic
+ * IRQ layer (when the port is bound to an IRQ) or to evtchn_device_upcall().
+ */
+asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
+{
+    u32           l1, l2;
+    unsigned int   l1i, l2i, port;
+    int            irq, cpu = smp_processor_id();
+    shared_info_t *s = HYPERVISOR_shared_info;
+    vcpu_info_t   *vcpu_info = &s->vcpu_data[cpu];
+
+    vcpu_info->evtchn_upcall_pending = 0;
+    
+    /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+    /* Atomically fetch and clear this VCPU's first-level selector word. */
+    l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
+    while ( l1 != 0 )
+    {
+        /* Each set bit in l1 selects one 32-bit word of the pending bitmap. */
+        l1i = __ffs(l1);
+        l1 &= ~(1 << l1i);
+        
+        /*
+         * l2 is recomputed from the shared page on every iteration, so the
+         * local bit-clear below does not by itself terminate the loop.
+         * NOTE(review): presumably the handler masks/clears the port so it
+         * drops out of active_evtchns() -- confirm.
+         */
+        while ( (l2 = active_evtchns(cpu, s, l1i)) != 0 )
+        {
+            l2i = __ffs(l2);
+            l2 &= ~(1 << l2i);
+            
+            /* Port number = word index * 32 + bit index. */
+            port = (l1i << 5) + l2i;
+            if ( (irq = evtchn_to_irq[port]) != -1 )
+                do_IRQ(irq, regs);
+            else
+                evtchn_device_upcall(port);
+        }
+    }
+}
+
+/*
+ * Return the lowest-numbered IRQ whose bind count is zero.
+ * Panics if every IRQ in [0, NR_IRQS) is already bound.
+ */
+static int find_unbound_irq(void)
+{
+    int candidate = 0;
+
+    while ( (candidate < NR_IRQS) && (irq_bindcount[candidate] != 0) )
+        candidate++;
+
+    if ( candidate == NR_IRQS )
+        panic("No available IRQ to bind to: increase NR_IRQS!\n");
+
+    return candidate;
+}
+
+/*
+ * Return the IRQ bound to 'virq' on the calling CPU, creating the binding
+ * (new event channel from Xen plus a free IRQ) if none exists yet.  Each call
+ * takes one reference on the IRQ.  Panics if Xen refuses the bind.
+ */
+int bind_virq_to_irq(int virq)
+{
+    evtchn_op_t op;
+    int evtchn, irq;
+    int cpu = smp_processor_id();
+
+    spin_lock(&irq_mapping_update_lock);
+
+    /* -1 in the per-CPU table means "no IRQ bound to this VIRQ yet". */
+    if ( (irq = per_cpu(virq_to_irq, cpu)[virq]) == -1 )
+    {
+        op.cmd              = EVTCHNOP_bind_virq;
+        op.u.bind_virq.virq = virq;
+        if ( HYPERVISOR_event_channel_op(&op) != 0 )
+            panic("Failed to bind virtual IRQ %d\n", virq);
+        evtchn = op.u.bind_virq.port;
+
+        /* Record the mapping in both directions. */
+        irq = find_unbound_irq();
+        evtchn_to_irq[evtchn] = irq;
+        irq_to_evtchn[irq]    = evtchn;
+
+        per_cpu(virq_to_irq, cpu)[virq] = irq;
+
+        bind_evtchn_to_cpu(evtchn, cpu);
+    }
+
+    irq_bindcount[irq]++;
+
+    spin_unlock(&irq_mapping_update_lock);
+    
+    return irq;
+}
+
+/*
+ * Drop one reference on the IRQ bound to 'virq' on the calling CPU.  When the
+ * last reference goes, close the event channel in Xen and erase the
+ * IRQ <-> event-channel and VIRQ -> IRQ mappings.  Panics if the close fails.
+ */
+void unbind_virq_from_irq(int virq)
+{
+    evtchn_op_t op;
+    int cpu    = smp_processor_id();
+    int irq    = per_cpu(virq_to_irq, cpu)[virq];
+    int evtchn = irq_to_evtchn[irq];
+
+    spin_lock(&irq_mapping_update_lock);
+
+    if ( --irq_bindcount[irq] == 0 )
+    {
+        op.cmd          = EVTCHNOP_close;
+        op.u.close.dom  = DOMID_SELF;
+        op.u.close.port = evtchn;
+        if ( HYPERVISOR_event_channel_op(&op) != 0 )
+            panic("Failed to unbind virtual IRQ %d\n", virq);
+
+       /* This is a slight hack.  Interdomain ports can be allocated
+          directly by userspace, and at that point they get bound by
+          Xen to vcpu 0.  We therefore need to make sure that if we
+          get an event on an event channel we don't know about vcpu 0
+          handles it.  Binding channels to vcpu 0 when closing them
+          achieves this. */
+       bind_evtchn_to_cpu(evtchn, 0);
+        evtchn_to_irq[evtchn] = -1;
+        irq_to_evtchn[irq]    = -1;
+        per_cpu(virq_to_irq, cpu)[virq]     = -1;
+    }
+
+    spin_unlock(&irq_mapping_update_lock);
+}
+
+/*
+ * Return the IRQ bound to IPI number 'ipi' on the calling CPU, allocating a
+ * new IPI event channel and IRQ if none exists yet.  Each call takes one
+ * reference on the IRQ.  Panics if Xen refuses the bind.
+ */
+int bind_ipi_on_cpu_to_irq(int ipi)
+{
+    evtchn_op_t op;
+    int evtchn, irq;
+    int cpu = smp_processor_id();
+
+    spin_lock(&irq_mapping_update_lock);
+
+    /* An ipi_to_evtchn entry of 0 means "not yet bound". */
+    if ( (evtchn = per_cpu(ipi_to_evtchn, cpu)[ipi]) == 0 )
+    {
+        /* NOTE(review): only op.cmd is set before the hypercall; confirm
+           that EVTCHNOP_bind_ipi takes no input fields in this interface. */
+        op.cmd = EVTCHNOP_bind_ipi;
+        if ( HYPERVISOR_event_channel_op(&op) != 0 )
+            panic("Failed to bind virtual IPI %d on cpu %d\n", ipi, cpu);
+        evtchn = op.u.bind_ipi.port;
+
+        irq = find_unbound_irq();
+        evtchn_to_irq[evtchn] = irq;
+        irq_to_evtchn[irq]    = evtchn;
+
+        per_cpu(ipi_to_evtchn, cpu)[ipi] = evtchn;
+
+        bind_evtchn_to_cpu(evtchn, cpu);
+    } 
+    else
+    {
+       /* Already bound: look the IRQ up from the existing event channel. */
+       irq = evtchn_to_irq[evtchn];
+    }
+
+    irq_bindcount[irq]++;
+
+    spin_unlock(&irq_mapping_update_lock);
+
+    return irq;
+}
+
+/*
+ * Drop one reference on the IRQ bound to IPI number 'ipi' on the calling CPU.
+ * When the last reference goes, close the event channel in Xen and erase the
+ * IRQ <-> event-channel and IPI -> event-channel mappings.  Panics if the
+ * close hypercall fails.
+ */
+void unbind_ipi_from_irq(int ipi)
+{
+    evtchn_op_t op;
+    int cpu    = smp_processor_id();
+    int evtchn = per_cpu(ipi_to_evtchn, cpu)[ipi];
+    /* Translate port -> IRQ; irq_to_evtchn[] is indexed by IRQ, not by port. */
+    int irq    = evtchn_to_irq[evtchn];
+
+    spin_lock(&irq_mapping_update_lock);
+
+    if ( --irq_bindcount[irq] == 0 )
+    {
+        op.cmd          = EVTCHNOP_close;
+        op.u.close.dom  = DOMID_SELF;
+        op.u.close.port = evtchn;
+        if ( HYPERVISOR_event_channel_op(&op) != 0 )
+            panic("Failed to unbind virtual IPI %d on cpu %d\n", ipi, cpu);
+
+        /* See comments in unbind_virq_from_irq */
+        bind_evtchn_to_cpu(evtchn, 0);
+        evtchn_to_irq[evtchn] = -1;
+        irq_to_evtchn[irq]    = -1;
+        /* 0 marks the IPI slot as unbound (see bind_ipi_on_cpu_to_irq). */
+        per_cpu(ipi_to_evtchn, cpu)[ipi] = 0;
+    }
+
+    spin_unlock(&irq_mapping_update_lock);
+}
+
+/*
+ * Return the IRQ bound to event channel 'evtchn', allocating a free IRQ and
+ * recording the mapping if the channel is not yet bound.  Each call takes one
+ * reference on the IRQ.
+ */
+int bind_evtchn_to_irq(int evtchn)
+{
+    int irq;
+
+    spin_lock(&irq_mapping_update_lock);
+
+    if ( (irq = evtchn_to_irq[evtchn]) == -1 )
+    {
+        irq = find_unbound_irq();
+        evtchn_to_irq[evtchn] = irq;
+        irq_to_evtchn[irq]    = evtchn;
+    }
+
+    irq_bindcount[irq]++;
+
+    spin_unlock(&irq_mapping_update_lock);
+    
+    return irq;
+}
+
+/*
+ * Drop one reference on the IRQ bound to 'evtchn'; when the count reaches
+ * zero, erase the mapping in both directions.  No hypercall is made here.
+ */
+void unbind_evtchn_from_irq(int evtchn)
+{
+    int irq = evtchn_to_irq[evtchn];
+
+    spin_lock(&irq_mapping_update_lock);
+
+    if ( --irq_bindcount[irq] == 0 )
+    {
+        evtchn_to_irq[evtchn] = -1;
+        irq_to_evtchn[irq]    = -1;
+    }
+
+    spin_unlock(&irq_mapping_update_lock);
+}
+
+static void do_nothing_function(void *ign)
+{
+    /* Intentionally empty: payload for the NOP IPI in rebind_irq_to_cpu(). */
+}
+
+/* Rebind an evtchn so that it gets delivered to a specific cpu */
+static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+{
+    evtchn_op_t op;
+    int evtchn;
+
+    spin_lock(&irq_mapping_update_lock);
+    evtchn = irq_to_evtchn[irq];
+    /* Nothing to rebind if the IRQ has no event channel behind it. */
+    if (!VALID_EVTCHN(evtchn)) {
+       spin_unlock(&irq_mapping_update_lock);
+       return;
+    }
+
+    /* Tell Xen to send future instances of this interrupt to the
+       other vcpu */
+    op.cmd = EVTCHNOP_bind_vcpu;
+    op.u.bind_vcpu.port = evtchn;
+    op.u.bind_vcpu.vcpu = tcpu;
+
+    /* If this fails, it usually just indicates that we're dealing
+       with a virq or IPI channel, which don't actually need to be
+       rebound.  Ignore it, but don't do the xenlinux-level rebind
+       in that case. */
+    if (HYPERVISOR_event_channel_op(&op) >= 0)
+       bind_evtchn_to_cpu(evtchn, tcpu);
+
+    spin_unlock(&irq_mapping_update_lock);
+
+    /* Now send the new target processor a NOP IPI.  When this
+       returns, it will check for any pending interrupts, and so
+       service any that got delivered to the wrong processor by
+       mistake. */
+    /* XXX: The only time this is called with interrupts disabled is
+       from the hotplug/hotunplug path.  In that case, all cpus are
+       stopped with interrupts disabled, and the missed interrupts
+       will be picked up when they start again.  This is kind of a
+       hack.
+    */
+    if (!irqs_disabled()) {
+       smp_call_function(do_nothing_function, NULL, 0, 0);
+    }
+}
+
+
+/* set_affinity hook: steer the IRQ to the first CPU present in 'dest'. */
+static void set_affinity_irq(unsigned irq, cpumask_t dest)
+{
+    rebind_irq_to_cpu(irq, first_cpu(dest));
+}
+
+/*
+ * Interface to generic handling in irq.c
+ */
+
+/* startup hook: unmask the IRQ's event channel if one is bound; returns 0. */
+static unsigned int startup_dynirq(unsigned int irq)
+{
+    int port = irq_to_evtchn[irq];
+
+    if ( VALID_EVTCHN(port) )
+        unmask_evtchn(port);
+
+    return 0;
+}
+
+/* shutdown hook: mask the IRQ's event channel if one is bound. */
+static void shutdown_dynirq(unsigned int irq)
+{
+    int port = irq_to_evtchn[irq];
+
+    if ( VALID_EVTCHN(port) )
+        mask_evtchn(port);
+}
+
+/*
+ * enable hook: unmask the IRQ's event channel.  An IRQ with no channel bound
+ * (irq_to_evtchn[] == -1) is ignored, so a negative port never reaches the
+ * bit operation.
+ */
+static void enable_dynirq(unsigned int irq)
+{
+    int evtchn = irq_to_evtchn[irq];
+
+    if ( !VALID_EVTCHN(evtchn) )
+        return;
+    unmask_evtchn(evtchn);
+}
+
+/*
+ * disable hook: mask the IRQ's event channel.  An IRQ with no channel bound
+ * (irq_to_evtchn[] == -1) is ignored, so a negative port never reaches the
+ * bit operation.
+ */
+static void disable_dynirq(unsigned int irq)
+{
+    int evtchn = irq_to_evtchn[irq];
+
+    if ( !VALID_EVTCHN(evtchn) )
+        return;
+    mask_evtchn(evtchn);
+}
+
+/*
+ * ack hook: mask the event channel, then clear its pending bit.  An IRQ with
+ * no channel bound (irq_to_evtchn[] == -1) is ignored, so a negative port
+ * never reaches the bit operations.
+ */
+static void ack_dynirq(unsigned int irq)
+{
+    int evtchn = irq_to_evtchn[irq];
+
+    if ( !VALID_EVTCHN(evtchn) )
+        return;
+    mask_evtchn(evtchn);
+    clear_evtchn(evtchn);
+}
+
+/*
+ * end hook: unmask the event channel again unless the IRQ has been disabled
+ * in the meantime.  An IRQ with no channel bound (irq_to_evtchn[] == -1) is
+ * ignored, so a negative port never reaches the bit operation.
+ */
+static void end_dynirq(unsigned int irq)
+{
+    int evtchn = irq_to_evtchn[irq];
+
+    if ( !VALID_EVTCHN(evtchn) )
+        return;
+    if ( !(irq_desc[irq].status & IRQ_DISABLED) )
+        unmask_evtchn(evtchn);
+}
+
+/*
+ * Interrupt-type operations for dynamically bound event-channel IRQs;
+ * installed as irq_desc[].handler for the dynamic IRQ range in init_IRQ().
+ * The initializer is positional.
+ */
+static struct hw_interrupt_type dynirq_type = {
+    "Dynamic-irq",
+    startup_dynirq,
+    shutdown_dynirq,
+    enable_dynirq,
+    disable_dynirq,
+    ack_dynirq,
+    end_dynirq,
+    set_affinity_irq
+};
+
+/*
+ * If 'pirq' is flagged in pirq_needs_unmask_notify, issue
+ * PHYSDEVOP_IRQ_UNMASK_NOTIFY to Xen.  Only op.cmd is filled in and the
+ * hypercall result is ignored.
+ */
+static inline void pirq_unmask_notify(int pirq)
+{
+    physdev_op_t op;
+    if ( unlikely(test_bit(pirq, &pirq_needs_unmask_notify[0])) )
+    {
+        op.cmd = PHYSDEVOP_IRQ_UNMASK_NOTIFY;
+        (void)HYPERVISOR_physdev_op(&op);
+    }
+}
+
+/*
+ * Ask Xen for the status of 'pirq' and cache, in pirq_needs_unmask_notify,
+ * whether the PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY flag was reported.
+ */
+static inline void pirq_query_unmask(int pirq)
+{
+    physdev_op_t op;
+    op.cmd = PHYSDEVOP_IRQ_STATUS_QUERY;
+    op.u.irq_status_query.irq = pirq;
+    /* NOTE(review): the hypercall result is ignored, so 'flags' below is
+       read even if the query failed -- confirm Xen always writes it. */
+    (void)HYPERVISOR_physdev_op(&op);
+    clear_bit(pirq, &pirq_needs_unmask_notify[0]);
+    if ( op.u.irq_status_query.flags & PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY )
+        set_bit(pirq, &pirq_needs_unmask_notify[0]);
+}
+
+/*
+ * On startup, if there is no action associated with the IRQ then we are
+ * probing. In this case we should not share with others as it will confuse us.
+ */
+#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL)
+
+/*
+ * startup hook for physical IRQs: bind the PIRQ to a fresh event channel,
+ * record the mapping, and unmask it.  Always returns 0; a failed bind is
+ * only logged (and only when not probing).
+ */
+static unsigned int startup_pirq(unsigned int irq)
+{
+    evtchn_op_t op;
+    int evtchn;
+
+    op.cmd               = EVTCHNOP_bind_pirq;
+    op.u.bind_pirq.pirq  = irq;
+    /* NB. We are happy to share unless we are probing. */
+    op.u.bind_pirq.flags = probing_irq(irq) ? 0 : BIND_PIRQ__WILL_SHARE;
+    if ( HYPERVISOR_event_channel_op(&op) != 0 )
+    {
+        if ( !probing_irq(irq) ) /* Some failures are expected when probing. */
+            printk(KERN_INFO "Failed to obtain physical IRQ %d\n", irq);
+        return 0;
+    }
+    evtchn = op.u.bind_pirq.port;
+
+    /* Cache whether Xen wants to be notified when this PIRQ is unmasked. */
+    pirq_query_unmask(irq_to_pirq(irq));
+
+    bind_evtchn_to_cpu(evtchn, 0);
+    evtchn_to_irq[evtchn] = irq;
+    irq_to_evtchn[irq]    = evtchn;
+
+    unmask_evtchn(evtchn);
+    pirq_unmask_notify(irq_to_pirq(irq));
+
+    return 0;
+}
+
+/*
+ * shutdown hook for physical IRQs: mask and close the event channel behind
+ * 'irq' and erase the mapping.  No-op if no channel is bound; panics if the
+ * close hypercall fails.
+ */
+static void shutdown_pirq(unsigned int irq)
+{
+    evtchn_op_t op;
+    int evtchn = irq_to_evtchn[irq];
+
+    if ( !VALID_EVTCHN(evtchn) )
+        return;
+
+    mask_evtchn(evtchn);
+
+    op.cmd          = EVTCHNOP_close;
+    op.u.close.dom  = DOMID_SELF;
+    op.u.close.port = evtchn;
+    if ( HYPERVISOR_event_channel_op(&op) != 0 )
+        panic("Failed to unbind physical IRQ %d\n", irq);
+
+    /* Closed channels are handed back to CPU 0's mask. */
+    bind_evtchn_to_cpu(evtchn, 0);
+    evtchn_to_irq[evtchn] = -1;
+    irq_to_evtchn[irq]    = -1;
+}
+
+/* enable hook: unmask the bound event channel and notify Xen if required. */
+static void enable_pirq(unsigned int irq)
+{
+    int port = irq_to_evtchn[irq];
+
+    if ( VALID_EVTCHN(port) )
+    {
+        unmask_evtchn(port);
+        pirq_unmask_notify(irq_to_pirq(irq));
+    }
+}
+
+/* disable hook: mask the bound event channel, if any. */
+static void disable_pirq(unsigned int irq)
+{
+    int port = irq_to_evtchn[irq];
+
+    if ( VALID_EVTCHN(port) )
+        mask_evtchn(port);
+}
+
+/* ack hook: mask the bound event channel, then clear its pending bit. */
+static void ack_pirq(unsigned int irq)
+{
+    int port = irq_to_evtchn[irq];
+
+    if ( VALID_EVTCHN(port) )
+    {
+        mask_evtchn(port);
+        clear_evtchn(port);
+    }
+}
+
+/*
+ * end hook: unless the IRQ is unbound or has been disabled, unmask its event
+ * channel and notify Xen if required.
+ */
+static void end_pirq(unsigned int irq)
+{
+    int port = irq_to_evtchn[irq];
+
+    if ( !VALID_EVTCHN(port) || (irq_desc[irq].status & IRQ_DISABLED) )
+        return;
+
+    unmask_evtchn(port);
+    pirq_unmask_notify(irq_to_pirq(irq));
+}
+
+/*
+ * Interrupt-type operations for physical IRQs; installed as
+ * irq_desc[].handler for the PIRQ range in init_IRQ().  The initializer is
+ * positional.
+ */
+static struct hw_interrupt_type pirq_type = {
+    "Phys-irq",
+    startup_pirq,
+    shutdown_pirq,
+    enable_pirq,
+    disable_pirq,
+    ack_pirq,
+    end_pirq,
+    set_affinity_irq
+};
+
+/*
+ * Prepare the IRQ tables for suspend: forget the event channels behind the
+ * calling CPU's VIRQ bindings (virq_to_irq itself is kept, and irq_resume()
+ * reads it to rebind), and panic if any PIRQ is still bound.
+ */
+void irq_suspend(void)
+{
+    int pirq, virq, irq, evtchn;
+    int cpu = smp_processor_id(); /* XXX */
+
+    /* Unbind VIRQs from event channels. */
+    for ( virq = 0; virq < NR_VIRQS; virq++ )
+    {
+        if ( (irq = per_cpu(virq_to_irq, cpu)[virq]) == -1 )
+            continue;
+        evtchn = irq_to_evtchn[irq];
+
+        /* Mark the event channel as unused in our table. */
+        evtchn_to_irq[evtchn] = -1;
+        irq_to_evtchn[irq]    = -1;
+    }
+
+    /* Check that no PIRQs are still bound. */
+    for ( pirq = 0; pirq < NR_PIRQS; pirq++ )
+        if ( (evtchn = irq_to_evtchn[pirq_to_irq(pirq)]) != -1 )
+            panic("Suspend attempted while PIRQ %d bound to evtchn %d.\n",
+                  pirq, evtchn);
+}
+
+/*
+ * Re-establish VIRQ bindings after resume: mask every event channel, then
+ * for each VIRQ that still has an IRQ recorded on the calling CPU obtain a
+ * new event channel from Xen, record the mapping and unmask it.  Panics if a
+ * bind fails.
+ */
+void irq_resume(void)
+{
+    evtchn_op_t op;
+    int         virq, irq, evtchn;
+    int cpu = smp_processor_id(); /* XXX */
+
+    for ( evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++ )
+        mask_evtchn(evtchn); /* New event-channel space is not 'live' yet. */
+
+    for ( virq = 0; virq < NR_VIRQS; virq++ )
+    {
+        if ( (irq = per_cpu(virq_to_irq, cpu)[virq]) == -1 )
+            continue;
+
+        /* Get a new binding from Xen. */
+        op.cmd              = EVTCHNOP_bind_virq;
+        op.u.bind_virq.virq = virq;
+        if ( HYPERVISOR_event_channel_op(&op) != 0 )
+            panic("Failed to bind virtual IRQ %d\n", virq);
+        evtchn = op.u.bind_virq.port;
+        
+        /* Record the new mapping. */
+       bind_evtchn_to_cpu(evtchn, 0);
+        evtchn_to_irq[evtchn] = irq;
+        irq_to_evtchn[irq]    = evtchn;
+
+        /* Ready for use. */
+        unmask_evtchn(evtchn);
+    }
+}
+
+/*
+ * Boot-time initialisation of the event-channel IRQ layer: reset every
+ * mapping table to "unbound", mask all event channels, install the dynirq
+ * and pirq handler types in irq_desc[], then start the control interface.
+ */
+void __init init_IRQ(void)
+{
+    int i;
+    int cpu;
+
+    irq_ctx_init(0);
+
+    spin_lock_init(&irq_mapping_update_lock);
+
+#ifdef CONFIG_SMP
+    /* By default all event channels notify CPU#0. */
+    memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
+#endif
+
+    for ( cpu = 0; cpu < NR_CPUS; cpu++ ) {
+       /* No VIRQ -> IRQ mappings. */
+       for ( i = 0; i < NR_VIRQS; i++ )
+           per_cpu(virq_to_irq, cpu)[i] = -1;
+    }
+
+    /* No event-channel -> IRQ mappings. */
+    for ( i = 0; i < NR_EVENT_CHANNELS; i++ )
+    {
+        evtchn_to_irq[i] = -1;
+        mask_evtchn(i); /* No event channels are 'live' right now. */
+    }
+
+    /* No IRQ -> event-channel mappings. */
+    for ( i = 0; i < NR_IRQS; i++ )
+        irq_to_evtchn[i] = -1;
+
+    for ( i = 0; i < NR_DYNIRQS; i++ )
+    {
+        /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
+        irq_bindcount[dynirq_to_irq(i)] = 0;
+
+        irq_desc[dynirq_to_irq(i)].status  = IRQ_DISABLED;
+        irq_desc[dynirq_to_irq(i)].action  = 0;
+        irq_desc[dynirq_to_irq(i)].depth   = 1;
+        irq_desc[dynirq_to_irq(i)].handler = &dynirq_type;
+    }
+
+    for ( i = 0; i < NR_PIRQS; i++ )
+    {
+        /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */
+        /* A non-zero count also keeps find_unbound_irq() away from PIRQs. */
+        irq_bindcount[pirq_to_irq(i)] = 1;
+
+        irq_desc[pirq_to_irq(i)].status  = IRQ_DISABLED;
+        irq_desc[pirq_to_irq(i)].action  = 0;
+        irq_desc[pirq_to_irq(i)].depth   = 1;
+        irq_desc[pirq_to_irq(i)].handler = &pirq_type;
+    }
+
+    /* This needs to be done early, but after the IRQ subsystem is alive. */
+    ctrl_if_init();
+}
Index: linux-2.6.12-xen0-arch/drivers/xen/core/fixup.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/drivers/xen/core/fixup.c
@@ -0,0 +1,87 @@
+/******************************************************************************
+ * fixup.c
+ * 
+ * Binary-rewriting of certain IA32 instructions, on notification by Xen.
+ * Used to avoid repeated slow emulation of common instructions used by the
+ * user-space TLS (Thread-Local Storage) libraries.
+ * 
+ * **** NOTE ****
+ *  Issues with the binary rewriting have caused it to be removed. Instead
+ *  we rely on Xen's emulator to boot the kernel, and then print a banner
+ *  message recommending that the user disables /lib/tls.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/version.h>
+
+#define DP(_f, _args...) printk(KERN_ALERT "  " _f "\n" , ## _args )
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define __LINKAGE fastcall
+#else
+#define __LINKAGE asmlinkage
+#endif
+
+/*
+ * Handler for Xen's 4GB-segment notification.  On the first call only, it
+ * disables further notifications, prints a warning banner naming the current
+ * process, and pauses for five seconds.  Later calls do nothing.
+ */
+__LINKAGE void do_fixup_4gb_segment(struct pt_regs *regs, long error_code)
+{
+    static unsigned long printed = 0;
+    char info[100];
+    int i;
+
+    /* test_and_set_bit() makes the banner a one-shot. */
+    if ( !test_and_set_bit(0, &printed) )
+    {
+        HYPERVISOR_vm_assist(VMASST_CMD_disable,
+                            VMASST_TYPE_4gb_segments_notify);
+
+        sprintf(info, "%s (pid=%d)", current->comm, current->tgid);
+
+        DP("");
+        DP("***************************************************************");
+        DP("***************************************************************");
+        DP("** WARNING: Currently emulating unsupported memory accesses  **");
+        DP("**          in /lib/tls libraries. The emulation is very     **");
+        DP("**          slow. To ensure full performance you should      **");
+        DP("**          execute the following as root:                   **");
+        DP("**          mv /lib/tls /lib/tls.disabled                    **");
+        DP("** Offending process: %-38.38s **", info);
+        DP("***************************************************************");
+        DP("***************************************************************");
+        DP("");
+
+        /* Count down 5..1, one second each; the backspaces erase the
+           previous "Pausing... N" text on the console. */
+        for ( i = 5; i > 0; i-- )
+        {
+            printk("Pausing... %d", i);
+            mdelay(1000);
+            printk("\b\b\b\b\b\b\b\b\b\b\b\b");
+        }
+        printk("Continuing...\n\n");
+    }
+}
+
+/* Initcall: ask Xen to enable 4GB-segment notifications.  Always returns 0. */
+static int __init fixup_init(void)
+{
+    HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments_notify);
+    return 0;
+}
+__initcall(fixup_init);
Index: linux-2.6.12-xen0-arch/drivers/xen/core/gnttab.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/drivers/xen/core/gnttab.c
@@ -0,0 +1,396 @@
+/******************************************************************************
+ * gnttab.c
+ * 
+ * Two sets of functionality:
+ * 1. Granting foreign access to our memory reservation.
+ * 2. Accessing others' memory reservations via grant references.
+ * (i.e., mechanisms for both sender and recipient of grant references)
+ * 
+ * Copyright (c) 2005, Christopher Clark
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <asm/uaccess.h>
+#include <asm-xen/xen_proc.h>
+#include <asm-xen/linux-public/privcmd.h>
+#include <asm-xen/gnttab.h>
+#include <asm-xen/synch_bitops.h>
+
+#if 1
+#define ASSERT(_p) \
+    if ( !(_p) ) { printk(KERN_ALERT"Assertion '%s': line %d, file %s\n", \
+    #_p , __LINE__, __FILE__); *(int*)0=0; }
+#else
+#define ASSERT(_p) ((void)0)
+#endif
+
+#define WPRINTK(fmt, args...) \
+    printk(KERN_WARNING "xen_grant: " fmt, ##args)
+
+
+EXPORT_SYMBOL(gnttab_grant_foreign_access);
+EXPORT_SYMBOL(gnttab_end_foreign_access);
+EXPORT_SYMBOL(gnttab_query_foreign_access);
+EXPORT_SYMBOL(gnttab_grant_foreign_transfer);
+EXPORT_SYMBOL(gnttab_end_foreign_transfer);
+EXPORT_SYMBOL(gnttab_alloc_grant_references);
+EXPORT_SYMBOL(gnttab_free_grant_references);
+EXPORT_SYMBOL(gnttab_claim_grant_reference);
+EXPORT_SYMBOL(gnttab_release_grant_reference);
+EXPORT_SYMBOL(gnttab_grant_foreign_access_ref);
+EXPORT_SYMBOL(gnttab_grant_foreign_transfer_ref);
+
+static grant_ref_t gnttab_free_list[NR_GRANT_ENTRIES];
+static grant_ref_t gnttab_free_head;
+
+static grant_entry_t *shared;
+
+/*
+ * Lock-free grant-entry allocator
+ */
+
+/*
+ * Pop the head of the grant-reference free list with a cmpxchg retry loop.
+ * Returns the reference, or -1 when the head equals NR_GRANT_ENTRIES (the
+ * end-of-list marker, i.e. no free entries).
+ */
+static inline int
+get_free_entry(
+    void)
+{
+    grant_ref_t fh, nfh = gnttab_free_head;
+    /* Retry until the head is swung from fh to its successor unchanged. */
+    do { if ( unlikely((fh = nfh) == NR_GRANT_ENTRIES) ) return -1; }
+    while ( unlikely((nfh = cmpxchg(&gnttab_free_head, fh,
+                                    gnttab_free_list[fh])) != fh) );
+    return fh;
+}
+
+/*
+ * Push 'ref' onto the head of the grant-reference free list with a cmpxchg
+ * retry loop.
+ */
+static inline void
+put_free_entry(
+    grant_ref_t ref)
+{
+    grant_ref_t fh, nfh = gnttab_free_head;
+    /* Link ref -> current head first; wmb() orders that store before the
+       cmpxchg that publishes ref as the new head. */
+    do { gnttab_free_list[ref] = fh = nfh; wmb(); }
+    while ( unlikely((nfh = cmpxchg(&gnttab_free_head, fh, ref)) != fh) );
+}
+
+/*
+ * Public grant-issuing interface functions
+ */
+
+/*
+ * Allocate a grant reference and fill its shared entry to permit 'domid'
+ * access to 'frame' (read-only if 'readonly' is non-zero).
+ * Returns the reference, or -ENOSPC if no entry is free.
+ */
+int
+gnttab_grant_foreign_access(
+    domid_t domid, unsigned long frame, int readonly)
+{
+    int ref;
+    
+    if ( unlikely((ref = get_free_entry()) == -1) )
+        return -ENOSPC;
+
+    shared[ref].frame = frame;
+    shared[ref].domid = domid;
+    /* frame/domid must be written before flags makes the entry valid. */
+    wmb();
+    shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
+
+    return ref;
+}
+
+/*
+ * As gnttab_grant_foreign_access(), but fills the caller-supplied entry 'ref'
+ * instead of allocating one.
+ */
+void
+gnttab_grant_foreign_access_ref(
+    grant_ref_t ref, domid_t domid, unsigned long frame, int readonly)
+{
+    shared[ref].frame = frame;
+    shared[ref].domid = domid;
+    /* frame/domid must be written before flags makes the entry valid. */
+    wmb();
+    shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0);
+}
+
+
+/*
+ * Report whether grant entry 'ref' is currently in use: returns the
+ * GTF_reading/GTF_writing bits of its flags word (zero when neither is set).
+ */
+int
+gnttab_query_foreign_access( grant_ref_t ref )
+{
+    u16 entry_flags = shared[ref].flags;
+
+    return ( entry_flags & (GTF_reading|GTF_writing) );
+}
+
+/*
+ * Revoke the access grant 'ref': atomically zero its flags word (warning if
+ * the entry is still marked as being read or written) and return the entry
+ * to the free list.  'readonly' is not used by this implementation.
+ */
+void
+gnttab_end_foreign_access( grant_ref_t ref, int readonly )
+{
+    u16 flags, nflags;
+
+    nflags = shared[ref].flags;
+    /* Retry the cmpxchg until flags is cleared from an unchanged value. */
+    do {
+        if ( (flags = nflags) & (GTF_reading|GTF_writing) )
+            printk(KERN_ALERT "WARNING: g.e. still in use!\n");
+    }
+    while ( (nflags = synch_cmpxchg(&shared[ref].flags, flags, 0)) != flags );
+
+    put_free_entry(ref);
+}
+
+/*
+ * Allocate a grant reference and mark it as accepting a page transfer from
+ * 'domid', storing 'pfn' in the entry's frame field.
+ * Returns the reference, or -ENOSPC if no entry is free.
+ */
+int
+gnttab_grant_foreign_transfer(
+    domid_t domid, unsigned long pfn )
+{
+    int ref;
+
+    if ( unlikely((ref = get_free_entry()) == -1) )
+        return -ENOSPC;
+
+    shared[ref].frame = pfn;
+    shared[ref].domid = domid;
+    /* frame/domid must be written before flags makes the entry valid. */
+    wmb();
+    shared[ref].flags = GTF_accept_transfer;
+
+    return ref;
+}
+
+/*
+ * As gnttab_grant_foreign_transfer(), but fills the caller-supplied entry
+ * 'ref' instead of allocating one.
+ */
+void
+gnttab_grant_foreign_transfer_ref(
+    grant_ref_t ref, domid_t domid, unsigned long pfn )
+{
+    shared[ref].frame = pfn;
+    shared[ref].domid = domid;
+    /* frame/domid must be written before flags makes the entry valid. */
+    wmb();
+    shared[ref].flags = GTF_accept_transfer;
+}
+
+/*
+ * Finish with transfer grant 'ref' and return it to the free list.
+ * Returns the frame number read from the entry once it becomes non-zero, or
+ * 0 if the entry was still plain GTF_accept_transfer and was invalidated.
+ */
+unsigned long
+gnttab_end_foreign_transfer(
+    grant_ref_t ref)
+{
+    unsigned long frame = 0;
+    u16           flags;
+
+    flags = shared[ref].flags;
+#ifdef CONFIG_XEN_NETDEV_GRANT_RX
+    /*
+     * But can't flags == (GTF_accept_transfer | GTF_transfer_completed)
+     * if gnttab_donate executes without interruption???
+     */
+#else
+    ASSERT(flags == (GTF_accept_transfer | GTF_transfer_committed));
+#endif
+    /*
+     * If a transfer is committed then wait for the frame address to appear.
+     * Otherwise invalidate the grant entry against future use.
+     */
+    /* Spin only if flags had changed from GTF_accept_transfer at the first
+       read, or changed before the cmpxchg could zero it. */
+    if ( likely(flags != GTF_accept_transfer) ||
+         (synch_cmpxchg(&shared[ref].flags, flags, 0) != GTF_accept_transfer) )
+        while ( unlikely((frame = shared[ref].frame) == 0) )
+            cpu_relax();
+
+    put_free_entry(ref);
+
+    return frame;
+}
+
+/*
+ * Return 'count' grant references, chained through gnttab_free_list[] and
+ * starting at 'head', to the global free list one at a time.
+ */
+void
+gnttab_free_grant_references( u16 count, grant_ref_t head )
+{
+    /* TODO: O(N)...? */
+    grant_ref_t to_die = 0, next = head;
+    int i;
+
+    for ( i = 0; i < count; i++ )
+    {
+        to_die = next;
+        /* Read the successor first: put_free_entry() overwrites
+           gnttab_free_list[to_die]. */
+        next = gnttab_free_list[next];
+        put_free_entry( to_die );
+    }
+}
+
+/*
+ * Reserve 'count' grant references as a private chain.  On success '*head'
+ * is the free-list head seen on entry, '*terminal' is the head after the
+ * pops, and 0 is returned.  If the list runs out, the head is reset to its
+ * entry value and -ENOSPC is returned.
+ *
+ * NOTE(review): the head snapshot, the pops and the failure-path reset are
+ * not one atomic step; presumably callers are serialised against other users
+ * of the free list -- confirm.
+ */
+int
+gnttab_alloc_grant_references( u16 count,
+                               grant_ref_t *head,
+                               grant_ref_t *terminal )
+{
+    int i;
+    grant_ref_t h = gnttab_free_head;
+
+    for ( i = 0; i < count; i++ )
+        if ( unlikely(get_free_entry() == -1) )
+            goto not_enough_refs;
+
+    *head = h;
+    *terminal = gnttab_free_head;
+
+    return 0;
+
+not_enough_refs:
+    gnttab_free_head = h;
+    return -ENOSPC;
+}
+
+/*
+ * Take the first reference off a private chain.  Returns it and advances
+ * '*private_head' to its successor, or returns -ENOSPC when the chain has
+ * reached 'terminal' (nothing left).
+ */
+int
+gnttab_claim_grant_reference( grant_ref_t *private_head,
+                              grant_ref_t  terminal )
+{
+    grant_ref_t claimed = *private_head;
+
+    if ( unlikely(claimed == terminal) )
+        return -ENOSPC;
+
+    *private_head = gnttab_free_list[claimed];
+
+    return claimed;
+}
+
+/*
+ * Push 'release' back onto the front of a private chain: link it to the
+ * current head, then make it the new head.
+ */
+void
+gnttab_release_grant_reference( grant_ref_t *private_head,
+                                grant_ref_t  release )
+{
+    gnttab_free_list[release] = *private_head;
+    *private_head = release;
+}
+
+/*
+ * ProcFS operations
+ */
+
+#ifdef CONFIG_PROC_FS
+
+static struct proc_dir_entry *grant_pde;
+
+/*
+ * ioctl handler for the grant proc entry.  It is currently disabled: the
+ * unconditional return below makes it always fail with -ENOSYS, and the
+ * hypercall pass-through after it is unreachable.
+ */
+static int grant_ioctl(struct inode *inode, struct file *file,
+                       unsigned int cmd, unsigned long data)
+{
+    int                     ret;
+    privcmd_hypercall_t     hypercall;
+
+    /* XXX Need safety checks here if using for anything other
+     *     than debugging */
+    return -ENOSYS;
+
+    /* ---- unreachable from here on ---- */
+    if ( cmd != IOCTL_PRIVCMD_HYPERCALL )
+        return -ENOSYS;
+
+    if ( copy_from_user(&hypercall, (void *)data, sizeof(hypercall)) )
+        return -EFAULT;
+
+    if ( hypercall.op != __HYPERVISOR_grant_table_op )
+        return -ENOSYS;
+
+    /* hypercall-invoking asm taken from privcmd.c */
+    /* %eax enters holding &hypercall: the words at offsets 4..20 are loaded
+       into ebx, ecx, edx, esi, edi and the word at offset 0 into eax before
+       TRAP_INSTR.  The five registers are saved and restored around it; the
+       value left in eax becomes 'ret'. */
+    __asm__ __volatile__ (
+        "pushl %%ebx; pushl %%ecx; pushl %%edx; pushl %%esi; pushl %%edi; "
+        "movl  4(%%eax),%%ebx ;"
+        "movl  8(%%eax),%%ecx ;"
+        "movl 12(%%eax),%%edx ;"
+        "movl 16(%%eax),%%esi ;"
+        "movl 20(%%eax),%%edi ;"
+        "movl   (%%eax),%%eax ;"
+        TRAP_INSTR "; "
+        "popl %%edi; popl %%esi; popl %%edx; popl %%ecx; popl %%ebx"
+        : "=a" (ret) : "0" (&hypercall) : "memory" );
+
+    return ret;
+}
+
+/* File operations for the grant proc entry; read/write are filled in by
+   gnttab_init(). */
+static struct file_operations grant_file_ops = {
+    .ioctl = grant_ioctl,
+};
+
+/*
+ * read_proc handler for the grant proc entry: writes one line into 'page'
+ * for every grant entry whose flags word is non-zero.  Output stops with
+ * "Truncated." once fewer than 200 bytes of the page remain.
+ * Sets *eof and returns the number of bytes written.
+ */
+static int grant_read(char *page, char **start, off_t off,
+                      int count, int *eof, void *data)
+{
+    int             len;
+    unsigned int    i;
+    grant_entry_t  *gt;
+
+    gt = (grant_entry_t *)shared;
+    len = 0;
+
+    /* The braces are required: both the truncation check and the per-entry
+       dump must run for every i below NR_GRANT_ENTRIES. */
+    for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
+    {
+        /* TODO: safety catch here until this can handle >PAGE_SIZE output */
+        if (len > (PAGE_SIZE - 200))
+        {
+            len += sprintf( page + len, "Truncated.\n");
+            break;
+        }
+
+        if ( gt[i].flags )
+            len += sprintf( page + len,
+                    "Grant: ref (0x%x) flags (0x%hx) dom (0x%hx) frame (0x%x)\n",
+                    i,
+                    gt[i].flags,
+                    gt[i].domid,
+                    gt[i].frame );
+    }
+
+    *eof = 1;
+    return len;
+}
+
+/* write_proc handler for the grant proc entry: not implemented, always
+   returns -ENOSYS. */
+static int grant_write(struct file *file, const char __user *buffer,
+                       unsigned long count, void *data)
+{
+    /* TODO: implement this */
+    return -ENOSYS;
+}
+
+#endif /* CONFIG_PROC_FS */
+
+/*
+ * Ask Xen for this domain's grant-table frames and map each one into the
+ * fixmap slots counting down from FIX_GNTTAB_END.  BUGs if the setup
+ * hypercall fails or reports a non-zero status.  Always returns 0.
+ */
+int gnttab_resume(void)
+{
+    gnttab_setup_table_t setup;
+    unsigned long        frames[NR_GRANT_FRAMES];
+    int                  i;
+
+    setup.dom        = DOMID_SELF;
+    setup.nr_frames  = NR_GRANT_FRAMES;
+    setup.frame_list = frames;
+
+    BUG_ON(HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1) != 0);
+    BUG_ON(setup.status != 0);
+
+    /* frames[] holds frame numbers; shift them up to addresses. */
+    for ( i = 0; i < NR_GRANT_FRAMES; i++ )
+        set_fixmap(FIX_GNTTAB_END - i, frames[i] << PAGE_SHIFT);
+
+    return 0;
+}
+
+/* Undo gnttab_resume(): clear the grant-table fixmap slots.  Returns 0. */
+int gnttab_suspend(void)
+{
+    int i;
+
+    for ( i = 0; i < NR_GRANT_FRAMES; i++ )
+       clear_fixmap(FIX_GNTTAB_END - i);
+
+    return 0;
+}
+
+/*
+ * Initcall: map the grant table, build the free list and, with
+ * CONFIG_PROC_FS, create the "grant" Xen proc entry.
+ * Returns 0 on success, -1 if the proc entry cannot be created.
+ */
+static int __init gnttab_init(void)
+{
+    int i;
+
+    BUG_ON(gnttab_resume());
+
+    shared = (grant_entry_t *)fix_to_virt(FIX_GNTTAB_END);
+
+    /* Chain entry i to i+1; the last entry points at NR_GRANT_ENTRIES,
+       which get_free_entry() treats as "list exhausted". */
+    for ( i = 0; i < NR_GRANT_ENTRIES; i++ )
+        gnttab_free_list[i] = i + 1;
+    
+#ifdef CONFIG_PROC_FS
+    /*
+     *  /proc/xen/grant : used by libxc to access grant tables
+     */
+    if ( (grant_pde = create_xen_proc_entry("grant", 0600)) == NULL )
+    {
+        WPRINTK("Unable to create grant xen proc entry\n");
+        return -1;
+    }
+
+    /* Keep the entry's existing read/write fops, then install our own
+       fops table so that the ioctl handler is added. */
+    grant_file_ops.read   = grant_pde->proc_fops->read;
+    grant_file_ops.write  = grant_pde->proc_fops->write;
+
+    grant_pde->proc_fops  = &grant_file_ops;
+
+    grant_pde->read_proc  = &grant_read;
+    grant_pde->write_proc = &grant_write;
+#endif
+
+    printk("Grant table initialized\n");
+    return 0;
+}
+
+__initcall(gnttab_init);
Index: linux-2.6.12-xen0-arch/drivers/xen/core/Makefile
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/drivers/xen/core/Makefile
@@ -0,0 +1,5 @@
+obj-y   := ctrl_if.o evtchn.o fixup.o reboot.o gnttab.o devmem.o
+
+obj-$(CONFIG_PROC_FS) += xen_proc.o
+obj-$(CONFIG_NET)     += skbuff.o
+obj-$(CONFIG_SMP)     += smp.o
Index: linux-2.6.12-xen0-arch/drivers/xen/core/reboot.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/drivers/xen/core/reboot.c
@@ -0,0 +1,269 @@
+
+#define __KERNEL_SYSCALLS__
+static int errno;
+#include <linux/errno.h>
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/unistd.h>
+#include <linux/module.h>
+#include <linux/reboot.h>
+#include <linux/sysrq.h>
+#include <asm/irq.h>
+#include <asm/mmu_context.h>
+#include <asm-xen/ctrl_if.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/xen-public/dom0_ops.h>
+#include <asm-xen/linux-public/suspend.h>
+#include <asm-xen/queues.h>
+
+void machine_restart(char * __unused)
+{
+       /* We really want to get pending console data out before we die. */
+       extern void xencons_force_flush(void);
+       xencons_force_flush();
+       HYPERVISOR_reboot(); /* the command-string argument is not consulted */
+}
+
+void machine_halt(void)
+{
+       machine_power_off(); /* halting is handled exactly like powering off */
+}
+
+void machine_power_off(void)
+{
+       /* We really want to get pending console data out before we die. */
+       extern void xencons_force_flush(void);
+       xencons_force_flush();
+       HYPERVISOR_shutdown(); /* NOTE(review): the result is not checked */
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+int reboot_thru_bios = 0;      /* for dmi_scan.c */
+EXPORT_SYMBOL(machine_restart);
+EXPORT_SYMBOL(machine_halt);
+EXPORT_SYMBOL(machine_power_off);
+#endif
+
+
+/******************************************************************************
+ * Stop/pickle callback handling.
+ */
+
+/* Ignore multiple shutdown requests. */
+static int shutting_down = -1;
+
+/*
+ * Suspend this domain: quiesce the frontends and core subsystems, pass a
+ * suspend record to Xen, then rebuild the shared-info state and resume the
+ * subsystems in reverse order.  If the record cannot be allocated the request
+ * is dropped and shutting_down is reset so that a later request can retry.
+ */
+static void __do_suspend(void)
+{
+    int i, j;
+    suspend_record_t *suspend_record;
+
+    /* Hmmm... a cleaner interface to suspend/resume blkdevs would be nice. */
+    /* XXX SMH: yes it would :-( */
+#ifdef CONFIG_XEN_BLKDEV_FRONTEND
+    extern void blkdev_suspend(void);
+    extern void blkdev_resume(void);
+#else
+#define blkdev_suspend() do{}while(0)
+#define blkdev_resume()  do{}while(0)
+#endif
+
+#ifdef CONFIG_XEN_NETDEV_FRONTEND
+    extern void netif_suspend(void);
+    extern void netif_resume(void);
+#else
+#define netif_suspend() do{}while(0)
+#define netif_resume()  do{}while(0)
+#endif
+
+#ifdef CONFIG_XEN_USB_FRONTEND
+    extern void usbif_resume(void);
+#else
+#define usbif_resume() do{}while(0)
+#endif
+
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    extern int gnttab_suspend(void);
+    extern int gnttab_resume(void);
+#else
+#define gnttab_suspend() do{}while(0)
+#define gnttab_resume()  do{}while(0)
+#endif
+
+    extern void time_suspend(void);
+    extern void time_resume(void);
+    extern unsigned long max_pfn;
+    extern unsigned int *pfn_to_mfn_frame_list;
+
+    suspend_record = (suspend_record_t *)__get_free_page(GFP_KERNEL);
+    if ( suspend_record == NULL )
+    {
+        /* Otherwise every later shutdown request would be ignored. */
+        shutting_down = -1;
+        return;
+    }
+
+    suspend_record->nr_pfns = max_pfn; /* final number of pfns */
+
+    __cli();
+
+#ifdef __i386__
+    mm_pin_all();
+    kmem_cache_shrink(pgd_cache);
+#endif
+
+    /* Quiesce subsystems; the resume calls below run in the reverse order. */
+    netif_suspend();
+    blkdev_suspend();
+    time_suspend();
+    ctrl_if_suspend();
+    irq_suspend();
+    gnttab_suspend();
+
+    /* Let a dummy page stand in for shared_info while it is unmapped. */
+    HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+    clear_fixmap(FIX_SHARED_INFO);
+
+    memcpy(&suspend_record->resume_info, &xen_start_info,
+           sizeof(xen_start_info));
+
+    /* Everything after this hypercall is the resume path. */
+    HYPERVISOR_suspend(virt_to_machine(suspend_record) >> PAGE_SHIFT);
+
+    shutting_down = -1;
+
+    memcpy(&xen_start_info, &suspend_record->resume_info,
+           sizeof(xen_start_info));
+
+    set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info);
+    HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+
+    /* Scrub the page that stood in for shared_info during the suspend. */
+    memset(empty_zero_page, 0, PAGE_SIZE);
+
+    /* Republish the machine frames backing phys_to_machine_mapping[]. */
+    for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ )
+    {
+        pfn_to_mfn_frame_list[j] =
+            virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT;
+    }
+    HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list =
+        virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT;
+
+    gnttab_resume();
+    irq_resume();
+    ctrl_if_resume();
+    time_resume();
+    blkdev_resume();
+    netif_resume();
+    usbif_resume();
+
+    __sti();
+    free_page((unsigned long)suspend_record);
+}
+
+static int shutdown_process(void *__unused)
+{
+    static char *envp[] = { "HOME=/", "TERM=linux",
+                            "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };
+    static char *restart_argv[]  = { "/sbin/reboot", NULL };
+    static char *poweroff_argv[] = { "/sbin/poweroff", NULL };
+    const int    request = shutting_down;   /* sampled once, like a switch */
+    char       **helper_argv = NULL;        /* userspace tool to try first */
+    unsigned int reboot_cmd  = 0;           /* sys_reboot() fallback command */
+
+    extern asmlinkage long sys_reboot(int magic1, int magic2,
+                                      unsigned int cmd, void *arg);
+
+    daemonize(
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+        "shutdown"
+#endif
+        );
+
+    if ( request == CMSG_SHUTDOWN_POWEROFF )
+    {
+        helper_argv = poweroff_argv;
+        reboot_cmd  = LINUX_REBOOT_CMD_POWER_OFF;
+    }
+    else if ( request == CMSG_SHUTDOWN_REBOOT )
+    {
+        helper_argv = restart_argv;
+        reboot_cmd  = LINUX_REBOOT_CMD_RESTART;
+    }
+
+    /*
+     * Prefer the userspace helper; if it cannot be exec'd, fall back to
+     * calling sys_reboot() directly.  Any other request does nothing here.
+     */
+    if ( (helper_argv != NULL) &&
+         (execve(helper_argv[0], helper_argv, envp) < 0) )
+        sys_reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, reboot_cmd, NULL);
+
+    /* could try again */
+    shutting_down = -1;
+
+    return 0;
+}
+
+static void __shutdown_handler(void *unused)
+{
+    /* Runs from the work queue: suspend in place, else spawn the helper. */
+    if ( shutting_down == CMSG_SHUTDOWN_SUSPEND )
+    {
+        __do_suspend();
+        return;
+    }
+
+    if ( kernel_thread(shutdown_process, NULL, CLONE_FS | CLONE_FILES) < 0 )
+    {
+        printk(KERN_ALERT "Error creating shutdown process!\n");
+        shutting_down = -1; /* no thread will reset it; allow a retry */
+    }
+}
+
+static void shutdown_handler(ctrl_msg_t *msg, unsigned long id)
+{
+    static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL);
+    /* CMSG_SHUTDOWN receiver: act on the message, then always acknowledge. */
+    if ( msg->subtype == CMSG_SHUTDOWN_SYSRQ )
+    {
+#ifdef CONFIG_MAGIC_SYSRQ
+        /* Declared inside the #ifdef so it is never an unused variable. */
+        int sysrq = ((shutdown_sysrq_t *)&msg->msg[0])->key;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+        handle_sysrq(sysrq, NULL, NULL);
+#else
+        handle_sysrq(sysrq, NULL, NULL, NULL);
+#endif
+#endif
+    }
+    else if ( (shutting_down == -1) &&
+         ((msg->subtype == CMSG_SHUTDOWN_POWEROFF) ||
+          (msg->subtype == CMSG_SHUTDOWN_REBOOT) ||
+          (msg->subtype == CMSG_SHUTDOWN_SUSPEND)) )
+    {
+        shutting_down = msg->subtype; /* consumed later by the work item */
+        schedule_work(&shutdown_work);
+    }
+    else /* a request is already pending, or the subtype is not handled */
+    {
+        printk("Ignore spurious shutdown request\n");
+    }
+
+    ctrl_if_send_response(msg);
+}
+
+static int __init setup_shutdown_event(void)
+{
+    ctrl_if_register_receiver(CMSG_SHUTDOWN, shutdown_handler, 0);
+    return 0; /* NOTE(review): the registration result is not checked */
+}
+
+__initcall(setup_shutdown_event);
Index: linux-2.6.12-xen0-arch/drivers/xen/core/skbuff.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/drivers/xen/core/skbuff.c
@@ -0,0 +1,47 @@
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <asm/io.h>
+#include <asm/page.h>
+
+EXPORT_SYMBOL(__dev_alloc_skb);
+
+/* Referenced in netback.c. */
+/*static*/ kmem_cache_t *skbuff_cachep;
+
+/* Size must be cacheline-aligned (alloc_skb uses SKB_DATA_ALIGN). */
+#define XEN_SKB_SIZE \
+    ((PAGE_SIZE - sizeof(struct skb_shared_info)) & ~(SMP_CACHE_BYTES - 1))
+
+struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask)
+{
+    struct sk_buff *buf =
+        alloc_skb_from_cache(skbuff_cachep, length + 16, gfp_mask);
+    if ( likely(buf != NULL) )
+        skb_reserve(buf, 16); /* turn the 16 extra bytes into headroom */
+    return buf;
+}
+
+static void skbuff_ctor(void *buf, kmem_cache_t *cachep, unsigned long unused)
+{
+    scrub_pages(buf, 1); /* cache constructor: scrub one page per object */
+}
+
+static int __init skbuff_init(void)
+{
+    skbuff_cachep = kmem_cache_create(       /* page-sized, page-aligned */
+        "xen-skb", PAGE_SIZE, PAGE_SIZE, 0, skbuff_ctor, NULL);
+    return (skbuff_cachep != NULL) ? 0 : -ENOMEM; /* report a failed create */
+}
+__initcall(skbuff_init);
Index: linux-2.6.12-xen0-arch/drivers/xen/core/smp.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/drivers/xen/core/smp.c
@@ -0,0 +1,16 @@
+/* Copyright (C) 2004, Christian Limpach */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/threads.h>
+
+/*
+ * the frequency of the profiling timer can be changed
+ * by writing a multiplier value into /proc/profile.
+ */
+int setup_profiling_timer(unsigned int multiplier)
+{
+       printk("setup_profiling_timer\n");
+       /* Stub: multiplier is ignored and success is always reported. */
+       return 0;
+}
Index: linux-2.6.12-xen0-arch/drivers/xen/core/xen_proc.c
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/drivers/xen/core/xen_proc.c
@@ -0,0 +1,18 @@
+
+#include <linux/config.h>
+#include <linux/proc_fs.h>
+
+static struct proc_dir_entry *xen_base;
+
+struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode)
+{
+    /* Create /proc/xen lazily on first use; failing to do so is fatal. */
+    if ( !xen_base && !(xen_base = proc_mkdir("xen", &proc_root)) )
+        panic("Couldn't create /proc/xen");
+    return create_proc_entry(name, mode, xen_base);
+}
+
+void remove_xen_proc_entry(const char *name)
+{
+    remove_proc_entry(name, xen_base); /* NOTE(review): xen_base may be NULL */
+}
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/ctrl_if.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/ctrl_if.h
@@ -0,0 +1,160 @@
+/******************************************************************************
+ * ctrl_if.h
+ * 
+ * Management functions for special interface to the domain controller.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __ASM_XEN__CTRL_IF_H__
+#define __ASM_XEN__CTRL_IF_H__
+
+#include <hypervisor.h>
+#include <queues.h>
+
+typedef control_msg_t ctrl_msg_t;
+
+/*
+ * Callback function type. Called for asynchronous processing of received
+ * request messages, and responses to previously-transmitted request messages.
+ * The parameters are (@msg, @id).
+ *  @msg: Original request/response message (not a copy). The message can be
+ *        modified in-place by the handler (e.g., a response callback can
+ *        turn a request message into a response message in place). The message
+ *        is no longer accessible after the callback handler returns -- if the
+ *        message is required to persist for longer then it must be copied.
+ *  @id:  (Response callbacks only) The 'id' that was specified when the
+ *        original request message was queued for transmission.
+ */
+typedef void (*ctrl_msg_handler_t)(ctrl_msg_t *, unsigned long);
+
+/*
+ * Send @msg to the domain controller. Execute @hnd when a response is
+ * received, passing the response message and the specified @id. This
+ * operation will not block: it will return -EAGAIN if there is no space.
+ * Notes:
+ *  1. The @msg is copied if it is transmitted and so can be freed after this
+ *     function returns.
+ *  2. If @hnd is NULL then no callback is executed.
+ */
+int
+ctrl_if_send_message_noblock(
+    ctrl_msg_t *msg, 
+    ctrl_msg_handler_t hnd,
+    unsigned long id);
+
+/*
+ * Send @msg to the domain controller. Execute @hnd when a response is
+ * received, passing the response message and the specified @id. This
+ * operation will block until the message is sent, or a signal is received
+ * for the calling process (unless @wait_state is TASK_UNINTERRUPTIBLE).
+ * Notes:
+ *  1. The @msg is copied if it is transmitted and so can be freed after this
+ *     function returns.
+ *  2. If @hnd is NULL then no callback is executed.
+ */
+int
+ctrl_if_send_message_block(
+    ctrl_msg_t *msg, 
+    ctrl_msg_handler_t hnd, 
+    unsigned long id, 
+    long wait_state);
+
+/*
+ * Send @msg to the domain controller. Block until the response is received,
+ * and then copy it into the provided buffer, @rmsg.
+ */
+int
+ctrl_if_send_message_and_get_response(
+    ctrl_msg_t *msg,
+    ctrl_msg_t *rmsg,
+    long wait_state);
+
+/*
+ * Request a callback when there is /possibly/ space to immediately send a
+ * message to the domain controller. This function returns 0 if there is
+ * already space to transmit a message --- in this case the callback task /may/
+ * still be executed. If this function returns 1 then the callback /will/ be
+ * executed when space becomes available.
+ */
+int
+ctrl_if_enqueue_space_callback(
+    struct tq_struct *task);
+
+/*
+ * Send a response (@msg) to a message from the domain controller. This will 
+ * never block.
+ * Notes:
+ *  1. The @msg is copied and so can be freed after this function returns.
+ *  2. The @msg may be the original request message, modified in-place.
+ */
+void
+ctrl_if_send_response(
+    ctrl_msg_t *msg);
+
+/*
+ * Register a receiver for typed messages from the domain controller. The 
+ * handler (@hnd) is called for every received message of specified @type.
+ * Returns TRUE (non-zero) if the handler was successfully registered.
+ * If CALLBACK_IN_BLOCKING_CONTEXT is specified in @flags then callbacks will
+ * occur in a context in which it is safe to yield (i.e., process context).
+ */
+#define CALLBACK_IN_BLOCKING_CONTEXT 1
+int ctrl_if_register_receiver(
+    u8 type, 
+    ctrl_msg_handler_t hnd,
+    unsigned int flags);
+
+/*
+ * Unregister a receiver for typed messages from the domain controller. The 
+ * handler (@hnd) will not be executed after this function returns.
+ */
+void
+ctrl_if_unregister_receiver(
+    u8 type, ctrl_msg_handler_t hnd);
+
+/* Suspend/resume notifications. */
+void ctrl_if_suspend(void);
+void ctrl_if_resume(void);
+
+/* Start-of-day setup. */
+void ctrl_if_init(void);
+
+/*
+ * Returns TRUE if there are no outstanding message requests at the domain
+ * controller. This can be used to ensure that messages have really flushed
+ * through when it is not possible to use the response-callback interface.
+ * WARNING: If other subsystems are using the control interface then this
+ * function might never return TRUE!
+ */
+int ctrl_if_transmitter_empty(void);  /* !! DANGEROUS FUNCTION !! */
+
+/*
+ * Manually discard response messages from the domain controller. 
+ * WARNING: This is usually done automatically -- this function should only
+ * be called when normal interrupt mechanisms are disabled!
+ */
+void ctrl_if_discard_responses(void); /* !! DANGEROUS FUNCTION !! */
+
+#endif /* __ASM_XEN__CTRL_IF_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/hypercall.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/hypercall.h
@@ -0,0 +1,564 @@
+/******************************************************************************
+ * hypercall.h
+ * 
+ * Linux-specific hypervisor handling.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERCALL_H__
+#define __HYPERCALL_H__
+#include <asm-xen/xen-public/xen.h>
+
+/*
+ * Assembler stubs for hyper-calls.
+ */
+
+static inline int
+HYPERVISOR_set_trap_table(
+    trap_info_t *table)
+{
+    int ret;
+    unsigned long ignore;
+    /* eax: hypercall number in, result out.  ebx: table (output is unused). */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ignore)
+       : "0" (__HYPERVISOR_set_trap_table), "1" (table)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_mmu_update(
+    mmu_update_t *req, int count, int *success_count, domid_t domid)
+{
+    int ret;
+    unsigned long ign1, ign2, ign3, ign4;
+    /* Args go in ebx, ecx, edx, esi in order; eax is the number and result. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
+       : "0" (__HYPERVISOR_mmu_update), "1" (req), "2" (count),
+        "3" (success_count), "4" (domid)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_mmuext_op(
+    struct mmuext_op *op, int count, int *success_count, domid_t domid)
+{
+    int ret;
+    unsigned long ign1, ign2, ign3, ign4;
+    /* Args go in ebx, ecx, edx, esi in order; eax is the number and result. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
+       : "0" (__HYPERVISOR_mmuext_op), "1" (op), "2" (count),
+        "3" (success_count), "4" (domid)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_set_gdt(
+    unsigned long *frame_list, int entries)
+{
+    int ret;
+    unsigned long ign1, ign2;
+    /* ebx: frame_list, ecx: entries; eax is the number in and result out. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2)
+       : "0" (__HYPERVISOR_set_gdt), "1" (frame_list), "2" (entries)
+       : "memory" );
+
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_stack_switch(
+    unsigned long ss, unsigned long esp)
+{
+    int ret;
+    unsigned long ign1, ign2;
+    /* ebx: ss, ecx: esp; eax is the hypercall number in and result out. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2)
+       : "0" (__HYPERVISOR_stack_switch), "1" (ss), "2" (esp)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_set_callbacks(
+    unsigned long event_selector, unsigned long event_address,
+    unsigned long failsafe_selector, unsigned long failsafe_address)
+{
+    int ret;
+    unsigned long ign1, ign2, ign3, ign4;
+    /* The four arguments go in ebx, ecx, edx, esi in declaration order. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
+       : "0" (__HYPERVISOR_set_callbacks), "1" (event_selector),
+         "2" (event_address), "3" (failsafe_selector), "4" (failsafe_address)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_fpu_taskswitch(
+    int set)
+{
+    int ret;
+    unsigned long ign;
+    /* ebx: set; eax is the hypercall number in and the result out. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign)
+        : "0" (__HYPERVISOR_fpu_taskswitch), "1" (set)
+        : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_yield(
+    void)
+{
+    int ret;
+    unsigned long ign;
+    /* sched_op hypercall with SCHEDOP_yield as the command in ebx. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign)
+       : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_yield)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_block(
+    void)
+{
+    int ret;            /* eax: __HYPERVISOR_sched_op in, result out */
+    unsigned long ign1; /* ebx: SCHEDOP_block in; the output is never read */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1)
+       : "0" (__HYPERVISOR_sched_op), "1" (SCHEDOP_block)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_shutdown(
+    void)
+{
+    int ret;            /* eax: __HYPERVISOR_sched_op in, result out */
+    unsigned long ign1; /* ebx: shutdown command word in; output never read */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1)
+       : "0" (__HYPERVISOR_sched_op),
+         "1" (SCHEDOP_shutdown | (SHUTDOWN_poweroff << SCHEDOP_reasonshift))
+        : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_reboot(
+    void)
+{
+    int ret;            /* eax: __HYPERVISOR_sched_op in, result out */
+    unsigned long ign1; /* ebx: shutdown command word in; output never read */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1)
+       : "0" (__HYPERVISOR_sched_op),
+         "1" (SCHEDOP_shutdown | (SHUTDOWN_reboot << SCHEDOP_reasonshift))
+        : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_suspend(
+    unsigned long srec)
+{
+    int ret;
+    unsigned long ign1, ign2;
+    /* Unlike the other stubs, inputs name "b"/"S" directly, not "1"/"2". */
+    /* NB. On suspend, control software expects a suspend record in %esi. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=S" (ign2)
+       : "0" (__HYPERVISOR_sched_op),
+        "b" (SCHEDOP_shutdown | (SHUTDOWN_suspend << SCHEDOP_reasonshift)), 
+        "S" (srec) : "memory");
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_crash(
+    void)
+{
+    int ret;            /* eax: __HYPERVISOR_sched_op in, result out */
+    unsigned long ign1; /* ebx: shutdown command word in; output never read */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1)
+       : "0" (__HYPERVISOR_sched_op),
+         "1" (SCHEDOP_shutdown | (SHUTDOWN_crash << SCHEDOP_reasonshift))
+        : "memory" );
+
+    return ret;
+}
+
+static inline long
+HYPERVISOR_set_timer_op(
+    u64 timeout)
+{
+    int ret;
+    unsigned long timeout_hi = (unsigned long)(timeout>>32);
+    unsigned long timeout_lo = (unsigned long)timeout;
+    unsigned long ign1, ign2;
+    /* The 64-bit timeout is split: low word in ebx, high word in ecx. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2)
+       : "0" (__HYPERVISOR_set_timer_op), "b" (timeout_lo), "c" (timeout_hi)
+       : "memory");
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_dom0_op(
+    dom0_op_t *dom0_op)
+{
+    int ret;
+    unsigned long ign1;
+    /* Stamps the caller's struct with the interface version before the call. */
+    dom0_op->interface_version = DOM0_INTERFACE_VERSION;
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1)
+       : "0" (__HYPERVISOR_dom0_op), "1" (dom0_op)
+       : "memory");
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_set_debugreg(
+    int reg, unsigned long value)
+{
+    int ret;                  /* eax: hypercall number in, result out */
+    unsigned long ign1, ign2; /* ebx: reg, ecx: value; outputs never read */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2)
+       : "0" (__HYPERVISOR_set_debugreg), "1" (reg), "2" (value)
+       : "memory" );
+
+    return ret;
+}
+
+static inline unsigned long
+HYPERVISOR_get_debugreg(
+    int reg)
+{
+    unsigned long ret; /* eax: hypercall number in, returned value out */
+    unsigned long ign; /* ebx: reg in; the output is never read */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign)
+       : "0" (__HYPERVISOR_get_debugreg), "1" (reg)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_update_descriptor(
+    unsigned long ma, unsigned long word1, unsigned long word2)
+{
+    int ret;
+    unsigned long ign1, ign2, ign3;
+    /* ebx: ma, ecx: word1, edx: word2; eax is the number in and result out. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
+       : "0" (__HYPERVISOR_update_descriptor), "1" (ma), "2" (word1),
+         "3" (word2)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_dom_mem_op(
+    unsigned int op, unsigned long *extent_list,
+    unsigned long nr_extents, unsigned int extent_order)
+{
+    int ret;
+    unsigned long ign1, ign2, ign3, ign4, ign5;
+    /* Args in ebx, ecx, edx, esi; edi is always passed DOMID_SELF. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4),
+         "=D" (ign5)
+       : "0" (__HYPERVISOR_dom_mem_op), "1" (op), "2" (extent_list),
+         "3" (nr_extents), "4" (extent_order), "5" (DOMID_SELF)
+        : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_multicall(
+    void *call_list, int nr_calls)
+{
+    int ret;
+    unsigned long ign1, ign2;
+    /* ebx: call_list, ecx: nr_calls; eax is the number in and result out. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2)
+       : "0" (__HYPERVISOR_multicall), "1" (call_list), "2" (nr_calls)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_update_va_mapping(
+    unsigned long va, pte_t new_val, unsigned long flags)
+{
+    int ret;
+    unsigned long ign1, ign2, ign3, ign4;
+    /* ebx: va, ecx: pte_low, edx: pte_high (0 without PAE), esi: flags. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3), "=S" (ign4)
+       : "0" (__HYPERVISOR_update_va_mapping), 
+          "1" (va), "2" ((new_val).pte_low),
+#ifdef CONFIG_X86_PAE
+         "3" ((new_val).pte_high),
+#else
+         "3" (0),
+#endif
+         "4" (flags)
+       : "memory" );
+
+    if ( unlikely(ret < 0) ) /* a negative result is treated as fatal */
+    {
+        printk(KERN_ALERT "Failed update VA mapping: %08lx, %08lx, %08lx\n",
+               va, (new_val).pte_low, flags);
+        BUG();
+    }
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_event_channel_op(
+    void *op)
+{
+    int ret;              /* eax: hypercall number in, result out */
+    unsigned long ignore; /* ebx: op in; the output is never read */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ignore)
+       : "0" (__HYPERVISOR_event_channel_op), "1" (op)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_xen_version(
+    int cmd)
+{
+    int ret;
+    unsigned long ignore;
+    /* ebx: cmd; eax is the hypercall number in and the result out. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ignore)
+       : "0" (__HYPERVISOR_xen_version), "1" (cmd)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_console_io(
+    int cmd, int count, char *str)
+{
+    int ret;                        /* eax: hypercall number in, result out */
+    unsigned long ign1, ign2, ign3; /* ebx: cmd, ecx: count, edx: str */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
+       : "0" (__HYPERVISOR_console_io), "1" (cmd), "2" (count), "3" (str)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_physdev_op(
+    void *physdev_op)
+{
+    int ret;
+    unsigned long ign;
+    /* ebx: physdev_op; eax is the hypercall number in and the result out. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign)
+       : "0" (__HYPERVISOR_physdev_op), "1" (physdev_op)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_grant_table_op(
+    unsigned int cmd, void *uop, unsigned int count)
+{
+    int ret;
+    unsigned long ign1, ign2, ign3;
+    /* ebx: cmd, ecx: uop, edx: count; eax is the number in and result out. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3)
+       : "0" (__HYPERVISOR_grant_table_op), "1" (cmd), "2" (uop), "3" (count)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_update_va_mapping_otherdomain(
+    unsigned long va, pte_t new_val, unsigned long flags, domid_t domid)
+{
+    int ret;
+    unsigned long ign1, ign2, ign3, ign4, ign5;
+    /* ebx: va, ecx: pte_low, edx: pte_high or 0, esi: flags, edi: domid. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2), "=d" (ign3),
+         "=S" (ign4), "=D" (ign5)
+       : "0" (__HYPERVISOR_update_va_mapping_otherdomain),
+          "1" (va), "2" ((new_val).pte_low),
+#ifdef CONFIG_X86_PAE
+         "3" ((new_val).pte_high),
+#else
+         "3" (0),
+#endif
+         "4" (flags), "5" (domid) :
+        "memory" );
+    
+    return ret;
+}
+
+static inline int
+HYPERVISOR_vm_assist(
+    unsigned int cmd, unsigned int type)
+{
+    int ret;
+    unsigned long ign1, ign2;
+    /* ebx: cmd, ecx: type; eax is the hypercall number in and result out. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2)
+       : "0" (__HYPERVISOR_vm_assist), "1" (cmd), "2" (type)
+       : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_boot_vcpu(
+    unsigned long vcpu, vcpu_guest_context_t *ctxt)
+{
+    int ret;
+    unsigned long ign1, ign2;
+    /* ebx: vcpu, ecx: ctxt; eax is the hypercall number in and result out. */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1), "=c" (ign2)
+       : "0" (__HYPERVISOR_boot_vcpu), "1" (vcpu), "2" (ctxt)
+       : "memory");
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_vcpu_down(
+    int vcpu)
+{
+    int ret;            /* eax: __HYPERVISOR_sched_op in, result out */
+    unsigned long ign1; /* ebx: command word with vcpu packed in; out unused */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1)
+       : "0" (__HYPERVISOR_sched_op),
+         "1" (SCHEDOP_vcpu_down | (vcpu << SCHEDOP_vcpushift))
+        : "memory" );
+
+    return ret;
+}
+
+static inline int
+HYPERVISOR_vcpu_up(
+    int vcpu)
+{
+    int ret;            /* eax: __HYPERVISOR_sched_op in, result out */
+    unsigned long ign1; /* ebx: command word with vcpu packed in; out unused */
+    __asm__ __volatile__ (
+        TRAP_INSTR
+        : "=a" (ret), "=b" (ign1)
+       : "0" (__HYPERVISOR_sched_op),
+         "1" (SCHEDOP_vcpu_up | (vcpu << SCHEDOP_vcpushift))
+        : "memory" );
+
+    return ret;
+}
+#endif /* __HYPERCALL_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/irq_vectors.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/irq_vectors.h
@@ -0,0 +1,140 @@
+/*
+ * This file should contain #defines for all of the interrupt vector
+ * numbers used by this architecture.
+ *
+ * In addition, there are some standard defines:
+ *
+ *     FIRST_EXTERNAL_VECTOR:
+ *             The first free place for external interrupts
+ *
+ *     SYSCALL_VECTOR:
+ *             The IRQ vector a syscall makes the user to kernel transition
+ *             under.
+ *
+ *     TIMER_IRQ:
+ *             The IRQ number the timer interrupt comes in at.
+ *
+ *     NR_IRQS:
+ *             The total number of interrupt vectors (including all the
+ *             architecture specific interrupts) needed.
+ *
+ */                    
+#ifndef _ASM_IRQ_VECTORS_H
+#define _ASM_IRQ_VECTORS_H
+
+/*
+ * IDT vectors usable for external interrupt sources start
+ * at 0x20:
+ */
+#define FIRST_EXTERNAL_VECTOR  0x20
+
+#define SYSCALL_VECTOR         0x80
+
+/*
+ * Vectors 0x20-0x2f are used for ISA interrupts.
+ */
+
+#if 0
+/*
+ * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
+ *
+ *  some of the following vectors are 'rare', they are merged
+ *  into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
+ *  TLB, reschedule and local APIC vectors are performance-critical.
+ *
+ *  Vectors 0xf0-0xfa are free (reserved for future Linux use).
+ */
+#define SPURIOUS_APIC_VECTOR   0xff
+#define ERROR_APIC_VECTOR      0xfe
+#define INVALIDATE_TLB_VECTOR  0xfd
+#define RESCHEDULE_VECTOR      0xfc
+#define CALL_FUNCTION_VECTOR   0xfb
+
+#define THERMAL_APIC_VECTOR    0xf0
+/*
+ * Local APIC timer IRQ vector is on a different priority level,
+ * to work around the 'lost local interrupt if more than 2 IRQ
+ * sources per level' errata.
+ */
+#define LOCAL_TIMER_VECTOR     0xef
+#endif
+
+#define SPURIOUS_APIC_VECTOR   0xff
+#define ERROR_APIC_VECTOR      0xfe
+
+/*
+ * First APIC vector available to drivers: (vectors 0x30-0xee)
+ * we start at 0x31 to spread out vectors evenly between priority
+ * levels. (0x80 is the syscall vector)
+ */
+#define FIRST_DEVICE_VECTOR    0x31
+#define FIRST_SYSTEM_VECTOR    0xef
+
+/*
+ * 16 8259A IRQ's, 208 potential APIC interrupt sources.
+ * Right now the APIC is mostly only used for SMP.
+ * 256 vectors is an architectural limit. (we can have
+ * more than 256 devices theoretically, but they will
+ * have to use shared interrupts)
+ * Since vectors 0x00-0x1f are used/reserved for the CPU,
+ * the usable vector space is 0x20-0xff (224 vectors)
+ */
+
+#define NR_IPIS 8
+
+#define RESCHEDULE_VECTOR      1
+#define INVALIDATE_TLB_VECTOR  2
+#define CALL_FUNCTION_VECTOR   3
+
+/*
+ * The maximum number of vectors supported by i386 processors
+ * is limited to 256. For processors other than i386, NR_VECTORS
+ * should be changed accordingly.
+ */
+#define NR_VECTORS 256
+
+#define FPU_IRQ                        13
+
+#define FIRST_VM86_IRQ         3
+#define LAST_VM86_IRQ          15
+#define invalid_vm86_irq(irq)  ((irq) < FIRST_VM86_IRQ || (irq) > LAST_VM86_IRQ) /* true outside [FIRST, LAST] */
+
+/*
+ * The flat IRQ space is divided into two regions:
+ *  1. A one-to-one mapping of real physical IRQs. This space is only used
+ *     if we have physical device-access privilege. This region is at the 
+ *     start of the IRQ space so that existing device drivers do not need
+ *     to be modified to translate physical IRQ numbers into our IRQ space.
+ *  2. A dynamic mapping of inter-domain and Xen-sourced virtual IRQs. These
+ *     are bound using the provided bind/unbind functions.
+ */
+
+#define PIRQ_BASE              0
+#define NR_PIRQS               256
+
+#define DYNIRQ_BASE            (PIRQ_BASE + NR_PIRQS)
+#define NR_DYNIRQS             256
+
+#define NR_IRQS                        (NR_PIRQS + NR_DYNIRQS)
+#define NR_IRQ_VECTORS         NR_IRQS
+
+#define pirq_to_irq(_x)                ((_x) + PIRQ_BASE)
+#define irq_to_pirq(_x)                ((_x) - PIRQ_BASE)
+
+#define dynirq_to_irq(_x)      ((_x) + DYNIRQ_BASE)
+#define irq_to_dynirq(_x)      ((_x) - DYNIRQ_BASE)
+
+#ifndef __ASSEMBLY__
+/* Dynamic binding of event channels and VIRQ sources to Linux IRQ space. */
+extern int  bind_virq_to_irq(int virq);
+extern void unbind_virq_from_irq(int virq);
+extern int  bind_ipi_to_irq(int ipi);
+extern void unbind_ipi_from_irq(int ipi);
+extern int  bind_evtchn_to_irq(int evtchn);
+extern void unbind_evtchn_from_irq(int evtchn);
+
+extern void irq_suspend(void);
+extern void irq_resume(void);
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_IRQ_VECTORS_H */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/queues.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/queues.h
@@ -0,0 +1,81 @@
+
+/*
+ * Oh dear. Task queues were removed from Linux 2.6 and replaced by work 
+ * queues. Unfortunately the semantics is not the same. With task queues we 
+ * can defer work until a particular event occurs -- this is not
+ * straightforwardly done with work queues (queued work is performed asap, or
+ * after some fixed timeout). Conversely, work queues are a (slightly) neater
+ * way of deferring work to a process context than using task queues in 2.4.
+ * 
+ * This is a bit of a needless reimplementation -- should have just pulled
+ * the code from 2.4, but I tried leveraging work queues to simplify things.
+ * They didn't help. :-(
+ */
+
+#ifndef __QUEUES_H__
+#define __QUEUES_H__
+
+#include <linux/version.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+
+struct tq_struct { /* field order is relied on by DECLARE_TQUEUE's positional initialiser */
+    void (*fn)(void *);        /* callback invoked by run_task_queue() */
+    void *arg;                 /* argument handed to fn */
+    struct list_head list;     /* linkage on a task_queue's list */
+    unsigned long pending;     /* bit 0 is set by queue_task(), cleared by run_task_queue() */
+};
+#define INIT_TQUEUE(_name, _fn, _arg)               \
+    do {                                            \
+        INIT_LIST_HEAD(&(_name)->list);             \
+        (_name)->pending = 0;                       \
+        (_name)->fn = (_fn); (_name)->arg = (_arg); \
+    } while ( 0 )
+#define DECLARE_TQUEUE(_name, _fn, _arg)            \
+    struct tq_struct _name = { (_fn), (_arg), LIST_HEAD_INIT((_name).list), 0 }
+
+typedef struct {
+    struct list_head list;
+    spinlock_t       lock;
+} task_queue;
+#define DECLARE_TASK_QUEUE(_name) \
+    task_queue _name = { LIST_HEAD_INIT((_name).list), SPIN_LOCK_UNLOCKED }
+
+static inline int queue_task(struct tq_struct *tqe, task_queue *tql)
+{ /* Append tqe to tql. Returns 1 if it was queued, 0 if it was already pending. */
+    unsigned long flags;
+    if ( test_and_set_bit(0, &tqe->pending) ) /* bit 0 already set => already queued */
+        return 0;
+    spin_lock_irqsave(&tql->lock, flags);
+    list_add_tail(&tqe->list, &tql->list);
+    spin_unlock_irqrestore(&tql->lock, flags);
+    return 1;
+}
+
+static inline void run_task_queue(task_queue *tql)
+{ /* Detach everything currently queued on tql, then call each entry's fn(arg). */
+    struct list_head head, *ent;
+    struct tq_struct *tqe;
+    unsigned long flags;
+    void (*fn)(void *);
+    void *arg;
+
+    spin_lock_irqsave(&tql->lock, flags);
+    list_add(&head, &tql->list);    /* link the local head in next to tql's head ... */
+    list_del_init(&tql->list);      /* ... then unlink tql's head, leaving tql empty */
+    spin_unlock_irqrestore(&tql->lock, flags);
+
+    while ( !list_empty(&head) )    /* the callbacks run after the lock is released */
+    {
+        ent = head.next;
+        list_del_init(ent);
+        tqe = list_entry(ent, struct tq_struct, list);
+        fn  = tqe->fn;              /* copy fn/arg out before 'pending' is cleared */
+        arg = tqe->arg;
+        wmb();
+        tqe->pending = 0;           /* from here queue_task() will accept tqe again */
+        fn(arg);
+    }
+}
+
+#endif /* __QUEUES_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_foreign_page.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_foreign_page.h
@@ -0,0 +1,30 @@
+/******************************************************************************
+ * foreign_page.h
+ * 
+ * Provide a "foreign" page type, that is owned by a foreign allocator and 
+ * not the normal buddy allocator in page_alloc.c
+ * 
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __ASM_XEN_FOREIGN_PAGE_H__
+#define __ASM_XEN_FOREIGN_PAGE_H__
+
+#define PG_foreign             PG_arch_1
+
+#define PageForeign(page)      test_bit(PG_foreign, &(page)->flags)
+
+#define SetPageForeign(page, dtor) do { /* flag page, keep dtor in ->mapping */ \
+       set_bit(PG_foreign, &(page)->flags);    \
+       (page)->mapping = (void *)(dtor);       \
+} while (0)
+
+#define ClearPageForeign(page) do {            \
+       clear_bit(PG_foreign, &(page)->flags);  \
+       (page)->mapping = NULL;                 \
+} while (0)
+
+#define PageForeignDestructor(page)    \
+       ( (void (*) (struct page *)) (page)->mapping )
+
+#endif /* __ASM_XEN_FOREIGN_PAGE_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_hypervisor.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_hypervisor.h
@@ -0,0 +1,197 @@
+/******************************************************************************
+ * hypervisor.h
+ * 
+ * Linux-specific hypervisor handling.
+ * 
+ * Copyright (c) 2002-2004, K A Fraser
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __HYPERVISOR_H__
+#define __HYPERVISOR_H__
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <xen-public/xen.h>
+#include <xen-public/dom0_ops.h>
+#include <xen-public/io/domain_controller.h>
+#include <asm/ptrace.h>
+#include <asm/page.h>
+#if defined(__i386__)
+# if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#  ifdef CONFIG_X86_PAE
+#   include <asm-generic/pgtable-nopud.h>
+#  else
+#   include <asm-generic/pgtable-nopmd.h>
+#  endif
+# endif
+#endif
+
+/* arch/xen/i386/kernel/setup.c */
+union xen_start_info_union
+{
+    start_info_t xen_start_info;
+    char padding[2048];
+};
+extern union xen_start_info_union xen_start_info_union;
+#define xen_start_info (xen_start_info_union.xen_start_info)
+
+/* arch/xen/kernel/evtchn.c */
+/* Force a proper event-channel callback from Xen. */
+void force_evtchn_callback(void);
+
+/* arch/xen/kernel/process.c */
+void xen_cpu_idle (void);
+
+/* arch/xen/i386/kernel/hypervisor.c */
+void do_hypervisor_callback(struct pt_regs *regs);
+
+/* arch/xen/i386/kernel/head.S */
+void lgdt_finish(void);
+
+/* arch/xen/i386/mm/hypervisor.c */
+/*
+ * NB. ptr values should be PHYSICAL, not MACHINE. 'vals' should already
+ * be MACHINE addresses.
+ */
+
+void xen_pt_switch(unsigned long ptr);
+void xen_new_user_pt(unsigned long ptr); /* x86_64 only */
+void xen_load_gs(unsigned int selector); /* x86_64 only */
+void xen_tlb_flush(void);
+void xen_invlpg(unsigned long ptr);
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+void xen_l1_entry_update(pte_t *ptr, pte_t val);
+void xen_l2_entry_update(pmd_t *ptr, pmd_t val);
+void xen_l3_entry_update(pud_t *ptr, pud_t val); /* x86_64/PAE */
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val); /* x86_64 only */
+void xen_pgd_pin(unsigned long ptr);
+void xen_pgd_unpin(unsigned long ptr);
+void xen_pud_pin(unsigned long ptr); /* x86_64 only */
+void xen_pud_unpin(unsigned long ptr); /* x86_64 only */
+void xen_pmd_pin(unsigned long ptr); /* x86_64 only */
+void xen_pmd_unpin(unsigned long ptr); /* x86_64 only */
+void xen_pte_pin(unsigned long ptr);
+void xen_pte_unpin(unsigned long ptr);
+#else /* CONFIG_XEN_SHADOW_MODE: entry updates are plain set_pte/set_pgd, pin/unpin are no-ops */
+#define xen_l1_entry_update(_p, _v) set_pte((_p), (pte_t){(_v)})
+#define xen_l2_entry_update(_p, _v) set_pgd((_p), (pgd_t){(_v)})
+#define xen_pgd_pin(_p)   ((void)0)
+#define xen_pgd_unpin(_p) ((void)0)
+#define xen_pte_pin(_p)   ((void)0)
+#define xen_pte_unpin(_p) ((void)0)
+#endif /* NOTE(review): these brace-initialise from _v, but the prototypes above take pte_t/pmd_t values -- confirm callers build in shadow mode */
+
+void xen_set_ldt(unsigned long ptr, unsigned long bytes);
+void xen_machphys_update(unsigned long mfn, unsigned long pfn);
+
+#ifdef CONFIG_SMP
+#include <linux/cpumask.h>
+void xen_tlb_flush_all(void);
+void xen_invlpg_all(unsigned long ptr);
+void xen_tlb_flush_mask(cpumask_t *mask);
+void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr);
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+/* 
+** XXX SMH: 2.4 doesn't have percpu.h (or support SMP guests) so just 
+** include sufficient #defines to allow the below to build. 
+*/
+#define DEFINE_PER_CPU(type, name) \
+    __typeof__(type) per_cpu__##name
+
+#define per_cpu(var, cpu)           (*((void)(cpu), &per_cpu__##var)) /* cpu is evaluated, then discarded */
+#define __get_cpu_var(var)          per_cpu__##var
+#define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
+
+#define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
+#define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+#endif /* linux < 2.6.0 */
+
+void xen_contig_memory(unsigned long vstart, unsigned int order);
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+/* Allocate a contiguous empty region of low memory. Return virtual start. */
+unsigned long allocate_empty_lowmem_region(unsigned long pages);
+#endif
+
+#include <hypercall.h>
+
+#if defined(CONFIG_X86_64)
+#define MULTI_UVMFLAGS_INDEX 2
+#define MULTI_UVMDOMID_INDEX 3
+#else
+#define MULTI_UVMFLAGS_INDEX 3
+#define MULTI_UVMDOMID_INDEX 4
+#endif
+
+static inline void
+MULTI_update_va_mapping(
+    multicall_entry_t *mcl, unsigned long va,
+    pte_t new_val, unsigned long flags)
+{ /* Fill *mcl with an update_va_mapping request; this only writes the entry. */
+    mcl->op = __HYPERVISOR_update_va_mapping;
+    mcl->args[0] = va;
+#if defined(CONFIG_X86_64)
+    mcl->args[1] = new_val.pte;
+    mcl->args[2] = flags;           /* slot matches MULTI_UVMFLAGS_INDEX == 2 */
+#elif defined(CONFIG_X86_PAE)
+    mcl->args[1] = new_val.pte_low;
+    mcl->args[2] = new_val.pte_high;
+    mcl->args[3] = flags;           /* slot matches MULTI_UVMFLAGS_INDEX == 3 */
+#else
+    mcl->args[1] = new_val.pte_low;
+    mcl->args[2] = 0;               /* high-word slot is zero-filled when not PAE */
+    mcl->args[3] = flags;
+#endif
+}
+
+static inline void
+MULTI_update_va_mapping_otherdomain(
+    multicall_entry_t *mcl, unsigned long va,
+    pte_t new_val, unsigned long flags, domid_t domid)
+{ /* Fill *mcl with an update_va_mapping_otherdomain request; this only writes the entry. */
+    mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
+    mcl->args[0] = va;
+#if defined(CONFIG_X86_64)
+    mcl->args[1] = new_val.pte;
+    mcl->args[2] = flags;           /* slot matches MULTI_UVMFLAGS_INDEX == 2 */
+    mcl->args[3] = domid;           /* slot matches MULTI_UVMDOMID_INDEX == 3 */
+#elif defined(CONFIG_X86_PAE)
+    mcl->args[1] = new_val.pte_low;
+    mcl->args[2] = new_val.pte_high;
+    mcl->args[3] = flags;           /* slot matches MULTI_UVMFLAGS_INDEX == 3 */
+    mcl->args[4] = domid;           /* slot matches MULTI_UVMDOMID_INDEX == 4 */
+#else
+    mcl->args[1] = new_val.pte_low;
+    mcl->args[2] = 0;               /* high-word slot is zero-filled when not PAE */
+    mcl->args[3] = flags;
+    mcl->args[4] = domid;
+#endif
+}
+
+#endif /* __HYPERVISOR_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/acm.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/acm.h
@@ -0,0 +1,157 @@
+/****************************************************************
+ * acm.h
+ * 
+ * Copyright (C) 2005 IBM Corporation
+ *
+ * Author:
+ * Reiner Sailer <sailer@xxxxxxxxxxxxxx>
+ *
+ * Contributors:
+ * Stefan Berger <stefanb@xxxxxxxxxxxxxx> 
+ *     added network byte order support for binary policies
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * sHype general access control module header file.
+ *     here are all definitions that are shared between
+ *     xen-core, guest-kernels, and applications.
+ *
+ * todo: move from static policy choice to compile option.
+ */
+
+#ifndef _XEN_PUBLIC_ACM_H
+#define _XEN_PUBLIC_ACM_H
+
+#include "xen-public/xen.h"
+#include "xen-public/sched_ctl.h"
+
+/* if ACM_DEBUG defined, all hooks should
+ * print a short trace message (comment it out
+ * when not in testing mode )
+ */
+/* #define ACM_DEBUG */
+
+#ifdef ACM_DEBUG
+#  define printkd(fmt, args...) printk(fmt,## args)
+#else
+#  define printkd(fmt, args...)
+#endif
+
+/* default ssid reference value if not supplied */
+#define ACM_DEFAULT_SSID       0x0
+#define ACM_DEFAULT_LOCAL_SSID  0x0
+
+/* Internal ACM ERROR types */
+#define ACM_OK                          0
+#define ACM_UNDEF                      -1
+#define ACM_INIT_SSID_ERROR            -2
+#define ACM_INIT_SOID_ERROR            -3
+#define ACM_ERROR                      -4
+
+/* External ACCESS DECISIONS */
+#define ACM_ACCESS_PERMITTED           0
+#define ACM_ACCESS_DENIED              -111
+#define ACM_NULL_POINTER_ERROR         -200
+
+#define ACM_MAX_POLICY  3
+
+#define ACM_NULL_POLICY        0
+#define ACM_CHINESE_WALL_POLICY        1
+#define ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY 2
+#define ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY 3
+
+/* policy: map policy code X to a printable name; X is evaluated more than once */
+#define ACM_POLICY_NAME(X) \
+       (((X) == ACM_NULL_POLICY) ? "NULL policy" : \
+        ((X) == ACM_CHINESE_WALL_POLICY) ? "CHINESE WALL policy" : \
+        ((X) == ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY) ? "SIMPLE TYPE ENFORCEMENT policy" : \
+        ((X) == ACM_CHINESE_WALL_AND_SIMPLE_TYPE_ENFORCEMENT_POLICY) ? "CHINESE WALL AND SIMPLE TYPE ENFORCEMENT policy" : \
+        "UNDEFINED policy")
+
+/* defines a ssid reference used by xen */
+typedef u32 ssidref_t;
+
+/* -------security policy relevant type definitions-------- */
+
+/* type identifier; compares to "equal" or "not equal" */
+typedef u16 domaintype_t;
+
+/* CHINESE WALL POLICY DATA STRUCTURES
+ *
+ * current accumulated conflict type set:
+ * When a domain is started and has a type that is in
+ * a conflict set, the conflicting types are incremented in
+ * the aggregate set. When a domain is destroyed, the 
+ * conflicting types to its type are decremented.
+ * If a domain has multiple types, this procedure works over
+ * all those types.
+ *
+ * conflict_aggregate_set[i] holds the number of
+ *   running domains that have a conflict with type i.
+ *
+ * running_types[i] holds the number of running domains
+ *        that include type i in their ssidref-referenced type set
+ *
+ * conflict_sets[i][j] is "0" if type j has no conflict
+ *    with type i and is "1" otherwise.
+ */
+/* high-16 = version, low-16 = check magic */
+#define ACM_MAGIC              0x0001debc
+
+/* each offset is in bytes from the start of the struct
+ *   it is part of */
+/* each buffer consists of all policy information for
+ * the respective policy given in the policy code
+ */
+struct acm_policy_buffer {
+        u32 magic;
+       u32 policyversion;
+       u32 len;
+       u16 primary_policy_code;
+       u16 primary_buffer_offset;
+       u16 secondary_policy_code;
+       u16 secondary_buffer_offset;
+};
+
+struct acm_chwall_policy_buffer {
+       u16 policy_code;
+       u16 chwall_max_types;
+       u16 chwall_max_ssidrefs;
+       u16 chwall_max_conflictsets;
+       u16 chwall_ssid_offset;
+       u16 chwall_conflict_sets_offset;
+       u16 chwall_running_types_offset;
+       u16 chwall_conflict_aggregate_offset;
+};
+
+struct acm_ste_policy_buffer {
+       u16 policy_code;
+       u16 ste_max_types;
+       u16 ste_max_ssidrefs;
+       u16 ste_ssid_offset;
+};
+
+struct acm_stats_buffer {
+        u32 magic;
+       u32 policyversion;
+       u32 len;
+       u16 primary_policy_code;
+       u16 primary_stats_offset;
+       u16 secondary_policy_code;
+       u16 secondary_stats_offset;
+};
+
+struct acm_ste_stats_buffer {
+       u32 ec_eval_count;
+       u32 gt_eval_count;
+       u32 ec_denied_count;
+       u32 gt_denied_count; 
+       u32 ec_cachehit_count;
+       u32 gt_cachehit_count;
+};
+
+
+#endif
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-ia64.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-ia64.h
@@ -0,0 +1,219 @@
+/******************************************************************************
+ * arch-ia64/hypervisor-if.h
+ * 
+ * Guest OS interface to IA64 Xen.
+ */
+
+#ifndef __HYPERVISOR_IF_IA64_H__
+#define __HYPERVISOR_IF_IA64_H__
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+/* WARNING: before changing this, check that shared_info fits on a page */
+#define MAX_VIRT_CPUS 1
+
+#ifndef __ASSEMBLY__
+
+/* NB. Both the following are 64 bits each. */
+typedef unsigned long memory_t;   /* Full-sized pointer/address/memory-size. */
+
+#define MAX_NR_SECTION  32  // at most 32 memory holes
+typedef struct {
+    unsigned long      start;  /* start of memory hole */
+    unsigned long      end;    /* end of memory hole */
+} mm_section_t;
+
+typedef struct {
+    unsigned long      mfn : 56;
+    unsigned long      type: 8;
+} pmt_entry_t;
+
+#define GPFN_MEM               (0UL << 56)     /* Guest pfn is normal mem */
+#define GPFN_FRAME_BUFFER      (1UL << 56)     /* VGA framebuffer */
+#define GPFN_LOW_MMIO          (2UL << 56)     /* Low MMIO range */
+#define GPFN_PIB               (3UL << 56)     /* PIB base */
+#define GPFN_IOSAPIC           (4UL << 56)     /* IOSAPIC base */
+#define GPFN_LEGACY_IO         (5UL << 56)     /* Legacy I/O base */
+#define GPFN_GFW               (6UL << 56)     /* Guest Firmware */
+#define GPFN_HIGH_MMIO         (7UL << 56)     /* High MMIO range */
+
+#define GPFN_IO_MASK           (7UL << 56)     /* Guest pfn is I/O type */
+#define GPFN_INV_MASK          (31UL << 59)    /* Guest pfn is invalid */
+
+#define INVALID_MFN              (~0UL)
+
+/*
+ * NB. This may become a 64-bit count with no shift. If this happens then the 
+ * structure size will still be 8 bytes, so no other alignments will change.
+ */
+typedef struct {
+    unsigned int  tsc_bits;      /* 0: 32 bits read from the CPU's TSC. */
+    unsigned int  tsc_bitshift;  /* 4: 'tsc_bits' uses N:N+31 of TSC.   */
+} tsc_timestamp_t; /* 8 bytes */
+
+struct pt_fpreg {
+        union {
+                unsigned long bits[2];
+                long double __dummy;    /* force 16-byte alignment */
+        } u;
+};
+
+struct pt_regs {
+       /* The following registers are saved by SAVE_MIN: */
+       unsigned long b6;               /* scratch */
+       unsigned long b7;               /* scratch */
+
+       unsigned long ar_csd;           /* used by cmp8xchg16 (scratch) */
+       unsigned long ar_ssd;           /* reserved for future use (scratch) */
+
+       unsigned long r8;               /* scratch (return value register 0) */
+       unsigned long r9;               /* scratch (return value register 1) */
+       unsigned long r10;              /* scratch (return value register 2) */
+       unsigned long r11;              /* scratch (return value register 3) */
+
+       unsigned long cr_ipsr;          /* interrupted task's psr */
+       unsigned long cr_iip;           /* interrupted task's instruction pointer */
+       unsigned long cr_ifs;           /* interrupted task's function state */
+
+       unsigned long ar_unat;          /* interrupted task's NaT register (preserved) */
+       unsigned long ar_pfs;           /* prev function state  */
+       unsigned long ar_rsc;           /* RSE configuration */
+       /* The following two are valid only if cr_ipsr.cpl > 0: */
+       unsigned long ar_rnat;          /* RSE NaT */
+       unsigned long ar_bspstore;      /* RSE bspstore */
+
+       unsigned long pr;               /* 64 predicate registers (1 bit each) */
+       unsigned long b0;               /* return pointer (bp) */
+       unsigned long loadrs;           /* size of dirty partition << 16 */
+
+       unsigned long r1;               /* the gp pointer */
+       unsigned long r12;              /* interrupted task's memory stack pointer */
+       unsigned long r13;              /* thread pointer */
+
+       unsigned long ar_fpsr;          /* floating point status (preserved) */
+       unsigned long r15;              /* scratch */
+
+       /* The remaining registers are NOT saved for system calls.  */
+
+       unsigned long r14;              /* scratch */
+       unsigned long r2;               /* scratch */
+       unsigned long r3;               /* scratch */
+
+#ifdef CONFIG_VTI
+       unsigned long r4;               /* preserved */
+       unsigned long r5;               /* preserved */
+       unsigned long r6;               /* preserved */
+       unsigned long r7;               /* preserved */
+       unsigned long cr_iipa;   /* for emulation */
+       unsigned long cr_isr;    /* for emulation */
+       unsigned long eml_unat;    /* used for emulating instruction */
+       unsigned long rfi_pfs;     /* used for emulating rfi */
+#endif
+
+       /* The following registers are saved by SAVE_REST: */
+       unsigned long r16;              /* scratch */
+       unsigned long r17;              /* scratch */
+       unsigned long r18;              /* scratch */
+       unsigned long r19;              /* scratch */
+       unsigned long r20;              /* scratch */
+       unsigned long r21;              /* scratch */
+       unsigned long r22;              /* scratch */
+       unsigned long r23;              /* scratch */
+       unsigned long r24;              /* scratch */
+       unsigned long r25;              /* scratch */
+       unsigned long r26;              /* scratch */
+       unsigned long r27;              /* scratch */
+       unsigned long r28;              /* scratch */
+       unsigned long r29;              /* scratch */
+       unsigned long r30;              /* scratch */
+       unsigned long r31;              /* scratch */
+
+       unsigned long ar_ccv;           /* compare/exchange value (scratch) */
+
+       /*
+        * Floating point registers that the kernel considers scratch:
+        */
+       struct pt_fpreg f6;             /* scratch */
+       struct pt_fpreg f7;             /* scratch */
+       struct pt_fpreg f8;             /* scratch */
+       struct pt_fpreg f9;             /* scratch */
+       struct pt_fpreg f10;            /* scratch */
+       struct pt_fpreg f11;            /* scratch */
+};
+
+typedef struct {
+       unsigned long ipsr;
+       unsigned long iip;
+       unsigned long ifs;
+       unsigned long precover_ifs;
+       unsigned long isr;
+       unsigned long ifa;
+       unsigned long iipa;
+       unsigned long iim;
+       unsigned long unat;  // not sure if this is needed until NaT arch is done
+       unsigned long tpr;
+       unsigned long iha;
+       unsigned long itir;
+       unsigned long itv;
+       unsigned long pmv;
+       unsigned long cmcv;
+       unsigned long pta;
+       int interrupt_collection_enabled; // virtual psr.ic
+       int interrupt_delivery_enabled; // virtual psr.i
+       int pending_interruption;
+       int incomplete_regframe;        // see SDM vol2 6.8
+       unsigned long delivery_mask[4];
+       int metaphysical_mode;  // 1 = use metaphys mapping, 0 = use virtual
+       int banknum;    // 0 or 1, which virtual register bank is active
+       unsigned long bank0_regs[16]; // bank0 regs (r16-r31) when bank1 active
+       unsigned long bank1_regs[16]; // bank1 regs (r16-r31) when bank0 active
+       unsigned long rrs[8];   // region registers
+       unsigned long krs[8];   // kernel registers
+       unsigned long pkrs[8];  // protection key registers
+       unsigned long tmp[8];   // temp registers (e.g. for hyperprivops)
+       int evtchn_vector;
+} arch_vcpu_info_t;
+#define __ARCH_HAS_VCPU_INFO
+
+typedef struct {
+       int domain_controller_evtchn;
+       unsigned int flags;
+//} arch_shared_info_t;
+} arch_shared_info_t;          // DON'T PACK 
+
+typedef struct vcpu_guest_context {
+#define VGCF_FPU_VALID (1<<0)
+#define VGCF_VMX_GUEST (1<<1)
+#define VGCF_IN_KERNEL (1<<2)
+       unsigned long flags;       /* VGCF_* flags */
+       unsigned long pt_base;     /* PMT table base */
+       unsigned long pt_max_pfn;  /* Max pfn including holes */
+       unsigned long share_io_pg; /* Shared page for I/O emulation */
+       unsigned long vm_assist;   /* VMASST_TYPE_* bitmap, now none on IPF */
+       unsigned long guest_iip;   /* Guest entry point */
+
+       struct pt_regs regs;
+       arch_vcpu_info_t vcpu;
+       arch_shared_info_t shared;
+} vcpu_guest_context_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#define        XEN_HYPER_RFI                   0x1
+#define        XEN_HYPER_RSM_DT                0x2
+#define        XEN_HYPER_SSM_DT                0x3
+#define        XEN_HYPER_COVER                 0x4
+#define        XEN_HYPER_ITC_D                 0x5
+#define        XEN_HYPER_ITC_I                 0x6
+#define        XEN_HYPER_SSM_I                 0x7
+#define        XEN_HYPER_GET_IVR               0x8
+#define        XEN_HYPER_GET_TPR               0x9
+#define        XEN_HYPER_SET_TPR               0xa
+#define        XEN_HYPER_EOI                   0xb
+#define        XEN_HYPER_SET_ITM               0xc
+#define        XEN_HYPER_THASH                 0xd
+#define        XEN_HYPER_PTC_GA                0xe
+#define        XEN_HYPER_ITR_D                 0xf
+#define        XEN_HYPER_GET_RR                0x10
+#define        XEN_HYPER_SET_RR                0x11
+
+#endif /* __HYPERVISOR_IF_IA64_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-x86_32.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-x86_32.h
@@ -0,0 +1,142 @@
+/******************************************************************************
+ * arch-x86_32.h
+ * 
+ * Guest OS interface to x86 32-bit Xen.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
+#define __XEN_PUBLIC_ARCH_X86_32_H__
+
+#include <asm/types.h>
+
+/*
+ * SEGMENT DESCRIPTOR TABLES
+ */
+/*
+ * A number of GDT entries are reserved by Xen. These are not situated at the
+ * start of the GDT because some stupid OSes export hard-coded selector values
+ * in their ABI. These hard-coded values are always near the start of the GDT,
+ * so Xen places itself out of the way, at the far end of the GDT.
+ */
+#define FIRST_RESERVED_GDT_PAGE  14
+#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
+#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019    /* GDT index 7171 (first reserved + 3), RPL 1 */
+#define FLAT_RING1_DS 0xe021    /* GDT index 7172 (first reserved + 4), RPL 1 */
+#define FLAT_RING1_SS 0xe021    /* GDT index 7172 (first reserved + 4), RPL 1 */
+#define FLAT_RING3_CS 0xe02b    /* GDT index 7173 (first reserved + 5), RPL 3 */
+#define FLAT_RING3_DS 0xe033    /* GDT index 7174 (first reserved + 6), RPL 3 */
+#define FLAT_RING3_SS 0xe033    /* GDT index 7174 (first reserved + 6), RPL 3 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS    FLAT_RING3_CS
+#define FLAT_USER_DS    FLAT_RING3_DS
+#define FLAT_USER_SS    FLAT_RING3_SS
+
+/* And the trap vector is... */
+#define TRAP_INSTR "int $0x82"
+
+
+/*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The 
+ * machine->physical mapping table starts at this address, read-only.
+ */
+#ifdef CONFIG_X86_PAE
+# define HYPERVISOR_VIRT_START (0xF5800000UL)
+#else
+# define HYPERVISOR_VIRT_START (0xFC000000UL)
+#endif
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((u32 *)HYPERVISOR_VIRT_START)
+#endif
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
+#ifndef __ASSEMBLY__
+
+/* NB. The following type is 32 bits wide. */
+typedef unsigned long memory_t;   /* Full-sized pointer/address/memory-size. */
+
+/*
+ * Send an array of these to HYPERVISOR_set_trap_table()
+ */
+#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
+#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
+#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
+#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
+typedef struct trap_info {
+    u8       vector;  /* exception vector                              */
+    u8       flags;   /* bits 0-1: privilege level; bit 2: clear event enable? (see TI_GET_DPL/TI_GET_IF) */
+    u16      cs;      /* code selector                                 */
+    memory_t address; /* code address                                  */
+} trap_info_t;
+
+typedef struct cpu_user_regs {
+    u32 ebx;
+    u32 ecx;
+    u32 edx;
+    u32 esi;
+    u32 edi;
+    u32 ebp;
+    u32 eax;
+    u16 error_code;    /* private */
+    u16 entry_vector;  /* private */
+    u32 eip;
+    u16 cs;
+    u8  saved_upcall_mask;
+    u8  _pad0;
+    u32 eflags;
+    u32 esp;
+    u16 ss, _pad1;
+    u16 es, _pad2;
+    u16 ds, _pad3;
+    u16 fs, _pad4;
+    u16 gs, _pad5;
+} cpu_user_regs_t;
+
+typedef u64 tsc_timestamp_t; /* RDTSC timestamp */
+
+/*
+ * The following is all CPU context. Note that the fpu_ctxt block is filled 
+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ */
+typedef struct vcpu_guest_context {
+    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
+    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_VMX_GUEST  (1<<1)
+#define VGCF_IN_KERNEL  (1<<2)
+    unsigned long flags;                    /* VGCF_* flags                 */
+    cpu_user_regs_t user_regs;              /* User-level CPU registers     */
+    trap_info_t   trap_ctxt[256];           /* Virtual IDT                  */
+    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
+    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
+    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
+    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
+    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
+    unsigned long event_callback_cs;        /* CS:EIP of event callback     */
+    unsigned long event_callback_eip;
+    unsigned long failsafe_callback_cs;     /* CS:EIP of failsafe callback  */
+    unsigned long failsafe_callback_eip;
+    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
+} vcpu_guest_context_t;
+
+typedef struct arch_shared_info {
+    /* MFN of a table of MFNs that make up p2m table */
+    u64 pfn_to_mfn_frame_list;
+} arch_shared_info_t;
+
+#endif
+
+#endif
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-x86_64.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/arch-x86_64.h
@@ -0,0 +1,198 @@
+/******************************************************************************
+ * arch-x86_64.h
+ * 
+ * Guest OS interface to x86 64-bit Xen.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_64_H__
+#define __XEN_PUBLIC_ARCH_X86_64_H__
+
+/*
+ * SEGMENT DESCRIPTOR TABLES
+ */
+/*
+ * A number of GDT entries are reserved by Xen. These are not situated at the
+ * start of the GDT because some stupid OSes export hard-coded selector values
+ * in their ABI. These hard-coded values are always near the start of the GDT,
+ * so Xen places itself out of the way, at the far end of the GDT.
+ */
+#define FIRST_RESERVED_GDT_PAGE  14
+#define FIRST_RESERVED_GDT_BYTE  (FIRST_RESERVED_GDT_PAGE * 4096)
+#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
+
+/*
+ * 64-bit segment selectors
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+
+#define FLAT_RING3_CS32 0xe023  /* GDT index 7172 (first reserved + 4), RPL 3 */
+#define FLAT_RING3_CS64 0xe033  /* GDT index 7174 (first reserved + 6), RPL 3 */
+#define FLAT_RING3_DS32 0xe02b  /* GDT index 7173 (first reserved + 5), RPL 3 */
+#define FLAT_RING3_DS64 0x0000  /* NULL selector */
+#define FLAT_RING3_SS32 0xe02b  /* GDT index 7173 (first reserved + 5), RPL 3 */
+#define FLAT_RING3_SS64 0xe02b  /* GDT index 7173 (first reserved + 5), RPL 3 */
+
+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
+#define FLAT_KERNEL_DS   FLAT_KERNEL_DS64
+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
+#define FLAT_KERNEL_CS   FLAT_KERNEL_CS64
+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
+#define FLAT_KERNEL_SS   FLAT_KERNEL_SS64
+
+#define FLAT_USER_DS64 FLAT_RING3_DS64
+#define FLAT_USER_DS32 FLAT_RING3_DS32
+#define FLAT_USER_DS   FLAT_USER_DS64
+#define FLAT_USER_CS64 FLAT_RING3_CS64
+#define FLAT_USER_CS32 FLAT_RING3_CS32
+#define FLAT_USER_CS   FLAT_USER_CS64
+#define FLAT_USER_SS64 FLAT_RING3_SS64
+#define FLAT_USER_SS32 FLAT_RING3_SS32
+#define FLAT_USER_SS   FLAT_USER_SS64
+
+/* And the trap vector is... */
+#define TRAP_INSTR "syscall"
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START (0xFFFF800000000000UL)
+#define HYPERVISOR_VIRT_END   (0xFFFF880000000000UL)
+#endif
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
+#ifndef __ASSEMBLY__
+
+/* The machine->physical mapping table starts at this address, read-only. */
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((u32 *)HYPERVISOR_VIRT_START)
+#endif
+
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ *  @which == SEGBASE_*  ;  @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS          0
+#define SEGBASE_GS_USER     1
+#define SEGBASE_GS_KERNEL   2
+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
+
+/*
+ * int HYPERVISOR_switch_to_user(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * If flags contains VGCF_IN_SYSCALL:
+ *   Restore RAX, RIP, RFLAGS, RSP. 
+ *   Discard R11, RCX, CS, SS.
+ * Otherwise:
+ *   Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
+#define VGCF_IN_SYSCALL (1<<8)
+struct switch_to_user {
+    /* Top of stack (%rsp at point of hypercall). */
+    u64 rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+    /* Bottom of switch_to_user stack frame. */
+};
+
+/* NB. The following type is 64 bits wide. */
+typedef unsigned long memory_t;   /* Full-sized pointer/address/memory-size. */
+
+/*
+ * Send an array of these to HYPERVISOR_set_trap_table().
+ * N.B. As in x86/32 mode, the privilege level specifies which modes may enter
+ * a trap via a software interrupt. Since rings 1 and 2 are unavailable, we
+ * allocate privilege levels as follows:
+ *  Level == 0: No one may enter
+ *  Level == 1: Kernel may enter
+ *  Level == 2: Kernel may enter
+ *  Level == 3: Everyone may enter
+ */
+#define TI_GET_DPL(_ti)      ((_ti)->flags & 3)
+#define TI_GET_IF(_ti)       ((_ti)->flags & 4)
+#define TI_SET_DPL(_ti,_dpl) ((_ti)->flags |= (_dpl))
+#define TI_SET_IF(_ti,_if)   ((_ti)->flags |= ((!!(_if))<<2))
+typedef struct trap_info {
+    u8       vector;  /* exception vector                              */
+    u8       flags;   /* bits 0-1: privilege level; bit 2: clear event enable? (see TI_GET_DPL/TI_GET_IF) */
+    u16      cs;      /* code selector                                 */
+    memory_t address; /* code address                                  */
+} trap_info_t;
+
+typedef struct cpu_user_regs {
+    u64 r15;
+    u64 r14;
+    u64 r13;
+    u64 r12;
+    union { u64 rbp, ebp; };
+    union { u64 rbx, ebx; };
+    u64 r11;
+    u64 r10;
+    u64 r9;
+    u64 r8;
+    union { u64 rax, eax; };
+    union { u64 rcx, ecx; };
+    union { u64 rdx, edx; };
+    union { u64 rsi, esi; };
+    union { u64 rdi, edi; };
+    u32 error_code;    /* private */
+    u32 entry_vector;  /* private */
+    union { u64 rip, eip; };
+    u16 cs, _pad0[1];
+    u8  saved_upcall_mask;
+    u8  _pad1[3];
+    union { u64 rflags, eflags; };
+    union { u64 rsp, esp; };
+    u16 ss, _pad2[3];
+    u16 es, _pad3[3];
+    u16 ds, _pad4[3];
+    u16 fs, _pad5[3]; /* Non-zero => takes precedence over fs_base.      */
+    u16 gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_user. */
+} cpu_user_regs_t;
+
+typedef u64 tsc_timestamp_t; /* RDTSC timestamp */
+
+/*
+ * The following is all CPU context. Note that the fpu_ctxt block is filled 
+ * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
+ */
+typedef struct vcpu_guest_context {
+    /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
+    struct { char x[512]; } fpu_ctxt;       /* User-level FPU registers     */
+#define VGCF_I387_VALID (1<<0)
+#define VGCF_VMX_GUEST  (1<<1)
+#define VGCF_IN_KERNEL  (1<<2)
+    unsigned long flags;                    /* VGCF_* flags                 */
+    cpu_user_regs_t user_regs;              /* User-level CPU registers     */
+    trap_info_t   trap_ctxt[256];           /* Virtual IDT                  */
+    unsigned long ldt_base, ldt_ents;       /* LDT (linear address, # ents) */
+    unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
+    unsigned long kernel_ss, kernel_sp;     /* Virtual TSS (only SS1/SP1)   */
+    unsigned long ctrlreg[8];               /* CR0-CR7 (control registers)  */
+    unsigned long debugreg[8];              /* DB0-DB7 (debug registers)    */
+    unsigned long event_callback_eip;
+    unsigned long failsafe_callback_eip;
+    unsigned long syscall_callback_eip;
+    unsigned long vm_assist;                /* VMASST_TYPE_* bitmap */
+    /* Segment base addresses. */
+    u64           fs_base;
+    u64           gs_base_kernel;
+    u64           gs_base_user;
+} vcpu_guest_context_t;
+
+typedef struct arch_shared_info {
+    /* MFN of a table of MFNs that make up p2m table */
+    u64 pfn_to_mfn_frame_list;
+} arch_shared_info_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/dom0_ops.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/dom0_ops.h
@@ -0,0 +1,414 @@
+/******************************************************************************
+ * dom0_ops.h
+ * 
+ * Process command requests from domain-0 guest OS.
+ * 
+ * Copyright (c) 2002-2003, B Dragovic
+ * Copyright (c) 2002-2004, K Fraser
+ */
+
+
+#ifndef __XEN_PUBLIC_DOM0_OPS_H__
+#define __XEN_PUBLIC_DOM0_OPS_H__
+
+#include "xen-public/xen.h"
+#include "xen-public/sched_ctl.h"
+
+/*
+ * Make sure you increment the interface version whenever you modify this file!
+ * This makes sure that old versions of dom0 tools will stop working in a
+ * well-defined way (rather than crashing the machine, for instance).
+ */
+#define DOM0_INTERFACE_VERSION   0xAAAA100E
+
+/************************************************************************/
+
+#define DOM0_GETMEMLIST        2
+typedef struct {
+    /* IN variables. */
+    domid_t       domain;
+    memory_t      max_pfns;
+    void         *buffer;
+    /* OUT variables. */
+    memory_t      num_pfns;
+} dom0_getmemlist_t;
+
+#define DOM0_SCHEDCTL          6
+/* struct sched_ctl_cmd is from sched_ctl.h    */
+typedef struct sched_ctl_cmd dom0_schedctl_t;
+
+#define DOM0_ADJUSTDOM         7
+/* struct sched_adjdom_cmd is from sched_ctl.h */
+typedef struct sched_adjdom_cmd dom0_adjustdom_t;
+
+#define DOM0_CREATEDOMAIN      8
+typedef struct {
+    /* IN parameters */
+    u32 ssidref;
+    /* IN/OUT parameters. */
+    /* Identifier for new domain (auto-allocate if zero is specified). */
+    domid_t domain;
+} dom0_createdomain_t;
+
+#define DOM0_DESTROYDOMAIN     9
+typedef struct {
+    /* IN variables. */
+    domid_t      domain;
+} dom0_destroydomain_t;
+
+#define DOM0_PAUSEDOMAIN      10
+typedef struct {
+    /* IN parameters. */
+    domid_t domain;
+} dom0_pausedomain_t;
+
+#define DOM0_UNPAUSEDOMAIN    11
+typedef struct {
+    /* IN parameters. */
+    domid_t domain;
+} dom0_unpausedomain_t;
+
+#define DOM0_GETDOMAININFO    12
+typedef struct {
+    /* IN variables. */
+    domid_t  domain;                  /* NB. IN/OUT variable. */
+    /* OUT variables. */
+#define DOMFLAGS_DYING     (1<<0) /* Domain is scheduled to die.             */
+#define DOMFLAGS_SHUTDOWN  (1<<2) /* The guest OS has shut down.             */
+#define DOMFLAGS_PAUSED    (1<<3) /* Currently paused by control software.   */
+#define DOMFLAGS_BLOCKED   (1<<4) /* Currently blocked pending an event.     */
+#define DOMFLAGS_RUNNING   (1<<5) /* Domain is currently running.            */
+#define DOMFLAGS_CPUMASK      255 /* CPU to which this domain is bound.      */
+#define DOMFLAGS_CPUSHIFT       8
+#define DOMFLAGS_SHUTDOWNMASK 255 /* DOMFLAGS_SHUTDOWN guest-supplied code.  */
+#define DOMFLAGS_SHUTDOWNSHIFT 16
+    u32      flags;
+    memory_t tot_pages;
+    memory_t max_pages;
+    memory_t shared_info_frame;       /* MFN of shared_info struct */
+    u64      cpu_time;
+    u32      n_vcpu;
+    s32      vcpu_to_cpu[MAX_VIRT_CPUS];  /* current mapping   */
+    cpumap_t cpumap[MAX_VIRT_CPUS];       /* allowable mapping */
+    u32             ssidref;
+} dom0_getdomaininfo_t;
+
+#define DOM0_SETDOMAININFO      13
+typedef struct {
+    /* IN variables. */
+    domid_t                   domain;
+    u16                       vcpu;
+    /* IN/OUT parameters */
+    vcpu_guest_context_t *ctxt;
+} dom0_setdomaininfo_t;
+
+#define DOM0_MSR              15
+typedef struct {
+    /* IN variables. */
+    u32 write;
+    u32 cpu_mask;
+    u32 msr;
+    u32 in1;
+    u32 in2;
+    /* OUT variables. */
+    u32 out1;
+    u32 out2;
+} dom0_msr_t;
+
+#define DOM0_DEBUG            16
+typedef struct {
+    /* IN variables. */
+    domid_t domain;
+    u8  opcode;
+    u32 in1;
+    u32 in2;
+    u32 in3;
+    u32 in4;
+    /* OUT variables. */
+    u32 status;
+    u32 out1;
+    u32 out2;
+} dom0_debug_t;
+
+/*
+ * Set clock such that it would read <secs,usecs> after 00:00:00 UTC,
+ * 1 January, 1970 if the current system time was <system_time>.
+ */
+#define DOM0_SETTIME          17
+typedef struct {
+    /* IN variables. */
+    u32 secs;
+    u32 usecs;
+    u64 system_time;
+} dom0_settime_t;
+
+#define DOM0_GETPAGEFRAMEINFO 18
+#define NOTAB 0         /* normal page */
+#define L1TAB (1U<<28)  /* L1TAB..L4TAB: values 1-4 in bits 28-30 */
+#define L2TAB (2U<<28)
+#define L3TAB (3U<<28)
+#define L4TAB (4U<<28)
+#define LPINTAB  (1U<<31) /* bit 31; unsigned so the shift never reaches int's sign bit */
+#define XTAB  (0xfU<<28) /* invalid page */
+#define LTAB_MASK XTAB   /* all four type bits (28-31) */
+#define LTABTYPE_MASK (0x7U<<28) /* bits 28-30 only, i.e. LTAB_MASK without LPINTAB */
+
+typedef struct {
+    /* IN variables. */
+    memory_t pfn;          /* Machine page frame number to query.       */
+    domid_t domain;        /* To which domain does the frame belong?    */
+    /* OUT variables. */
+    /* Is the page PINNED to a type? */
+    u32 type;              /* see above type defs */
+} dom0_getpageframeinfo_t;
+
+/*
+ * Read console content from Xen buffer ring.
+ */
+#define DOM0_READCONSOLE      19
+typedef struct {
+    /* IN variables. */
+    u32      clear;        /* Non-zero -> clear after reading. */
+    /* IN/OUT variables. */
+    char    *buffer;       /* In: Buffer start; Out: Used buffer start */
+    u32      count;        /* In: Buffer size;  Out: Used buffer size  */
+} dom0_readconsole_t;
+
+/* 
+ * Set which physical cpus a vcpu can execute on.
+ */
+#define DOM0_PINCPUDOMAIN     20
+typedef struct {
+    /* IN variables. */
+    domid_t      domain;
+    u16          vcpu;
+    cpumap_t     *cpumap;
+} dom0_pincpudomain_t;
+
+/* Get trace buffers machine base address */
+#define DOM0_TBUFCONTROL       21
+typedef struct {
+    /* IN variables */
+#define DOM0_TBUF_GET_INFO     0
+#define DOM0_TBUF_SET_CPU_MASK 1
+#define DOM0_TBUF_SET_EVT_MASK 2
+    u8 op;
+    /* IN/OUT variables */
+    unsigned long cpu_mask;
+    u32           evt_mask;
+    /* OUT variables */
+    memory_t mach_addr;
+    u32      size;
+} dom0_tbufcontrol_t;
+
+/*
+ * Get physical information about the host machine
+ */
+#define DOM0_PHYSINFO         22
+typedef struct {
+    u32      threads_per_core;
+    u32      cores_per_socket;
+    u32      sockets_per_node;
+    u32      nr_nodes;
+    u32      cpu_khz;
+    memory_t total_pages;
+    memory_t free_pages;
+} dom0_physinfo_t;
+
+/*
+ * Get the ID of the current scheduler.
+ */
+#define DOM0_SCHED_ID        24
+typedef struct {
+    /* OUT variable */
+    u32 sched_id;
+} dom0_sched_id_t;
+
+/* 
+ * Control shadow pagetables operation
+ */
+#define DOM0_SHADOW_CONTROL  25
+
+#define DOM0_SHADOW_CONTROL_OP_OFF         0
+#define DOM0_SHADOW_CONTROL_OP_ENABLE_TEST 1
+#define DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY 2
+#define DOM0_SHADOW_CONTROL_OP_ENABLE_TRANSLATE 3
+
+#define DOM0_SHADOW_CONTROL_OP_FLUSH       10     /* table ops */
+#define DOM0_SHADOW_CONTROL_OP_CLEAN       11
+#define DOM0_SHADOW_CONTROL_OP_PEEK        12
+
+typedef struct dom0_shadow_control
+{
+    u32 fault_count;
+    u32 dirty_count;
+    u32 dirty_net_count;     
+    u32 dirty_block_count;     
+} dom0_shadow_control_stats_t;
+
+typedef struct {
+    /* IN variables. */
+    domid_t        domain;
+    u32            op;
+    unsigned long *dirty_bitmap; /* pointer to locked buffer */
+    /* IN/OUT variables. */
+    memory_t       pages;        /* size of buffer, updated with actual size */
+    /* OUT variables. */
+    dom0_shadow_control_stats_t stats;
+} dom0_shadow_control_t;
+
+#define DOM0_SETDOMAINMAXMEM   28
+typedef struct {
+    /* IN variables. */
+    domid_t     domain;
+    memory_t    max_memkb;
+} dom0_setdomainmaxmem_t;
+
+#define DOM0_GETPAGEFRAMEINFO2 29   /* batched interface */
+typedef struct {
+    /* IN variables. */
+    domid_t  domain;
+    memory_t num;
+    /* IN/OUT variables. */
+    unsigned long *array;
+} dom0_getpageframeinfo2_t;
+
+/*
+ * Request memory range (@pfn, @pfn+@nr_pfns-1) to have type @type.
+ * On x86, @type is an architecture-defined MTRR memory type.
+ * On success, returns the MTRR that was used (@reg) and a handle that can
+ * be passed to DOM0_DEL_MEMTYPE to accurately tear down the new setting.
+ * (x86-specific).
+ */
+#define DOM0_ADD_MEMTYPE         31
+typedef struct {
+    /* IN variables. */
+    memory_t pfn;
+    memory_t nr_pfns;
+    u32      type;
+    /* OUT variables. */
+    u32      handle;
+    u32      reg;
+} dom0_add_memtype_t;
+
+/*
+ * Tear down an existing memory-range type. If @handle is remembered then it
+ * should be passed in to accurately tear down the correct setting (in case
+ * of overlapping memory regions with differing types). If it is not known
+ * then @handle should be set to zero. In all cases @reg must be set.
+ * (x86-specific).
+ */
+#define DOM0_DEL_MEMTYPE         32
+typedef struct {
+    /* IN variables. */
+    u32      handle;
+    u32      reg;
+} dom0_del_memtype_t;
+
+/* Read current type of an MTRR (x86-specific). */
+#define DOM0_READ_MEMTYPE        33
+typedef struct {
+    /* IN variables. */
+    u32      reg;
+    /* OUT variables. */
+    memory_t pfn;
+    memory_t nr_pfns;
+    u32      type;
+} dom0_read_memtype_t;
+
+/* Interface for controlling Xen software performance counters. */
+#define DOM0_PERFCCONTROL        34
+/* Sub-operations: */
+#define DOM0_PERFCCONTROL_OP_RESET 1   /* Reset all counters to zero. */
+#define DOM0_PERFCCONTROL_OP_QUERY 2   /* Get perfctr information. */
+typedef struct {
+    u8      name[80];               /*  name of perf counter */
+    u32     nr_vals;                /* number of values for this counter */
+    u32     vals[64];               /* array of values */
+} dom0_perfc_desc_t;
+typedef struct {
+    /* IN variables. */
+    u32            op;                /*  DOM0_PERFCCONTROL_OP_??? */
+    /* OUT variables. */
+    u32            nr_counters;       /*  number of counters */
+    dom0_perfc_desc_t *desc;          /*  counter information (or NULL) */
+} dom0_perfccontrol_t;
+
+#define DOM0_MICROCODE           35
+typedef struct {
+    /* IN variables. */
+    void   *data;                     /* Pointer to microcode data */
+    u32     length;                   /* Length of microcode data. */
+} dom0_microcode_t;
+
+#define DOM0_IOPORT_PERMISSION   36
+typedef struct {
+    domid_t domain;                   /* domain to be affected */
+    u16     first_port;               /* first port in range */
+    u16     nr_ports;                 /* size of port range */
+    u16     allow_access;             /* allow or deny access to range? */
+} dom0_ioport_permission_t;
+
+#define DOM0_GETVCPUCONTEXT      37
+typedef struct {
+    domid_t domain;                   /* domain to be affected */
+    u16     vcpu;                     /* vcpu # */
+    vcpu_guest_context_t *ctxt;       /* NB. IN/OUT variable. */
+    u64     cpu_time;                 
+} dom0_getvcpucontext_t;
+
+#define DOM0_GETDOMAININFOLIST   38
+typedef struct {
+    /* IN variables. */
+    domid_t               first_domain;
+    memory_t              max_domains;
+    dom0_getdomaininfo_t *buffer;
+    /* OUT variables. */
+    memory_t              num_domains;
+} dom0_getdomaininfolist_t;
+
+#define DOM0_PLATFORM_QUIRK      39  
+#define QUIRK_NOIRQBALANCING  1
+typedef struct {
+    /* IN variables. */
+    int quirk_id;
+} dom0_platform_quirk_t;
+
+typedef struct {
+    u32 cmd;
+    u32 interface_version; /* DOM0_INTERFACE_VERSION */
+    union {
+        dom0_createdomain_t      createdomain;
+        dom0_pausedomain_t       pausedomain;
+        dom0_unpausedomain_t     unpausedomain;
+        dom0_destroydomain_t     destroydomain;
+        dom0_getmemlist_t        getmemlist;
+        dom0_schedctl_t          schedctl;
+        dom0_adjustdom_t         adjustdom;
+        dom0_setdomaininfo_t     setdomaininfo;
+        dom0_getdomaininfo_t     getdomaininfo;
+        dom0_getpageframeinfo_t  getpageframeinfo;
+        dom0_msr_t               msr;
+        dom0_debug_t             debug;
+        dom0_settime_t           settime;
+        dom0_readconsole_t       readconsole;
+        dom0_pincpudomain_t      pincpudomain;
+        dom0_tbufcontrol_t       tbufcontrol;
+        dom0_physinfo_t          physinfo;
+        dom0_sched_id_t          sched_id;
+        dom0_shadow_control_t    shadow_control;
+        dom0_setdomainmaxmem_t   setdomainmaxmem;
+        dom0_getpageframeinfo2_t getpageframeinfo2;
+        dom0_add_memtype_t       add_memtype;
+        dom0_del_memtype_t       del_memtype;
+        dom0_read_memtype_t      read_memtype;
+        dom0_perfccontrol_t      perfccontrol;
+        dom0_microcode_t         microcode;
+        dom0_ioport_permission_t ioport_permission;
+        dom0_getvcpucontext_t    getvcpucontext;
+        dom0_getdomaininfolist_t getdomaininfolist;
+        dom0_platform_quirk_t    platform_quirk;
+    } u;
+} dom0_op_t;
+
+#endif /* __XEN_PUBLIC_DOM0_OPS_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/event_channel.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/event_channel.h
@@ -0,0 +1,191 @@
+/******************************************************************************
+ * event_channel.h
+ * 
+ * Event channels between domains.
+ * 
+ * Copyright (c) 2003-2004, K A Fraser.
+ */
+
+#ifndef __XEN_PUBLIC_EVENT_CHANNEL_H__
+#define __XEN_PUBLIC_EVENT_CHANNEL_H__
+
+/*
+ * EVTCHNOP_alloc_unbound: Prepare a local port for binding to <dom>.
+ * <port> may be wildcarded by setting to zero, in which case a fresh port
+ * will be allocated, and the field filled in on return.
+ */
+#define EVTCHNOP_alloc_unbound    6
+typedef struct evtchn_alloc_unbound {
+    /* IN parameters */
+    domid_t dom;
+    /* IN/OUT parameters */
+    u32     port;
+} evtchn_alloc_unbound_t;
+
+/*
+ * EVTCHNOP_bind_interdomain: Construct an interdomain event channel between
+ * <dom1> and <dom2>. Either <port1> or <port2> may be wildcarded by setting to
+ * zero. On successful return both <port1> and <port2> are filled in and
+ * <dom1,port1> is fully bound to <dom2,port2>.
+ * 
+ * NOTES:
+ *  1. A wildcarded port is allocated from the relevant domain's free list
+ *     (i.e., some port that was previously EVTCHNSTAT_closed). However, if the
+ *     remote port pair is already fully bound then a port is not allocated,
+ *     and instead the existing local port is returned to the caller.
+ *  2. If the caller is unprivileged then <dom1> must be DOMID_SELF.
+ *  3. If the caller is unprivileged and <dom2,port2> is EVTCHNSTAT_closed
+ *     then <dom2> must be DOMID_SELF.
+ *  4. If either port is already bound then it must be bound to the other
+ *     specified domain and port (if not wildcarded).
+ *  5. If either port is awaiting binding (EVTCHNSTAT_unbound) then it must
+ *     be awaiting binding to the other domain, and the other port pair must
+ *     be closed or unbound.
+ */
+#define EVTCHNOP_bind_interdomain 0
+typedef struct evtchn_bind_interdomain {
+    /* IN parameters. */
+    domid_t dom1, dom2;
+    /* IN/OUT parameters. */
+    u32     port1, port2;
+} evtchn_bind_interdomain_t;
+
+/*
+ * EVTCHNOP_bind_virq: Bind a local event channel to virtual IRQ <virq> on calling vcpu.
+ * NOTES:
+ *  1. A virtual IRQ may be bound to at most one event channel per vcpu.
+ *  2. The allocated event channel is bound to the calling vcpu. The binding
+ *     may not be changed.
+ */
+#define EVTCHNOP_bind_virq        1
+typedef struct evtchn_bind_virq {
+    /* IN parameters. */
+    u32 virq;
+    /* OUT parameters. */
+    u32 port;
+} evtchn_bind_virq_t;
+
+/*
+ * EVTCHNOP_bind_pirq: Bind a local event channel to physical IRQ <pirq>.
+ * NOTES:
+ *  1. A physical IRQ may be bound to at most one event channel per domain.
+ *  2. Only a sufficiently-privileged domain may bind to a physical IRQ.
+ */
+#define EVTCHNOP_bind_pirq        2
+typedef struct evtchn_bind_pirq {
+    /* IN parameters. */
+    u32 pirq;
+#define BIND_PIRQ__WILL_SHARE 1
+    u32 flags; /* BIND_PIRQ__* */
+    /* OUT parameters. */
+    u32 port;
+} evtchn_bind_pirq_t;
+
+/*
+ * EVTCHNOP_bind_ipi: Bind a local event channel to receive events.
+ * NOTES:
+ *  1. The allocated event channel is bound to the calling vcpu. The binding
+ *     may not be changed.
+ */
+#define EVTCHNOP_bind_ipi         7
+typedef struct evtchn_bind_ipi {
+    /* OUT parameters. */
+    u32 port;
+} evtchn_bind_ipi_t;
+
+/*
+ * EVTCHNOP_close: Close the communication channel which has an endpoint at
+ * <dom, port>. If the channel is interdomain then the remote end is placed in
+ * the unbound state (EVTCHNSTAT_unbound), awaiting a new connection.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may close an event channel
+ *     for which <dom> is not DOMID_SELF.
+ */
+#define EVTCHNOP_close            3
+typedef struct evtchn_close {
+    /* IN parameters. */
+    domid_t dom;
+    u32     port;
+    /* No OUT parameters. */
+} evtchn_close_t;
+
+/*
+ * EVTCHNOP_send: Send an event to the remote end of the channel whose local
+ * endpoint is <DOMID_SELF, local_port>.
+ */
+#define EVTCHNOP_send             4
+typedef struct evtchn_send {
+    /* IN parameters. */
+    u32     local_port;
+    /* No OUT parameters. */
+} evtchn_send_t;
+
+/*
+ * EVTCHNOP_status: Get the current status of the communication channel which
+ * has an endpoint at <dom, port>.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may obtain the status of an event
+ *     channel for which <dom> is not DOMID_SELF.
+ */
+#define EVTCHNOP_status           5
+typedef struct evtchn_status {
+    /* IN parameters */
+    domid_t dom;
+    u32     port;
+    /* OUT parameters */
+#define EVTCHNSTAT_closed       0  /* Channel is not in use.                 */
+#define EVTCHNSTAT_unbound      1  /* Channel is waiting interdom connection.*/
+#define EVTCHNSTAT_interdomain  2  /* Channel is connected to remote domain. */
+#define EVTCHNSTAT_pirq         3  /* Channel is bound to a phys IRQ line.   */
+#define EVTCHNSTAT_virq         4  /* Channel is bound to a virtual IRQ line */
+#define EVTCHNSTAT_ipi          5  /* Channel is bound to a virtual IPI line */
+    u32     status;
+    u32     vcpu;                  /* VCPU to which this channel is bound.   */
+    union {
+        struct {
+            domid_t dom;
+        } unbound; /* EVTCHNSTAT_unbound */
+        struct {
+            domid_t dom;
+            u32     port;
+        } interdomain; /* EVTCHNSTAT_interdomain */
+        u32 pirq;      /* EVTCHNSTAT_pirq        */
+        u32 virq;      /* EVTCHNSTAT_virq        */
+    } u;
+} evtchn_status_t;
+
+/*
+ * EVTCHNOP_bind_vcpu: Specify which vcpu a channel should notify when an
+ * event is pending.
+ * NOTES:
+ *  1. IPI- and VIRQ-bound channels always notify the vcpu that initialised
+ *     the binding. This binding cannot be changed.
+ *  2. All other channels notify vcpu0 by default. This default is set when
+ *     the channel is allocated (a port that is freed and subsequently reused
+ *     has its binding reset to vcpu0).
+ */
+#define EVTCHNOP_bind_vcpu        8
+typedef struct evtchn_bind_vcpu {
+    /* IN parameters. */
+    u32 port;
+    u32 vcpu;
+} evtchn_bind_vcpu_t;
+
+typedef struct evtchn_op {
+    u32 cmd; /* EVTCHNOP_* */
+    union {
+        evtchn_alloc_unbound_t    alloc_unbound;
+        evtchn_bind_interdomain_t bind_interdomain;
+        evtchn_bind_virq_t        bind_virq;
+        evtchn_bind_pirq_t        bind_pirq;
+        evtchn_bind_ipi_t         bind_ipi;
+        evtchn_close_t            close;
+        evtchn_send_t             send;
+        evtchn_status_t           status;
+        evtchn_bind_vcpu_t        bind_vcpu;
+    } u;
+} evtchn_op_t;
+
+#endif /* __XEN_PUBLIC_EVENT_CHANNEL_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/grant_table.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/grant_table.h
@@ -0,0 +1,275 @@
+/******************************************************************************
+ * grant_table.h
+ * 
+ * Interface for granting foreign access to page frames, and receiving
+ * page-ownership transfers.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_GRANT_TABLE_H__
+#define __XEN_PUBLIC_GRANT_TABLE_H__
+
+
+/***********************************
+ * GRANT TABLE REPRESENTATION
+ */
+
+/* Some rough guidelines on accessing and updating grant-table entries
+ * in a concurrency-safe manner. For more information, Linux contains a
+ * reference implementation for guest OSes (arch/xen/kernel/grant_table.c).
+ * 
+ * NB. WMB is a no-op on current-generation x86 processors. However, a
+ *     compiler barrier will still be required.
+ * 
+ * Introducing a valid entry into the grant table:
+ *  1. Write ent->domid.
+ *  2. Write ent->frame:
+ *      GTF_permit_access:   Frame to which access is permitted.
+ *      GTF_accept_transfer: Pseudo-phys frame slot being filled by new
+ *                           frame, or zero if none.
+ *  3. Write memory barrier (WMB).
+ *  4. Write ent->flags, inc. valid type.
+ * 
+ * Invalidating an unused GTF_permit_access entry:
+ *  1. flags = ent->flags.
+ *  2. Observe that !(flags & (GTF_reading|GTF_writing)).
+ *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
+ *  NB. No need for WMB as reuse of entry is control-dependent on success of
+ *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
+ *
+ * Invalidating an in-use GTF_permit_access entry:
+ *  This cannot be done directly. Request assistance from the domain controller
+ *  which can set a timeout on the use of a grant entry and take necessary
+ *  action. (NB. This is not yet implemented!).
+ * 
+ * Invalidating an unused GTF_accept_transfer entry:
+ *  1. flags = ent->flags.
+ *  2. Observe that !(flags & GTF_transfer_committed). [*]
+ *  3. Check result of SMP-safe CMPXCHG(&ent->flags, flags, 0).
+ *  NB. No need for WMB as reuse of entry is control-dependent on success of
+ *      step 3, and all architectures guarantee ordering of ctrl-dep writes.
+ *  [*] If GTF_transfer_committed is set then the grant entry is 'committed'.
+ *      The guest must /not/ modify the grant entry until the address of the
+ *      transferred frame is written. It is safe for the guest to spin waiting
+ *      for this to occur (detect by observing GTF_transfer_completed in
+ *      ent->flags).
+ *
+ * Invalidating a committed GTF_accept_transfer entry:
+ *  1. Wait for (ent->flags & GTF_transfer_completed).
+ *
+ * Changing a GTF_permit_access from writable to read-only:
+ *  Use SMP-safe CMPXCHG to set GTF_readonly, while checking !GTF_writing.
+ * 
+ * Changing a GTF_permit_access from read-only to writable:
+ *  Use SMP-safe bit-setting instruction.
+ */
+
+/*
+ * A grant table comprises a packed array of grant entries in one or more
+ * page frames shared between Xen and a guest.
+ * [XEN]: This field is written by Xen and read by the sharing guest.
+ * [GST]: This field is written by the guest and read by Xen.
+ */
+typedef struct grant_entry {
+    /* GTF_xxx: various type and flag information.  [XEN,GST] */
+    u16     flags;
+    /* The domain being granted foreign privileges. [GST] */
+    domid_t domid;
+    /*
+     * GTF_permit_access: Frame that @domid is allowed to map and access. [GST]
+     * GTF_accept_transfer: Frame whose ownership transferred by @domid. [XEN]
+     */
+    u32     frame;
+} grant_entry_t;
+
+/*
+ * Type of grant entry.
+ *  GTF_invalid: This grant entry grants no privileges.
+ *  GTF_permit_access: Allow @domid to map/access @frame.
+ *  GTF_accept_transfer: Allow @domid to transfer ownership of one page frame
+ *                       to this guest. Xen writes the page number to @frame.
+ */
+#define GTF_invalid         (0U<<0)
+#define GTF_permit_access   (1U<<0)
+#define GTF_accept_transfer (2U<<0)
+#define GTF_type_mask       (3U<<0)
+
+/*
+ * Subflags for GTF_permit_access.
+ *  GTF_readonly: Restrict @domid to read-only mappings and accesses. [GST]
+ *  GTF_reading: Grant entry is currently mapped for reading by @domid. [XEN]
+ *  GTF_writing: Grant entry is currently mapped for writing by @domid. [XEN]
+ */
+#define _GTF_readonly       (2)
+#define GTF_readonly        (1U<<_GTF_readonly)
+#define _GTF_reading        (3)
+#define GTF_reading         (1U<<_GTF_reading)
+#define _GTF_writing        (4)
+#define GTF_writing         (1U<<_GTF_writing)
+
+/*
+ * Subflags for GTF_accept_transfer:
+ *  GTF_transfer_committed: Xen sets this flag to indicate that it is committed
+ *      to transferring ownership of a page frame. When a guest sees this flag
+ *      it must /not/ modify the grant entry until GTF_transfer_completed is
+ *      set by Xen.
+ *  GTF_transfer_completed: It is safe for the guest to spin-wait on this flag
+ *      after reading GTF_transfer_committed. Xen will always write the frame
+ *      address, followed by ORing this flag, in a timely manner.
+ */
+#define _GTF_transfer_committed (2)
+#define GTF_transfer_committed  (1U<<_GTF_transfer_committed)
+#define _GTF_transfer_completed (3)
+#define GTF_transfer_completed  (1U<<_GTF_transfer_completed)
+
+
+/***********************************
+ * GRANT TABLE QUERIES AND USES
+ */
+
+/*
+ * Reference to a grant entry in a specified domain's grant table.
+ */
+typedef u16 grant_ref_t;
+
+/*
+ * GNTTABOP_map_grant_ref: Map the grant entry (<dom>,<ref>) for access
+ * by devices and/or host CPUs. If successful, <handle> is a tracking number
+ * that must be presented later to destroy the mapping(s). On error, <handle>
+ * is a negative status code.
+ * NOTES:
+ *  1. If GNTMAP_device_map is specified then <dev_bus_addr> is the address
+ *     via which I/O devices may access the granted frame.
+ *  2. If GNTMAP_host_map is specified then a mapping will be added at
+ *     virtual address <host_virt_addr> in the current address space.
+ *  3. Mappings should only be destroyed via GNTTABOP_unmap_grant_ref. If a
+ *     host mapping is destroyed by other means then it is *NOT* guaranteed
+ *     to be accounted to the correct grant reference!
+ */
+#define GNTTABOP_map_grant_ref        0
+typedef struct gnttab_map_grant_ref {
+    /* IN parameters. */
+    memory_t    host_virt_addr;
+    domid_t     dom;
+    grant_ref_t ref;
+    u16         flags;                /* GNTMAP_* */
+    /* OUT parameters. */
+    s16         handle;               /* +ve: handle; -ve: GNTST_* */
+    memory_t    dev_bus_addr;
+} gnttab_map_grant_ref_t;
+
+/*
+ * GNTTABOP_unmap_grant_ref: Destroy one or more grant-reference mappings
+ * tracked by <handle>. If <host_virt_addr> or <dev_bus_addr> is zero, that
+ * field is ignored. If non-zero, they must refer to a device/host mapping
+ * that is tracked by <handle>
+ * NOTES:
+ *  1. The call may fail in an undefined manner if either mapping is not
+ *     tracked by <handle>.
+ *  2. After executing a batch of unmaps, it is guaranteed that no stale
+ *     mappings will remain in the device or host TLBs.
+ */
+#define GNTTABOP_unmap_grant_ref      1
+typedef struct gnttab_unmap_grant_ref {
+    /* IN parameters. */
+    memory_t    host_virt_addr;
+    memory_t    dev_bus_addr;
+    u16         handle;
+    /* OUT parameters. */
+    s16         status;               /* GNTST_* */
+} gnttab_unmap_grant_ref_t;
+
+#define GNTUNMAP_DEV_FROM_VIRT (~0U)
+
+/*
+ * GNTTABOP_setup_table: Set up a grant table for <dom> comprising at least
+ * <nr_frames> pages. The frame addresses are written to the <frame_list>.
+ * Only <nr_frames> addresses are written, even if the table is larger.
+ * NOTES:
+ *  1. <dom> may be specified as DOMID_SELF.
+ *  2. Only a sufficiently-privileged domain may specify <dom> != DOMID_SELF.
+ *  3. Xen may not support more than a single grant-table page per domain.
+ */
+#define GNTTABOP_setup_table          2
+typedef struct gnttab_setup_table {
+    /* IN parameters. */
+    domid_t     dom;
+    u16         nr_frames;
+    /* OUT parameters. */
+    s16         status;               /* GNTST_* */
+    unsigned long *frame_list;
+} gnttab_setup_table_t;
+
+/*
+ * GNTTABOP_dump_table: Dump the contents of the grant table to the
+ * xen console. Debugging use only.
+ */
+#define GNTTABOP_dump_table           3
+typedef struct gnttab_dump_table {
+    /* IN parameters. */
+    domid_t     dom;
+    /* OUT parameters. */
+    s16         status;               /* GNTST_* */
+} gnttab_dump_table_t;
+
+/*
+ * GNTTABOP_donate: Donate frame <mfn> to a foreign domain.  The
+ * foreign domain has previously registered the details of the transfer.
+ * These can be identified from <handle>, a grant reference.
+ */
+#define GNTTABOP_donate                4
+typedef struct {
+    memory_t    mfn;                 /*  0 */
+    domid_t     domid;               /*  4 */
+    u16         handle;               /*  8 */
+    s16         status;               /*  10: GNTST_* */
+    u32         __pad;
+} gnttab_donate_t;           /*  14 bytes */
+
+/*
+ * Bitfield values for gnttab_map_grant_ref.flags.
+ */
+ /* Map the grant entry for access by I/O devices. */
+#define _GNTMAP_device_map      (0)
+#define GNTMAP_device_map       (1<<_GNTMAP_device_map)
+ /* Map the grant entry for access by host CPUs. */
+#define _GNTMAP_host_map        (1)
+#define GNTMAP_host_map         (1<<_GNTMAP_host_map)
+ /* Accesses to the granted frame will be restricted to read-only access. */
+#define _GNTMAP_readonly        (2)
+#define GNTMAP_readonly         (1<<_GNTMAP_readonly)
+ /*
+  * GNTMAP_host_map subflag:
+  *  0 => The host mapping is usable only by the guest OS.
+  *  1 => The host mapping is usable by guest OS + current application.
+  */
+#define _GNTMAP_application_map (3)
+#define GNTMAP_application_map  (1<<_GNTMAP_application_map)
+
+/*
+ * Values for error status returns. All errors are -ve.
+ */
+#define GNTST_okay             (0)
+#define GNTST_general_error    (-1) /* General undefined error.              */
+#define GNTST_bad_domain       (-2) /* Unrecognised domain id.               */
+#define GNTST_bad_gntref       (-3) /* Unrecognised or inappropriate gntref. */
+#define GNTST_bad_handle       (-4) /* Unrecognised or inappropriate handle. */
+#define GNTST_bad_virt_addr    (-5) /* Inappropriate virtual address to map. */
+#define GNTST_bad_dev_addr     (-6) /* Inappropriate device address to unmap.*/
+#define GNTST_no_device_space  (-7) /* Out of space in I/O MMU.              */
+#define GNTST_permission_denied (-8) /* Not enough privilege for operation.  */
+
+#define GNTTABOP_error_msgs {                   \
+    "okay",                                     \
+    "undefined error",                          \
+    "unrecognised domain id",                   \
+    "invalid grant reference",                  \
+    "invalid mapping handle",                   \
+    "invalid virtual address",                  \
+    "invalid device address",                   \
+    "no spare translation slot in the I/O MMU", \
+    "permission denied"                         \
+}
+
+#endif /* __XEN_PUBLIC_GRANT_TABLE_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/blkif.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/blkif.h
@@ -0,0 +1,99 @@
+/******************************************************************************
+ * blkif.h
+ * 
+ * Unified block-device I/O interface for Xen guest OSes.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __XEN_PUBLIC_IO_BLKIF_H__
+#define __XEN_PUBLIC_IO_BLKIF_H__
+
+#include "ring.h"
+
+#ifndef blkif_vdev_t
+#define blkif_vdev_t   u16
+#endif
+#define blkif_sector_t u64
+
+#define BLKIF_OP_READ      0
+#define BLKIF_OP_WRITE     1
+#define BLKIF_OP_PROBE     2
+
+/* NB. Ring size must be small enough for sizeof(blkif_ring_t) <= PAGE_SIZE. */
+#define BLKIF_RING_SIZE        64
+
+/*
+ * Maximum scatter/gather segments per request.
+ * This is carefully chosen so that sizeof(blkif_ring_t) <= PAGE_SIZE.
+ * NB. This could be 12 if the ring indexes weren't stored in the same page.
+ */
+#define BLKIF_MAX_SEGMENTS_PER_REQUEST 11
+
+typedef struct blkif_request {
+    u8             operation;    /* BLKIF_OP_???                         */
+    u8             nr_segments;  /* number of segments                   */
+    blkif_vdev_t   device;       /* only for read/write requests         */
+    unsigned long  id;           /* private guest value, echoed in resp  */
+    blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+    /* @f_a_s[2:0]=last_sect ; @f_a_s[5:3]=first_sect                        */
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+    /* @f_a_s[:16]= grant reference (16 bits)                                */
+#else
+    /* @f_a_s[:12]=@frame: machine page frame number.                        */
+#endif
+    /* @first_sect: first sector in frame to transfer (inclusive).           */
+    /* @last_sect: last sector in frame to transfer (inclusive).             */
+    unsigned long  frame_and_sects[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+} blkif_request_t;
+
+#define blkif_first_sect(_fas) (((_fas)>>3)&7)
+#define blkif_last_sect(_fas)  ((_fas)&7)
+
+#ifdef CONFIG_XEN_BLKDEV_GRANT
+#define blkif_gref_from_fas(_fas) ((_fas)>>16)
+#endif
+
+typedef struct blkif_response {
+    unsigned long   id;              /* copied from request */
+    u8              operation;       /* copied from request */
+    s16             status;          /* BLKIF_RSP_???       */
+} blkif_response_t;
+
+#define BLKIF_RSP_ERROR  -1 /* non-specific 'error' */
+#define BLKIF_RSP_OKAY    0 /* non-specific 'okay'  */
+
+/*
+ * Generate blkif ring structures and types.
+ */
+
+DEFINE_RING_TYPES(blkif, blkif_request_t, blkif_response_t);
+
+/*
+ * BLKIF_OP_PROBE:
+ * The request format for a probe request is constrained as follows:
+ *  @operation   == BLKIF_OP_PROBE
+ *  @nr_segments == size of probe buffer in pages
+ *  @device      == unused (zero)
+ *  @id          == any value (echoed in response message)
+ *  @sector_number == unused (zero)
+ *  @frame_and_sects == list of page-sized buffers.
+ *                       (i.e., @first_sect == 0, @last_sect == 7).
+ * 
+ * The response is a list of vdisk_t elements copied into the out-of-band
+ * probe buffer. On success the response status field contains the number
+ * of vdisk_t elements.
+ */
+
+#define VDISK_CDROM        0x1
+#define VDISK_REMOVABLE    0x2
+#define VDISK_READONLY     0x4
+
+typedef struct vdisk {
+    blkif_sector_t capacity;     /* Size in terms of 512-byte sectors.   */
+    blkif_vdev_t   device;       /* Device number (opaque 16 bit value). */
+    u16            info;         /* Device type and flags (VDISK_*).     */
+    u16            sector_size;  /* Minimum alignment for requests.      */
+} vdisk_t; /* 16 bytes */
+
+#endif /* __XEN_PUBLIC_IO_BLKIF_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/domain_controller.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/domain_controller.h
@@ -0,0 +1,783 @@
+/******************************************************************************
+ * domain_controller.h
+ * 
+ * Interface to server controller (e.g., 'xend'). This header file defines the 
+ * interface that is shared with guest OSes.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_IO_DOMAIN_CONTROLLER_H__
+#define __XEN_PUBLIC_IO_DOMAIN_CONTROLLER_H__
+
+#include "ring.h"
+
+/*
+ * CONTROLLER MESSAGING INTERFACE.
+ */
+
+typedef struct control_msg {
+    u8 type;     /*  0: echoed in response */
+    u8 subtype;  /*  1: echoed in response */
+    u8 id;       /*  2: echoed in response */
+    u8 length;   /*  3: number of bytes in 'msg' */
+    u8 msg[60];  /*  4: type-specific message data */
+} control_msg_t; /* 64 bytes */
+
+/* These are used by the control message deferred ring. */
+#define CONTROL_RING_SIZE 8
+typedef u32 CONTROL_RING_IDX;
+#define MASK_CONTROL_IDX(_i) ((_i)&(CONTROL_RING_SIZE-1))
+
+/*
+ * Generate control ring structures and types.
+ *
+ * CONTROL_RING_MEM is currently an 8-slot ring of control_msg_t structs and
+ * two 32-bit counters:  (64 * 8) + (2 * 4) = 520
+ */
+#define CONTROL_RING_MEM 520
+DEFINE_RING_TYPES(ctrl, control_msg_t, control_msg_t);
+
+typedef struct control_if {
+    union {
+        ctrl_sring_t tx_ring; /* guest -> controller  */
+        char __x[CONTROL_RING_MEM];
+    };
+    union {
+        ctrl_sring_t rx_ring; /* controller -> guest  */
+        char __y[CONTROL_RING_MEM];
+    };
+} control_if_t;
+
+/*
+ * Top-level command types.
+ */
+#define CMSG_CONSOLE        0  /* Console                 */
+#define CMSG_BLKIF_BE       1  /* Block-device backend    */
+#define CMSG_BLKIF_FE       2  /* Block-device frontend   */
+#define CMSG_NETIF_BE       3  /* Network-device backend  */
+#define CMSG_NETIF_FE       4  /* Network-device frontend */
+#define CMSG_SHUTDOWN       6  /* Shutdown messages       */
+#define CMSG_MEM_REQUEST    7  /* Memory reservation reqs */
+#define CMSG_USBIF_BE       8  /* USB controller backend  */
+#define CMSG_USBIF_FE       9  /* USB controller frontend */
+#define CMSG_VCPU_HOTPLUG  10  /* Hotplug VCPU messages   */
+#define CMSG_DEBUG         11  /* PDB backend             */
+
+/******************************************************************************
+ * CONSOLE DEFINITIONS
+ */
+
+/*
+ * Subtypes for console messages.
+ */
+#define CMSG_CONSOLE_DATA       0
+
+
+/******************************************************************************
+ * BLOCK-INTERFACE FRONTEND DEFINITIONS
+ */
+
+/* Messages from domain controller to guest. */
+#define CMSG_BLKIF_FE_INTERFACE_STATUS           0
+
+/* Messages from guest to domain controller. */
+#define CMSG_BLKIF_FE_DRIVER_STATUS             32
+#define CMSG_BLKIF_FE_INTERFACE_CONNECT         33
+#define CMSG_BLKIF_FE_INTERFACE_DISCONNECT      34
+#define CMSG_BLKIF_FE_INTERFACE_QUERY           35
+
+#ifndef blkif_vdev_t
+#define blkif_vdev_t   u16
+#endif
+#define blkif_pdev_t   u32
+
+/*
+ * CMSG_BLKIF_FE_INTERFACE_STATUS:
+ *  Notify a guest about a status change on one of its block interfaces.
+ *  If the interface is CLOSED or DOWN then the interface is disconnected:
+ *   1. The shared-memory frame is available for reuse.
+ *   2. Any unacknowledged messages pending on the interface were dropped.
+ */
+#define BLKIF_INTERFACE_STATUS_CLOSED       0 /* Interface doesn't exist.    */
+#define BLKIF_INTERFACE_STATUS_DISCONNECTED 1 /* Exists but is disconnected. */
+#define BLKIF_INTERFACE_STATUS_CONNECTED    2 /* Exists and is connected.    */
+#define BLKIF_INTERFACE_STATUS_CHANGED      3 /* A device has been added or removed. */
+typedef struct blkif_fe_interface_status {
+    u32 handle;
+    u32 status;
+    u16 evtchn;    /* (only if status == BLKIF_INTERFACE_STATUS_CONNECTED). */
+    domid_t domid; /* status != BLKIF_INTERFACE_STATUS_CLOSED */
+} blkif_fe_interface_status_t;
+
+/*
+ * CMSG_BLKIF_FE_DRIVER_STATUS:
+ *  Notify the domain controller that the front-end driver is DOWN or UP.
+ *  When the driver goes DOWN then the controller will send no more
+ *  status-change notifications.
+ *  If the driver goes DOWN while interfaces are still UP, the domain
+ *  will automatically take the interfaces DOWN.
+ * 
+ *  NB. The controller should not send an INTERFACE_STATUS_CHANGED message
+ *  for interfaces that are active when it receives an UP notification. We
+ *  expect that the frontend driver will query those interfaces itself.
+ */
+#define BLKIF_DRIVER_STATUS_DOWN   0
+#define BLKIF_DRIVER_STATUS_UP     1
+typedef struct blkif_fe_driver_status {
+    /* IN */
+    u32 status;        /* BLKIF_DRIVER_STATUS_??? */
+    /* OUT */
+    /* Driver should query interfaces [0..max_handle]. */
+    u32 max_handle;
+} blkif_fe_driver_status_t;
+
+/*
+ * CMSG_BLKIF_FE_INTERFACE_CONNECT:
+ *  If successful, the domain controller will acknowledge with a
+ *  STATUS_CONNECTED message.
+ */
+typedef struct blkif_fe_interface_connect {
+    u32      handle;
+    memory_t shmem_frame;
+    int      shmem_ref;
+} blkif_fe_interface_connect_t;
+
+/*
+ * CMSG_BLKIF_FE_INTERFACE_DISCONNECT:
+ *  If successful, the domain controller will acknowledge with a
+ *  STATUS_DISCONNECTED message.
+ */
+typedef struct blkif_fe_interface_disconnect {
+    u32 handle;
+} blkif_fe_interface_disconnect_t;
+
+/*
+ * CMSG_BLKIF_FE_INTERFACE_QUERY:
+ */
+typedef struct blkif_fe_interface_query {
+    /* IN */
+    u32 handle;
+    /* OUT */
+    u32 status;
+    u16 evtchn;    /* (only if status == BLKIF_INTERFACE_STATUS_CONNECTED). */
+    domid_t domid; /* status != BLKIF_INTERFACE_STATUS_CLOSED */
+} blkif_fe_interface_query_t;
+
+
+/******************************************************************************
+ * BLOCK-INTERFACE BACKEND DEFINITIONS
+ */
+
+/* Messages from domain controller. */
+#define CMSG_BLKIF_BE_CREATE      0  /* Create a new block-device interface. */
+#define CMSG_BLKIF_BE_DESTROY     1  /* Destroy a block-device interface.    */
+#define CMSG_BLKIF_BE_CONNECT     2  /* Connect i/f to remote driver.        */
+#define CMSG_BLKIF_BE_DISCONNECT  3  /* Disconnect i/f from remote driver.   */
+#define CMSG_BLKIF_BE_VBD_CREATE  4  /* Create a new VBD for an interface.   */
+#define CMSG_BLKIF_BE_VBD_DESTROY 5  /* Delete a VBD from an interface.      */
+
+/* Messages to domain controller. */
+#define CMSG_BLKIF_BE_DRIVER_STATUS 32
+
+/*
+ * Message request/response definitions for block-device messages.
+ */
+
+/* Non-specific 'okay' return. */
+#define BLKIF_BE_STATUS_OKAY                0
+/* Non-specific 'error' return. */
+#define BLKIF_BE_STATUS_ERROR               1
+/* The following are specific error returns. */
+#define BLKIF_BE_STATUS_INTERFACE_EXISTS    2
+#define BLKIF_BE_STATUS_INTERFACE_NOT_FOUND 3
+#define BLKIF_BE_STATUS_INTERFACE_CONNECTED 4
+#define BLKIF_BE_STATUS_VBD_EXISTS          5
+#define BLKIF_BE_STATUS_VBD_NOT_FOUND       6
+#define BLKIF_BE_STATUS_OUT_OF_MEMORY       7
+#define BLKIF_BE_STATUS_PHYSDEV_NOT_FOUND   8
+#define BLKIF_BE_STATUS_MAPPING_ERROR       9
+
+/* This macro can be used to create an array of descriptive error strings. */
+#define BLKIF_BE_STATUS_ERRORS {    \
+    "Okay",                         \
+    "Non-specific error",           \
+    "Interface already exists",     \
+    "Interface not found",          \
+    "Interface is still connected", \
+    "VBD already exists",           \
+    "VBD not found",                \
+    "Out of memory",                \
+    "Extent not found for VBD",     \
+    "Could not map domain memory" }
+
+/*
+ * CMSG_BLKIF_BE_CREATE:
+ *  When the driver sends a successful response then the interface is fully
+ *  created. The controller will send a DOWN notification to the front-end
+ *  driver.
+ */
+typedef struct blkif_be_create { 
+    /* IN */
+    domid_t    domid;         /* Domain attached to new interface.   */
+    u32        blkif_handle;  /* Domain-specific interface handle.   */
+    /* OUT */
+    u32        status;
+} blkif_be_create_t;
+
+/*
+ * CMSG_BLKIF_BE_DESTROY:
+ *  When the driver sends a successful response then the interface is fully
+ *  torn down. The controller will send a DESTROYED notification to the
+ *  front-end driver.
+ */
+typedef struct blkif_be_destroy { 
+    /* IN */
+    domid_t    domid;         /* Identify interface to be destroyed. */
+    u32        blkif_handle;  /* ...ditto...                         */
+    /* OUT */
+    u32        status;
+} blkif_be_destroy_t;
+
+/*
+ * CMSG_BLKIF_BE_CONNECT:
+ *  When the driver sends a successful response then the interface is fully
+ *  connected. The controller will send a CONNECTED notification to the
+ *  front-end driver.
+ */
+typedef struct blkif_be_connect {
+    /* IN */
+    domid_t    domid;         /* Domain attached to new interface.   */
+    u32        blkif_handle;  /* Domain-specific interface handle.   */
+    memory_t   shmem_frame;   /* Page cont. shared comms window.     */
+    int        shmem_ref;     /* Grant table reference.              */
+    u32        evtchn;        /* Event channel for notifications.    */
+    /* OUT */
+    u32        status;
+} blkif_be_connect_t;
+
+/*
+ * CMSG_BLKIF_BE_DISCONNECT:
+ *  When the driver sends a successful response then the interface is fully
+ *  disconnected. The controller will send a DOWN notification to the front-end
+ *  driver.
+ */
+typedef struct blkif_be_disconnect { 
+    /* IN */
+    domid_t    domid;         /* Domain attached to new interface.   */
+    u32        blkif_handle;  /* Domain-specific interface handle.   */
+    /* OUT */
+    u32        status;
+} blkif_be_disconnect_t;
+
+/* CMSG_BLKIF_BE_VBD_CREATE */
+typedef struct blkif_be_vbd_create {
+    /* IN */
+    domid_t    domid;         /* Identify blkdev interface.          */
+    u32        blkif_handle;  /* ...ditto...                         */
+    blkif_pdev_t pdevice;
+    u32        dev_handle;    /* Extended device id field.           */
+    blkif_vdev_t vdevice;     /* Interface-specific id for this VBD. */
+    u16        readonly;      /* Non-zero -> VBD isn't writable.     */
+    /* OUT */
+    u32        status;
+} blkif_be_vbd_create_t;
+
+/* CMSG_BLKIF_BE_VBD_DESTROY */
+typedef struct blkif_be_vbd_destroy {
+    /* IN */
+    domid_t    domid;         /* Identify blkdev interface.          */
+    u32        blkif_handle;  /* ...ditto...                         */
+    blkif_vdev_t vdevice;     /* Interface-specific id of the VBD.   */
+    /* OUT */
+    u32        status;
+} blkif_be_vbd_destroy_t;
+
+/*
+ * CMSG_BLKIF_BE_DRIVER_STATUS:
+ *  Notify the domain controller that the back-end driver is DOWN or UP.
+ *  If the driver goes DOWN while interfaces are still UP, the controller
+ *  will automatically send DOWN notifications.
+ */
+typedef struct blkif_be_driver_status {
+    u32        status;        /* BLKIF_DRIVER_STATUS_??? */
+} blkif_be_driver_status_t;
+
+
+/******************************************************************************
+ * NETWORK-INTERFACE FRONTEND DEFINITIONS
+ */
+
+/* Messages from domain controller to guest. */
+#define CMSG_NETIF_FE_INTERFACE_STATUS   0
+
+/* Messages from guest to domain controller. */
+#define CMSG_NETIF_FE_DRIVER_STATUS             32
+#define CMSG_NETIF_FE_INTERFACE_CONNECT         33
+#define CMSG_NETIF_FE_INTERFACE_DISCONNECT      34
+#define CMSG_NETIF_FE_INTERFACE_QUERY           35
+
+/*
+ * CMSG_NETIF_FE_INTERFACE_STATUS:
+ *  Notify a guest about a status change on one of its network interfaces.
+ *  If the interface is CLOSED or DOWN then the interface is disconnected:
+ *   1. The shared-memory frame is available for reuse.
+ *   2. Any unacknowledged messages pending on the interface were dropped.
+ */
+#define NETIF_INTERFACE_STATUS_CLOSED       0 /* Interface doesn't exist.    */
+#define NETIF_INTERFACE_STATUS_DISCONNECTED 1 /* Exists but is disconnected. */
+#define NETIF_INTERFACE_STATUS_CONNECTED    2 /* Exists and is connected.    */
+#define NETIF_INTERFACE_STATUS_CHANGED      3 /* A device has been added or removed. */
+typedef struct netif_fe_interface_status {
+    u32        handle;
+    u32        status;
+    u16        evtchn; /* status == NETIF_INTERFACE_STATUS_CONNECTED */
+    u8         mac[6]; /* status == NETIF_INTERFACE_STATUS_CONNECTED */
+    domid_t    domid;  /* status != NETIF_INTERFACE_STATUS_CLOSED */
+} netif_fe_interface_status_t;
+
+/*
+ * CMSG_NETIF_FE_DRIVER_STATUS:
+ *  Notify the domain controller that the front-end driver is DOWN or UP.
+ *  When the driver goes DOWN then the controller will send no more
+ *  status-change notifications.
+ *  If the driver goes DOWN while interfaces are still UP, the domain
+ *  will automatically take the interfaces DOWN.
+ * 
+ *  NB. The controller should not send an INTERFACE_STATUS message
+ *  for interfaces that are active when it receives an UP notification. We
+ *  expect that the frontend driver will query those interfaces itself.
+ */
+#define NETIF_DRIVER_STATUS_DOWN   0
+#define NETIF_DRIVER_STATUS_UP     1
+typedef struct netif_fe_driver_status {
+    /* IN */
+    u32        status;        /* NETIF_DRIVER_STATUS_??? */
+    /* OUT */
+    /* Driver should query interfaces [0..max_handle]. */
+    u32        max_handle;
+} netif_fe_driver_status_t;
+
+/*
+ * CMSG_NETIF_FE_INTERFACE_CONNECT:
+ *  If successful, the domain controller will acknowledge with a
+ *  STATUS_CONNECTED message.
+ */
+typedef struct netif_fe_interface_connect {
+    u32        handle;
+    memory_t   tx_shmem_frame;
+    memory_t   rx_shmem_frame;
+} netif_fe_interface_connect_t;
+
+/*
+ * CMSG_NETIF_FE_INTERFACE_DISCONNECT:
+ *  If successful, the domain controller will acknowledge with a
+ *  STATUS_DISCONNECTED message.
+ */
+typedef struct netif_fe_interface_disconnect {
+    u32        handle;
+} netif_fe_interface_disconnect_t;
+
+/*
+ * CMSG_NETIF_FE_INTERFACE_QUERY:
+ */
+typedef struct netif_fe_interface_query {
+    /* IN */
+    u32        handle;
+    /* OUT */
+    u32        status;
+    u16        evtchn; /* status == NETIF_INTERFACE_STATUS_CONNECTED */
+    u8         mac[6]; /* status == NETIF_INTERFACE_STATUS_CONNECTED */
+    domid_t    domid;  /* status != NETIF_INTERFACE_STATUS_CLOSED */
+} netif_fe_interface_query_t;
+
+
+/******************************************************************************
+ * NETWORK-INTERFACE BACKEND DEFINITIONS
+ */
+
+/* Messages from domain controller. */
+#define CMSG_NETIF_BE_CREATE      0  /* Create a new net-device interface. */
+#define CMSG_NETIF_BE_DESTROY     1  /* Destroy a net-device interface.    */
+#define CMSG_NETIF_BE_CONNECT     2  /* Connect i/f to remote driver.        */
+#define CMSG_NETIF_BE_DISCONNECT  3  /* Disconnect i/f from remote driver.   */
+#define CMSG_NETIF_BE_CREDITLIMIT 4  /* Limit i/f to a given credit limit. */
+
+/* Messages to domain controller. */
+#define CMSG_NETIF_BE_DRIVER_STATUS 32
+
+/*
+ * Message request/response definitions for net-device messages.
+ */
+
+/* Non-specific 'okay' return. */
+#define NETIF_BE_STATUS_OKAY                0
+/* Non-specific 'error' return. */
+#define NETIF_BE_STATUS_ERROR               1
+/* The following are specific error returns. */
+#define NETIF_BE_STATUS_INTERFACE_EXISTS    2
+#define NETIF_BE_STATUS_INTERFACE_NOT_FOUND 3
+#define NETIF_BE_STATUS_INTERFACE_CONNECTED 4
+#define NETIF_BE_STATUS_OUT_OF_MEMORY       5
+#define NETIF_BE_STATUS_MAPPING_ERROR       6
+
+/* This macro can be used to create an array of descriptive error strings. */
+#define NETIF_BE_STATUS_ERRORS {    \
+    "Okay",                         \
+    "Non-specific error",           \
+    "Interface already exists",     \
+    "Interface not found",          \
+    "Interface is still connected", \
+    "Out of memory",                \
+    "Could not map domain memory" }
+
+/*
+ * CMSG_NETIF_BE_CREATE:
+ *  When the driver sends a successful response then the interface is fully
+ *  created. The controller will send a DOWN notification to the front-end
+ *  driver.
+ */
+typedef struct netif_be_create { 
+    /* IN */
+    domid_t    domid;         /* Domain attached to new interface.   */
+    u32        netif_handle;  /* Domain-specific interface handle.   */
+    u8         mac[6];
+    u8         be_mac[6];
+    /* OUT */
+    u32        status;
+} netif_be_create_t;
+
+/*
+ * CMSG_NETIF_BE_DESTROY:
+ *  When the driver sends a successful response then the interface is fully
+ *  torn down. The controller will send a DESTROYED notification to the
+ *  front-end driver.
+ */
+typedef struct netif_be_destroy { 
+    /* IN */
+    domid_t    domid;         /* Identify interface to be destroyed. */
+    u32        netif_handle;  /* ...ditto...                         */
+    /* OUT */
+    u32   status;
+} netif_be_destroy_t;
+
+/*
+ * CMSG_NETIF_BE_CREDITLIMIT:
+ *  Limit a virtual interface to "credit_bytes" bytes per "period_usec" 
+ *  microseconds.  
+ */
+typedef struct netif_be_creditlimit { 
+    /* IN */
+    domid_t    domid;          /* Domain attached to new interface.   */
+    u32        netif_handle;   /* Domain-specific interface handle.   */
+    u32        credit_bytes;   /* Vifs credit of bytes per period.    */
+    u32        period_usec;    /* Credit replenishment period.        */
+    /* OUT */
+    u32        status;
+} netif_be_creditlimit_t;
+
+/*
+ * CMSG_NETIF_BE_CONNECT:
+ *  When the driver sends a successful response then the interface is fully
+ *  connected. The controller will send a CONNECTED notification to the
+ *  front-end driver.
+ */
+typedef struct netif_be_connect { 
+    /* IN */
+    domid_t    domid;          /* Domain attached to new interface.   */
+    u32        netif_handle;   /* Domain-specific interface handle.   */
+    memory_t   tx_shmem_frame; /* Page cont. tx shared comms window.  */
+    memory_t   rx_shmem_frame; /* Page cont. rx shared comms window.  */
+    u16        evtchn;         /* Event channel for notifications.    */
+    /* OUT */
+    u32        status;
+} netif_be_connect_t;
+
+/*
+ * CMSG_NETIF_BE_DISCONNECT:
+ *  When the driver sends a successful response then the interface is fully
+ *  disconnected. The controller will send a DOWN notification to the front-end
+ *  driver.
+ */
+typedef struct netif_be_disconnect { 
+    /* IN */
+    domid_t    domid;         /* Domain attached to new interface.   */
+    u32        netif_handle;  /* Domain-specific interface handle.   */
+    /* OUT */
+    u32        status;
+} netif_be_disconnect_t;
+
+/*
+ * CMSG_NETIF_BE_DRIVER_STATUS:
+ *  Notify the domain controller that the back-end driver is DOWN or UP.
+ *  If the driver goes DOWN while interfaces are still UP, the controller
+ *  will automatically send DOWN notifications.
+ */
+typedef struct netif_be_driver_status {
+    u32        status;        /* NETIF_DRIVER_STATUS_??? */
+} netif_be_driver_status_t;
+
+
+
+/******************************************************************************
+ * USB-INTERFACE FRONTEND DEFINITIONS
+ */
+
+/* Messages from domain controller to guest. */
+#define CMSG_USBIF_FE_INTERFACE_STATUS_CHANGED   0
+
+/* Messages from guest to domain controller. */
+#define CMSG_USBIF_FE_DRIVER_STATUS_CHANGED     32
+#define CMSG_USBIF_FE_INTERFACE_CONNECT         33
+#define CMSG_USBIF_FE_INTERFACE_DISCONNECT      34
+/*
+ * CMSG_USBIF_FE_INTERFACE_STATUS_CHANGED:
+ *  Notify a guest about a status change on one of its USB interfaces.
+ *  If the new status is DESTROYED or DISCONNECTED then:
+ *   1. The shared-memory frame is available for reuse.
+ *   2. Any unacknowledged messages pending on the interface were dropped.
+ */
+#define USBIF_INTERFACE_STATUS_DESTROYED    0 /* Interface doesn't exist.    */
+#define USBIF_INTERFACE_STATUS_DISCONNECTED 1 /* Exists but is disconnected. */
+#define USBIF_INTERFACE_STATUS_CONNECTED    2 /* Exists and is connected.    */
+typedef struct usbif_fe_interface_status_changed {
+    u32 status;    /* USBIF_INTERFACE_STATUS_??? */
+    u16 evtchn;    /* (only if status == USBIF_INTERFACE_STATUS_CONNECTED). */
+    domid_t domid; /* status != USBIF_INTERFACE_STATUS_DESTROYED */
+    u32 bandwidth;
+    u32 num_ports;
+} usbif_fe_interface_status_changed_t;
+
+/*
+ * CMSG_USBIF_FE_DRIVER_STATUS_CHANGED:
+ *  Notify the domain controller that the front-end driver is DOWN or UP.
+ *  When the driver goes DOWN then the controller will send no more
+ *  status-change notifications.
+ *  If the driver goes DOWN while interfaces are still UP, the domain
+ *  will automatically take the interfaces DOWN.
+ * 
+ *  NB. The controller should not send an INTERFACE_STATUS_CHANGED message
+ *  for interfaces that are active when it receives an UP notification. We
+ *  expect that the frontend driver will query those interfaces itself.
+ */
+#define USBIF_DRIVER_STATUS_DOWN   0
+#define USBIF_DRIVER_STATUS_UP     1
+typedef struct usbif_fe_driver_status_changed {
+    /* IN */
+    u32 status;        /* USBIF_DRIVER_STATUS_??? */
+} usbif_fe_driver_status_changed_t;
+
+/*
+ * CMSG_USBIF_FE_INTERFACE_CONNECT:
+ *  If successful, the domain controller will acknowledge with a
+ *  STATUS_CONNECTED message.
+ */
+typedef struct usbif_fe_interface_connect {
+    memory_t shmem_frame;
+} usbif_fe_interface_connect_t;
+
+/*
+ * CMSG_USBIF_FE_INTERFACE_DISCONNECT:
+ *  If successful, the domain controller will acknowledge with a
+ *  STATUS_DISCONNECTED message.
+ */
+typedef struct usbif_fe_interface_disconnect {
+    int dummy; /* make struct non-empty */
+} usbif_fe_interface_disconnect_t;
+
+
+/******************************************************************************
+ * USB-INTERFACE BACKEND DEFINITIONS
+ */
+
+/* Messages from domain controller. */
+#define CMSG_USBIF_BE_CREATE       0  /* Create a new USB interface.         */
+#define CMSG_USBIF_BE_DESTROY      1  /* Destroy a USB interface.            */
+#define CMSG_USBIF_BE_CONNECT      2  /* Connect i/f to remote driver.       */
+#define CMSG_USBIF_BE_DISCONNECT   3  /* Disconnect i/f from remote driver.  */
+#define CMSG_USBIF_BE_CLAIM_PORT   4  /* Claim host port for a domain.       */
+#define CMSG_USBIF_BE_RELEASE_PORT 5  /* Release host port.                  */
+/* Messages to domain controller. */
+#define CMSG_USBIF_BE_DRIVER_STATUS_CHANGED 32
+
+/* Non-specific 'okay' return. */
+#define USBIF_BE_STATUS_OKAY                0
+/* Non-specific 'error' return. */
+#define USBIF_BE_STATUS_ERROR               1
+/* The following are specific error returns. */
+#define USBIF_BE_STATUS_INTERFACE_EXISTS    2
+#define USBIF_BE_STATUS_INTERFACE_NOT_FOUND 3
+#define USBIF_BE_STATUS_INTERFACE_CONNECTED 4
+#define USBIF_BE_STATUS_OUT_OF_MEMORY       7
+#define USBIF_BE_STATUS_MAPPING_ERROR       9
+
+/* Error strings indexed by USBIF_BE_STATUS_*; 5, 6 and 8 are unassigned. */
+#define USBIF_BE_STATUS_ERRORS {    \
+    "Okay",                         \
+    "Non-specific error",           \
+    "Interface already exists",     \
+    "Interface not found",          \
+    "Interface is still connected", \
+    "Unknown error (5)", "Unknown error (6)", "Out of memory", \
+    "Unknown error (8)", "Could not map domain memory" }
+
+/*
+ * CMSG_USBIF_BE_CREATE:
+ *  When the driver sends a successful response then the interface is fully
+ *  created. The controller will send a DOWN notification to the front-end
+ *  driver.
+ */
+typedef struct usbif_be_create { 
+    /* IN */
+    domid_t    domid;         /* Domain attached to new interface.   */
+    /* OUT */
+    u32        status;
+} usbif_be_create_t;
+
+/*
+ * CMSG_USBIF_BE_DESTROY:
+ *  When the driver sends a successful response then the interface is fully
+ *  torn down. The controller will send a DESTROYED notification to the
+ *  front-end driver.
+ */
+typedef struct usbif_be_destroy { 
+    /* IN */
+    domid_t    domid;         /* Identify interface to be destroyed. */
+    /* OUT */
+    u32        status;
+} usbif_be_destroy_t;
+
+/*
+ * CMSG_USBIF_BE_CONNECT:
+ *  When the driver sends a successful response then the interface is fully
+ *  connected. The controller will send a CONNECTED notification to the
+ *  front-end driver.
+ */
+typedef struct usbif_be_connect { 
+    /* IN */
+    domid_t    domid;         /* Domain attached to new interface.   */
+    memory_t   shmem_frame;   /* Page cont. shared comms window.     */
+    u32        evtchn;        /* Event channel for notifications.    */
+    u32        bandwidth;     /* Bandwidth allocated for isoch / int - us
+                               * per 1ms frame (ie between 0 and 900 or 800
+                               * depending on USB version). */
+    /* OUT */
+    u32        status;
+} usbif_be_connect_t;
+
+/*
+ * CMSG_USBIF_BE_DISCONNECT:
+ *  When the driver sends a successful response then the interface is fully
+ *  disconnected. The controller will send a DOWN notification to the front-end
+ *  driver.
+ */
+typedef struct usbif_be_disconnect { 
+    /* IN */
+    domid_t    domid;         /* Domain attached to new interface.   */
+    /* OUT */
+    u32        status;
+} usbif_be_disconnect_t;
+
+/*
+ * CMSG_USBIF_BE_DRIVER_STATUS_CHANGED:
+ *  Notify the domain controller that the back-end driver is DOWN or UP.
+ *  If the driver goes DOWN while interfaces are still UP, the controller
+ *  will automatically send DOWN notifications.
+ */
+typedef struct usbif_be_driver_status_changed {
+    u32        status;        /* USBIF_DRIVER_STATUS_??? */
+} usbif_be_driver_status_changed_t;
+
+#define USB_PATH_LEN 16
+
+/*
+ * CMSG_USBIF_BE_CLAIM_PORT:
+ * Instruct the backend driver to claim any device plugged into the specified
+ * host port and to allow the specified domain to control that port.
+ */
+typedef struct usbif_be_claim_port {
+    /* IN */
+    domid_t  domid;        /* which domain                 */
+    u32      usbif_port;   /* port on the virtual root hub */
+    u32      status;       /* status of operation; presumably OUT - confirm */
+    char path[USB_PATH_LEN]; /* Linux-style path for now - may need to be
+                    * converted to some OS-independent format at some stage. */
+} usbif_be_claim_port_t;
+
+/*
+ * CMSG_USBIF_BE_RELEASE_PORT: 
+ * Instruct the backend driver to release any device plugged into the specified
+ * host port.
+ */
+typedef struct usbif_be_release_port {
+    char     path[USB_PATH_LEN];
+} usbif_be_release_port_t;
+
+/******************************************************************************
+ * SHUTDOWN DEFINITIONS
+ */
+
+/*
+ * Subtypes for shutdown messages.
+ */
+#define CMSG_SHUTDOWN_POWEROFF  0   /* Clean shutdown (SHUTDOWN_poweroff).   */
+#define CMSG_SHUTDOWN_REBOOT    1   /* Clean shutdown (SHUTDOWN_reboot).     */
+#define CMSG_SHUTDOWN_SUSPEND   2   /* Create suspend info, then             */
+                                    /* SHUTDOWN_suspend.                     */
+#define CMSG_SHUTDOWN_SYSRQ     3
+
+typedef struct shutdown_sysrq {
+    char key;      /* sysrq key */
+} shutdown_sysrq_t;
+
+/******************************************************************************
+ * VCPU HOTPLUG CONTROLS
+ */
+
+/*
+ * Subtypes for vcpu hotplug messages.
+ */
+#define CMSG_VCPU_HOTPLUG_OFF   0   /* turn vcpu off */
+#define CMSG_VCPU_HOTPLUG_ON    1   /* turn vcpu on  */
+
+/*
+ * CMSG_VCPU_HOTPLUG:
+ *  Indicate which vcpu's state should change.
+ */
+typedef struct vcpu_hotplug {
+    u32 vcpu;         /* VCPU whose state will change */
+    u32 status;       /* Return code indicates success or failure. */
+} vcpu_hotplug_t;
+
+/******************************************************************************
+ * MEMORY CONTROLS
+ */
+
+#define CMSG_MEM_REQUEST_SET 0 /* Request a domain to set its mem footprint. */
+
+/*
+ * CMSG_MEM_REQUEST_SET:
+ *  Request that the domain change its memory reservation.
+ */
+typedef struct mem_request {
+    /* OUT */         /* NOTE(review): IN/OUT look reversed vs. other msgs. */
+    u32 target;       /* Target memory reservation in pages.       */
+    /* IN  */
+    u32 status;       /* Return code indicates success or failure. */
+} mem_request_t;
+
+
+/******************************************************************************
+ * PDB INTERFACE DEFINITIONS
+ */
+
+#define CMSG_DEBUG_CONNECTION_STATUS 0
+typedef struct pdb_Connection {
+#define PDB_CONNECTION_STATUS_UP   1
+#define PDB_CONNECTION_STATUS_DOWN 2
+    u32      status;
+    memory_t ring;       /* status: UP */
+    u32      evtchn;     /* status: UP */
+} pdb_connection_t, *pdb_connection_p;
+
+#endif /* __XEN_PUBLIC_IO_DOMAIN_CONTROLLER_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/ioreq.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/ioreq.h
@@ -0,0 +1,70 @@
+/*
+ * ioreq.h: I/O request definitions for device models
+ * Copyright (c) 2004, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+
+#ifndef _IOREQ_H_
+#define _IOREQ_H_
+
+#define IOREQ_READ      1
+#define IOREQ_WRITE     0
+
+#define STATE_INVALID           0
+#define STATE_IOREQ_READY       1
+#define STATE_IOREQ_INPROCESS   2
+#define STATE_IORESP_READY      3
+#define STATE_IORESP_HOOK       4
+
+/* The VMExit dispatcher should cooperate with the instruction decoder to
+   prepare this structure, then notify the service OS and the DM (device
+   model) by sending a virq. */
+typedef struct {
+    u64     addr;               /*  physical address            */
+    u64     size;               /*  size in bytes               */
+    u64     count;             /*  for rep prefixes            */
+    union {
+        u64     data;           /*  data                        */
+        void    *pdata;         /*  pointer to data             */
+    } u;
+    u8      state:4;           /*  presumably one of STATE_*   */
+    u8      pdata_valid:1;     /*  if 1, use pdata above       */
+    u8      dir:1;             /*  1=read, 0=write             */
+    u8      port_mm:1;         /*  0=portio, 1=mmio            */
+    u8      df:1;              /*  x86 direction flag? confirm */
+} ioreq_t;
+
+#define MAX_VECTOR    256
+#define BITS_PER_BYTE   8
+#define INTR_LEN        (MAX_VECTOR/(BITS_PER_BYTE * sizeof(u64)))
+
+typedef struct {
+    u64   pic_intr[INTR_LEN];
+    u64   pic_mask[INTR_LEN];
+    int     eport; /* Event channel port */
+} global_iodata_t;
+
+typedef struct {                       /* NOTE(review): INTR_LEN is sized   */
+    ioreq_t         vp_ioreq;          /* in u64 words, but vp_intr is an   */
+    unsigned long   vp_intr[INTR_LEN]; /* unsigned long array - confirm.    */
+} vcpu_iodata_t;
+
+typedef struct {
+    global_iodata_t     sp_global;
+    vcpu_iodata_t       vcpu_iodata[1];
+} shared_iopage_t;
+
+#endif /* _IOREQ_H_ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/netif.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/netif.h
@@ -0,0 +1,108 @@
+/******************************************************************************
+ * netif.h
+ * 
+ * Unified network-device I/O interface for Xen guest OSes.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __XEN_PUBLIC_IO_NETIF_H__
+#define __XEN_PUBLIC_IO_NETIF_H__
+
+typedef struct netif_tx_request {
+    memory_t addr;   /* Machine address of packet.  */
+    u16      csum_blank:1; /* Proto csum field blank?   */
+    u16      id:15;  /* Echoed in response message. */
+    u16      size;   /* Packet size in bytes.       */
+} netif_tx_request_t;
+
+typedef struct netif_tx_response {
+    u16      id;
+    s8       status;
+} netif_tx_response_t;
+
+typedef struct {
+    u16       id;    /* Echoed in response message.        */
+#ifdef CONFIG_XEN_NETDEV_GRANT_RX
+    grant_ref_t gref;  /* 2: Reference to incoming granted frame */
+#endif
+} netif_rx_request_t;
+
+typedef struct {
+#ifdef CONFIG_XEN_NETDEV_GRANT_TX /* NOTE(review): RX path, _GRANT_RX meant? */
+    u32      addr;   /*  0: Offset in page of start of received packet  */
+#else
+    memory_t addr;   /* Machine address of packet.              */
+#endif
+    u16      csum_valid:1; /* Protocol checksum is validated?       */
+    u16      id:15;
+    s16      status; /* -ve: NETIF_RSP_* ; +ve: Rx'ed pkt size. */
+} netif_rx_response_t;
+
+/*
+ * We use a special capitalised type name because it is _essential_ that all 
+ * arithmetic on indexes is done on an integer type of the correct size.
+ */
+typedef u32 NETIF_RING_IDX;
+
+/*
+ * Ring indexes are 'free running'. That is, they are not stored modulo the
+ * size of the ring buffer. The following macros convert a free-running counter
+ * into a value that can directly index a ring-buffer array.
+ */
+#define MASK_NETIF_RX_IDX(_i) ((_i)&(NETIF_RX_RING_SIZE-1))
+#define MASK_NETIF_TX_IDX(_i) ((_i)&(NETIF_TX_RING_SIZE-1))
+
+#ifdef __x86_64__
+/*
+ * This restriction can be lifted when we move netfront/netback to use
+ * grant tables. This will remove memory_t fields from the above structures
+ * and thus relax natural alignment restrictions.
+ */
+#define NETIF_TX_RING_SIZE 128
+#define NETIF_RX_RING_SIZE 128
+#else
+#define NETIF_TX_RING_SIZE 256
+#define NETIF_RX_RING_SIZE 256
+#endif
+
+/* This structure must fit in a memory page. */
+typedef struct netif_tx_interface {
+    /*
+     * Frontend places packets into ring at req_prod.
+     * Frontend receives event when resp_prod passes event.
+     * 'req_cons' is a shadow of the backend's request consumer -- the frontend
+     * may use it to determine if all queued packets have been seen by the
+     * backend.
+     */
+    NETIF_RING_IDX req_prod;
+    NETIF_RING_IDX req_cons;
+    NETIF_RING_IDX resp_prod;
+    NETIF_RING_IDX event;
+    union {
+        netif_tx_request_t  req;
+        netif_tx_response_t resp;
+    } ring[NETIF_TX_RING_SIZE];
+} netif_tx_interface_t;
+
+/* This structure must fit in a memory page. */
+typedef struct netif_rx_interface {
+    /*
+     * Frontend places empty buffers into ring at req_prod.
+     * Frontend receives event when resp_prod passes event.
+     */
+    NETIF_RING_IDX req_prod;
+    NETIF_RING_IDX resp_prod;
+    NETIF_RING_IDX event;
+    union {
+        netif_rx_request_t  req;
+        netif_rx_response_t resp;
+    } ring[NETIF_RX_RING_SIZE];
+} netif_rx_interface_t;
+
+/* Descriptor status values */
+#define NETIF_RSP_DROPPED         -2
+#define NETIF_RSP_ERROR           -1
+#define NETIF_RSP_OKAY             0
+
+#endif
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/ring.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/ring.h
@@ -0,0 +1,199 @@
+/*
+ * Shared producer-consumer ring macros.
+ * Tim Deegan and Andrew Warfield November 2004.
+ */ 
+
+#ifndef __XEN_PUBLIC_IO_RING_H__
+#define __XEN_PUBLIC_IO_RING_H__
+
+typedef unsigned int RING_IDX;
+
+/* Round a 32-bit unsigned constant down to the nearest power of two. */
+#define __RD2(_x)  (((_x) & 0x00000002) ? 0x2                  : ((_x) & 0x1))
+#define __RD4(_x)  (((_x) & 0x0000000c) ? __RD2((_x)>>2)<<2    : __RD2(_x))
+#define __RD8(_x)  (((_x) & 0x000000f0) ? __RD4((_x)>>4)<<4    : __RD4(_x))
+#define __RD16(_x) (((_x) & 0x0000ff00) ? __RD8((_x)>>8)<<8    : __RD8(_x))
+#define __RD32(_x) (((_x) & 0xffff0000) ? __RD16((_x)>>16)<<16 : __RD16(_x))
+
+/*
+ * Calculate size of a shared ring, given the total available space for the
+ * ring and indexes (_sz), and the name tag of the request/response structure.
+ * A ring contains as many entries as will fit, rounded down to the nearest 
+ * power of two (so we can mask with (size-1) to loop around).
+ */
+#define __RING_SIZE(_s, _sz) \
+    (__RD32(((_sz) - 2*sizeof(RING_IDX)) / sizeof((_s)->ring[0])))
+
+/*
+ *  Macros to make the correct C datatypes for a new kind of ring.
+ * 
+ *  To make a new ring datatype, you need to have two message structures,
+ *  let's say request_t, and response_t already defined.
+ *
+ *  In a header where you want the ring datatype declared, you then do:
+ *
+ *     DEFINE_RING_TYPES(mytag, request_t, response_t);
+ *
+ *  These expand out to give you a set of types, as you can see below.
+ *  The most important of these are:
+ *  
+ *     mytag_sring_t      - The shared ring.
+ *     mytag_front_ring_t - The 'front' half of the ring.
+ *     mytag_back_ring_t  - The 'back' half of the ring.
+ *
+ *  To initialize a ring in your code you need to know the location and size
+ *  of the shared memory area (PAGE_SIZE, for instance). To initialise
+ *  the front half:
+ *
+ *      mytag_front_ring_t front_ring;
+ *
+ *      SHARED_RING_INIT((mytag_sring_t *)shared_page);
+ *      FRONT_RING_INIT(&front_ring, (mytag_sring_t *)shared_page, PAGE_SIZE);
+ *
+ *  Initializing the back follows similarly...
+ */
+         
+#define DEFINE_RING_TYPES(__name, __req_t, __rsp_t)                     \
+                                                                        \
+/* Shared ring entry */                                                 \
+union __name##_sring_entry {                                            \
+    __req_t req;                                                        \
+    __rsp_t rsp;                                                        \
+};                                                                      \
+                                                                        \
+/* Shared ring page */                                                  \
+struct __name##_sring {                                                 \
+    RING_IDX req_prod;                                                  \
+    RING_IDX rsp_prod;                                                  \
+    union __name##_sring_entry ring[1]; /* variable-length */           \
+};                                                                      \
+                                                                        \
+/* "Front" end's private variables */                                   \
+struct __name##_front_ring {                                            \
+    RING_IDX req_prod_pvt;                                              \
+    RING_IDX rsp_cons;                                                  \
+    unsigned int nr_ents;                                               \
+    struct __name##_sring *sring;                                       \
+};                                                                      \
+                                                                        \
+/* "Back" end's private variables */                                    \
+struct __name##_back_ring {                                             \
+    RING_IDX rsp_prod_pvt;                                              \
+    RING_IDX req_cons;                                                  \
+    unsigned int nr_ents;                                               \
+    struct __name##_sring *sring;                                       \
+};                                                                      \
+                                                                        \
+/* Syntactic sugar */                                                   \
+typedef struct __name##_sring __name##_sring_t;                         \
+typedef struct __name##_front_ring __name##_front_ring_t;               \
+typedef struct __name##_back_ring __name##_back_ring_t;
+
+/*
+ *   Macros for manipulating rings.  
+ * 
+ *   FRONT_RING_whatever works on the "front end" of a ring: here 
+ *   requests are pushed on to the ring and responses taken off it.
+ * 
+ *   BACK_RING_whatever works on the "back end" of a ring: here 
+ *   requests are taken off the ring and responses put on.
+ * 
+ *   N.B. these macros do NO INTERLOCKS OR FLOW CONTROL.  
+ *   This is OK in 1-for-1 request-response situations where the 
+ *   requestor (front end) never has more than RING_SIZE()-1
+ *   outstanding requests.
+ */
+
+/* Initialising empty rings */
+#define SHARED_RING_INIT(_s) do {                                       \
+    (_s)->req_prod = 0;                                                 \
+    (_s)->rsp_prod = 0;                                                 \
+} while(0)
+
+#define FRONT_RING_INIT(_r, _s, __size) do {                            \
+    (_r)->req_prod_pvt = 0;                                             \
+    (_r)->rsp_cons = 0;                                                 \
+    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
+    (_r)->sring = (_s);                                                 \
+} while (0)
+
+#define BACK_RING_INIT(_r, _s, __size) do {                             \
+    (_r)->rsp_prod_pvt = 0;                                             \
+    (_r)->req_cons = 0;                                                 \
+    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
+    (_r)->sring = (_s);                                                 \
+} while (0)
+
+/* Initialize to existing shared indexes -- for recovery */
+#define FRONT_RING_ATTACH(_r, _s, __size) do {                          \
+    (_r)->sring = (_s);                                                 \
+    (_r)->req_prod_pvt = (_s)->req_prod;                                \
+    (_r)->rsp_cons = (_s)->rsp_prod;                                    \
+    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
+} while (0)
+
+#define BACK_RING_ATTACH(_r, _s, __size) do {                           \
+    (_r)->sring = (_s);                                                 \
+    (_r)->rsp_prod_pvt = (_s)->rsp_prod;                                \
+    (_r)->req_cons = (_s)->req_prod;                                    \
+    (_r)->nr_ents = __RING_SIZE(_s, __size);                            \
+} while (0)
+
+/* How big is this ring? */
+#define RING_SIZE(_r)                                                   \
+    ((_r)->nr_ents)
+
+/* How many requests are in flight (produced, response not yet consumed)? */
+#define RING_PENDING_REQUESTS(_r)                                       \
+   ( ((_r)->req_prod_pvt - (_r)->rsp_cons) )
+   
+/* Test if the front ring is full, i.e. no empty slot is available.
+ * (This is only meaningful from the front. )
+ */
+#define RING_FULL(_r)                                                   \
+    (((_r)->req_prod_pvt - (_r)->rsp_cons) == RING_SIZE(_r))
+
+/* Test if there are outstanding messages to be processed on a ring. */
+#define RING_HAS_UNCONSUMED_RESPONSES(_r)                               \
+   ( (_r)->rsp_cons != (_r)->sring->rsp_prod )
+   
+#define RING_HAS_UNCONSUMED_REQUESTS(_r)                                \
+   ( ((_r)->req_cons != (_r)->sring->req_prod ) &&                      \
+     (((_r)->req_cons - (_r)->rsp_prod_pvt) !=                          \
+      RING_SIZE(_r)) )
+      
+/* Test if there are messages waiting to be pushed. */
+#define RING_HAS_UNPUSHED_REQUESTS(_r)                                  \
+   ( (_r)->req_prod_pvt != (_r)->sring->req_prod )
+   
+#define RING_HAS_UNPUSHED_RESPONSES(_r)                                 \
+   ( (_r)->rsp_prod_pvt != (_r)->sring->rsp_prod )
+
+/* Copy the private producer pointer into the shared ring so the other end 
+ * can see the updates we've made. */
+#define RING_PUSH_REQUESTS(_r) do {                                     \
+    wmb();                                                              \
+    (_r)->sring->req_prod = (_r)->req_prod_pvt;                         \
+} while (0)
+
+#define RING_PUSH_RESPONSES(_r) do {                                    \
+    wmb();                                                              \
+    (_r)->sring->rsp_prod = (_r)->rsp_prod_pvt;                         \
+} while (0)
+
+/* Direct access to individual ring elements, by index. */
+#define RING_GET_REQUEST(_r, _idx)                                      \
+ (&((_r)->sring->ring[                                                  \
+     ((_idx) & (RING_SIZE(_r) - 1))                                     \
+     ].req))
+
+#define RING_GET_RESPONSE(_r, _idx)                                     \
+ (&((_r)->sring->ring[                                                  \
+     ((_idx) & (RING_SIZE(_r) - 1))                                     \
+     ].rsp))   
+    
+/* Loop termination condition: Would the specified index overflow the ring? */
+#define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                           \
+    (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
+
+#endif /* __XEN_PUBLIC_IO_RING_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/usbif.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/usbif.h
@@ -0,0 +1,66 @@
+/******************************************************************************
+ * usbif.h
+ * 
+ * Unified USB I/O interface for Xen guest OSes.
+ * 
+ * Copyright (c) 2003-2004, Keir Fraser
+ */
+
+#ifndef __SHARED_USBIF_H__
+#define __SHARED_USBIF_H__
+
+#define usbif_vdev_t   u16
+#define usbif_sector_t u64
+
+#define USBIF_OP_IO      0 /* Request IO to a device */
+#define USBIF_OP_PROBE   1 /* Is there a device on this port? */
+#define USBIF_OP_RESET   2 /* Reset a virtual USB port.       */
+
+typedef struct {
+    unsigned long  id;           /* private guest value, echoed in resp  */
+    u8             operation;    /* USBIF_OP_???                         */
+    u8  __pad1;
+    usbif_vdev_t   port;         /* guest virtual USB port               */
+    unsigned long  devnum :7;    /* Device address, as seen by the guest.*/
+    unsigned long  endpoint :4;  /* Device endpoint.                     */
+    unsigned long  direction :1; /* Pipe direction.                      */
+    unsigned long  speed :1;     /* Pipe speed.                          */
+    unsigned long  pipe_type :2; /* Pipe type (iso, bulk, int, ctrl)     */
+    unsigned long  __pad2 :18;   /* NOTE(review): widths sum to 33 bits  */
+    unsigned long  transfer_buffer; /* Machine address */
+    unsigned long  length;          /* Buffer length */
+    unsigned long  transfer_flags;  /* For now just pass Linux transfer
+                                     * flags - this may change. */
+    unsigned char setup[8];         /* Embed setup packets directly. */
+    unsigned long  iso_schedule;    /* Machine address of transfer sched (iso
+                                     * only) */
+    unsigned long num_iso;        /* length of iso schedule */
+    unsigned long timeout;        /* timeout in ms */
+} usbif_request_t;
+
+/* Data we need to pass:
+ * - Transparently handle short packets or complain at us?
+ */
+
+typedef struct {
+    unsigned long   id;              /* copied from request         */
+    u8              operation;       /* copied from request         */
+    u8              data;            /* Small chunk of in-band data */
+    s16             status;          /* USBIF_RSP_???               */
+    unsigned long   transfer_mutex;  /* For cancelling requests atomically */
+    unsigned long    length;         /* How much data we really got */
+} usbif_response_t;
+
+#define USBIF_RSP_ERROR  -1 /* non-specific 'error' */
+#define USBIF_RSP_OKAY    0 /* non-specific 'okay'  */
+
+DEFINE_RING_TYPES(usbif, usbif_request_t, usbif_response_t);
+
+typedef struct {
+    unsigned long length; /* IN = expected, OUT = actual */
+    unsigned long buffer_offset;  /* IN offset in buffer specified in main
+                                     packet */
+    unsigned long status; /* OUT Status for this packet. */
+} usbif_iso_t;
+
+#endif /* __SHARED_USBIF_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/vmx_vlapic.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/io/vmx_vlapic.h
@@ -0,0 +1,57 @@
+#ifndef _VMX_VLAPIC_H
+#define _VMX_VLAPIC_H
+
+/*
+   We extended one bit for PIC type
+ */
+#define VLAPIC_DELIV_MODE_FIXED          0x0
+#define VLAPIC_DELIV_MODE_LPRI           0x1
+#define VLAPIC_DELIV_MODE_SMI            0x2
+#define VLAPIC_DELIV_MODE_NMI            0x4
+#define VLAPIC_DELIV_MODE_INIT           0x5
+#define VLAPIC_DELIV_MODE_STARTUP        0x6
+#define VLAPIC_DELIV_MODE_EXT            0x7
+#define VLAPIC_DELIV_MODE_MASK            0x8
+
+#define VLAPIC_MSG_LEVEL                4
+
+#define INTR_EXT   0
+#define INTR_APIC   1
+#define INTR_LAPIC  2
+
+#define VL_STATE_EOI    1
+#define VL_STATE_EXT_LOCK   2
+#define VL_STATE_MSG_LOCK   3
+#define VL_STATE_EOI_LOCK   3
+
+#define VLOCAL_APIC_MAX_INTS             256
+#define VLAPIC_INT_COUNT                (VLOCAL_APIC_MAX_INTS/(BITS_PER_BYTE * sizeof(u64)))
+#define VLAPIC_INT_COUNT_32             (VLOCAL_APIC_MAX_INTS/(BITS_PER_BYTE * sizeof(u32)))
+
+struct vapic_bus_message{
+   u8   deliv_mode:4;   /* deliver mode, including fixed, LPRI, etc */
+   u8   level:1;        /* level or edge */
+   u8   trig_mod:1;    /* assert or deassert */
+   u8   reserved:2;
+   u8   vector;
+};
+
+typedef struct {
+    /* interrupt for PIC and ext type IOAPIC interrupt */
+    u64   vl_ext_intr[VLAPIC_INT_COUNT];
+    u64   vl_ext_intr_mask[VLAPIC_INT_COUNT];
+    u64   vl_apic_intr[VLAPIC_INT_COUNT];
+    u64   vl_apic_tmr[VLAPIC_INT_COUNT];
+    u64   vl_eoi[VLAPIC_INT_COUNT];
+    u32   vl_lapic_id;
+    u32   direct_intr;
+    u32   vl_apr;
+    u32   vl_logical_dest;
+    u32   vl_dest_format;
+    u32   vl_arb_id;
+    u32   vl_state;
+    u32   apic_msg_count;
+    struct vapic_bus_message  vl_apic_msg[24];
+} vlapic_info;
+
+#endif /* _VMX_VLAPIC_H */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/physdev.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/physdev.h
@@ -0,0 +1,70 @@
+
+#ifndef __XEN_PUBLIC_PHYSDEV_H__
+#define __XEN_PUBLIC_PHYSDEV_H__
+
+/* Commands to HYPERVISOR_physdev_op() */
+#define PHYSDEVOP_IRQ_UNMASK_NOTIFY     4
+#define PHYSDEVOP_IRQ_STATUS_QUERY      5
+#define PHYSDEVOP_SET_IOPL              6
+#define PHYSDEVOP_SET_IOBITMAP          7
+#define PHYSDEVOP_APIC_READ             8
+#define PHYSDEVOP_APIC_WRITE            9
+#define PHYSDEVOP_ASSIGN_VECTOR         10
+
+typedef struct physdevop_irq_status_query {
+    /* IN */
+    u32 irq;
+    /* OUT */
+/* Need to call PHYSDEVOP_IRQ_UNMASK_NOTIFY when the IRQ has been serviced? */
+#define PHYSDEVOP_IRQ_NEEDS_UNMASK_NOTIFY (1<<0)
+    u32 flags;
+} physdevop_irq_status_query_t;
+
+typedef struct physdevop_set_iopl {
+    /* IN */
+    u32 iopl;
+} physdevop_set_iopl_t;
+
+typedef struct physdevop_set_iobitmap {
+    /* IN */
+    memory_t bitmap;
+    u32      nr_ports;
+} physdevop_set_iobitmap_t;
+
+typedef struct physdevop_apic {
+    /* IN */
+    u32 apic;
+    u32 offset;
+    /* IN or OUT */
+    u32 value;
+} physdevop_apic_t; 
+
+typedef struct physdevop_irq {
+    /* IN */
+    u32 irq;
+    /* OUT */
+    u32 vector;
+} physdevop_irq_t; 
+
+typedef struct physdev_op {
+    u32 cmd;
+    union {
+        physdevop_irq_status_query_t      irq_status_query;
+        physdevop_set_iopl_t              set_iopl;
+        physdevop_set_iobitmap_t          set_iobitmap;
+        physdevop_apic_t                  apic_op;
+        physdevop_irq_t                   irq_op;
+    } u;
+} physdev_op_t;
+
+#endif /* __XEN_PUBLIC_PHYSDEV_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/policy_ops.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/policy_ops.h
@@ -0,0 +1,70 @@
+/******************************************************************************
+ * policy_ops.h
+ * 
+ * Copyright (C) 2005 IBM Corporation
+ *
+ * Author:
+ * Reiner Sailer <sailer@xxxxxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License. 
+ *
+ * Process policy command requests from guest OS.
+ * access checked by policy; not restricted to DOM0
+ * 
+ */
+
+#ifndef __XEN_PUBLIC_POLICY_OPS_H__
+#define __XEN_PUBLIC_POLICY_OPS_H__
+
+#include "xen-public/xen.h"
+#include "xen-public/sched_ctl.h"
+
+/*
+ * Make sure you increment the interface version whenever you modify this file!
+ * This makes sure that old versions of policy tools will stop working in a
+ * well-defined way (rather than crashing the machine, for instance).
+ */
+#define POLICY_INTERFACE_VERSION   0xAAAA0003
+
+/************************************************************************/
+
+#define POLICY_SETPOLICY               4
+typedef struct policy_setpolicy {
+    /* IN variables. */
+    u16           policy_type;
+    /* OUT variables */
+    void         *pushcache;
+    u16           pushcache_size;
+} policy_setpolicy_t;          
+
+
+#define POLICY_GETPOLICY               5
+typedef struct policy_getpolicy {
+    /* IN variables. */
+    u16           policy_type;
+    /* OUT variables */
+    void         *pullcache;
+    u16           pullcache_size;
+} policy_getpolicy_t;       
+
+#define POLICY_DUMPSTATS               6
+typedef struct policy_dumpstats {
+    void         *pullcache;
+    u16           pullcache_size;
+} policy_dumpstats_t;            
+ 
+
+typedef struct policy_op {
+    u32 cmd;
+    u32 interface_version;       /* POLICY_INTERFACE_VERSION */
+    union {
+       policy_setpolicy_t       setpolicy;
+        policy_getpolicy_t       getpolicy;
+       policy_dumpstats_t       dumpstats;
+    } u;
+} policy_op_t;
+
+#endif /* __XEN_PUBLIC_POLICY_OPS_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/sched_ctl.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/sched_ctl.h
@@ -0,0 +1,58 @@
+/******************************************************************************
+ * Generic scheduler control interface.
+ *
+ * Mark Williamson, (C) 2004 Intel Research Cambridge
+ */
+
+#ifndef __XEN_PUBLIC_SCHED_CTL_H__
+#define __XEN_PUBLIC_SCHED_CTL_H__
+
+/* Scheduler types. */
+#define SCHED_BVT      0
+#define SCHED_SEDF     4
+
+/* Set or get info? */
+#define SCHED_INFO_PUT 0
+#define SCHED_INFO_GET 1
+
+/*
+ * Generic scheduler control command - used to adjust system-wide scheduler
+ * parameters
+ */
+struct sched_ctl_cmd {
+    u32 sched_id;
+    u32 direction;
+    union {
+        struct bvt_ctl {
+            u32 ctx_allow;
+        } bvt;
+    } u;
+};
+
+struct sched_adjdom_cmd {
+    u32     sched_id;
+    u32     direction;
+    domid_t domain;
+    union {
+        struct bvt_adjdom
+        {
+            u32 mcu_adv;            /* mcu advance: inverse of weight */
+            u32 warpback;           /* warp? */
+            s32 warpvalue;          /* warp value */
+            long long warpl;        /* warp limit */
+            long long warpu;        /* unwarp time requirement */
+        } bvt;
+        
+       struct sedf_adjdom
+        {
+            u64 period;
+            u64 slice;
+            u64 latency;
+            u16 extratime;
+           u16 weight;
+        } sedf;
+
+    } u;
+};
+
+#endif /* __XEN_PUBLIC_SCHED_CTL_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/trace.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/trace.h
@@ -0,0 +1,59 @@
+/******************************************************************************
+ * include/public/trace.h
+ * 
+ * Mark Williamson, (C) 2004 Intel Research Cambridge
+ * Copyright (C) 2005 Bin Ren
+ */
+
+#ifndef __XEN_PUBLIC_TRACE_H__
+#define __XEN_PUBLIC_TRACE_H__
+
+/* Trace classes */
+#define TRC_GEN     0x00010000    /* General trace            */
+#define TRC_SCHED   0x00020000    /* Xen Scheduler trace      */
+#define TRC_DOM0OP  0x00040000    /* Xen DOM0 operation trace */
+#define TRC_VMX     0x00080000    /* Xen VMX trace            */
+#define TRC_ALL     0xffff0000
+
+/* Trace events per class */
+
+#define TRC_SCHED_DOM_ADD       (TRC_SCHED +  1)
+#define TRC_SCHED_DOM_REM       (TRC_SCHED +  2)
+#define TRC_SCHED_SLEEP         (TRC_SCHED +  3)
+#define TRC_SCHED_WAKE          (TRC_SCHED +  4)
+#define TRC_SCHED_YIELD         (TRC_SCHED +  5)
+#define TRC_SCHED_BLOCK         (TRC_SCHED +  6)
+#define TRC_SCHED_SHUTDOWN      (TRC_SCHED +  7)
+#define TRC_SCHED_CTL           (TRC_SCHED +  8)
+#define TRC_SCHED_ADJDOM        (TRC_SCHED +  9)
+#define TRC_SCHED_SWITCH        (TRC_SCHED + 10)
+#define TRC_SCHED_S_TIMER_FN    (TRC_SCHED + 11)
+#define TRC_SCHED_T_TIMER_FN    (TRC_SCHED + 12)
+#define TRC_SCHED_DOM_TIMER_FN  (TRC_SCHED + 13)
+
+#define TRC_VMX_VMEXIT          (TRC_VMX + 1)
+#define TRC_VMX_VECTOR          (TRC_VMX + 2)
+#define TRC_VMX_INT             (TRC_VMX + 3)
+
+/* This structure represents a single trace buffer record. */
+struct t_rec {
+    u64 cycles;               /* cycle counter timestamp */
+    u32 event;                /* event ID                */
+    unsigned long data[5];    /* event data items        */
+};
+
+/*
+ * This structure contains the metadata for a single trace buffer.  The head
+ * field indexes into an array of struct t_rec's.
+ */
+struct t_buf {
+    /* Used by both Xen and user space. */
+    atomic_t      rec_idx;   /* the next record to save to */
+    unsigned int  rec_num;   /* number of records in this trace buffer  */
+    /* Used by Xen only. */
+    struct t_rec  *rec;      /* start of records */
+    /* Used by user space only. */
+    unsigned long rec_addr;  /* machine address of the start of records */
+};
+
+#endif /* __XEN_PUBLIC_TRACE_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/vmx_assist.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/vmx_assist.h
@@ -0,0 +1,101 @@
+/*
+ * vmx_assist.h: Context definitions for the VMXASSIST world switch.
+ *
+ * Leendert van Doorn, leendert@xxxxxxxxxxxxxx
+ * Copyright (c) 2005, International Business Machines Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ */
+#ifndef _VMX_ASSIST_H_
+#define _VMX_ASSIST_H_
+
+#define        VMXASSIST_BASE          0xD0000
+#define        VMXASSIST_MAGIC         0x17101966
+#define        VMXASSIST_MAGIC_OFFSET  (VMXASSIST_BASE+8)
+
+#define        VMXASSIST_NEW_CONTEXT   (VMXASSIST_BASE + 12)
+#define        VMXASSIST_OLD_CONTEXT   (VMXASSIST_NEW_CONTEXT + 4)
+
+#ifndef __ASSEMBLY__
+
+union vmcs_arbytes {
+       struct arbyte_fields {
+               unsigned int    seg_type        : 4,
+                               s               : 1,
+                               dpl             : 2,
+                               p               : 1, 
+                               reserved0       : 4,
+                               avl             : 1,
+                               reserved1       : 1,     
+                               default_ops_size: 1,
+                               g               : 1,
+                               null_bit        : 1, 
+                               reserved2       : 15;
+       } fields;
+       unsigned int bytes;
+};
+
+/*
+ * World switch state
+ */
+typedef struct vmx_assist_context {
+       u32             eip;            /* execution pointer */
+       u32             esp;            /* stack pointer */
+       u32             eflags;         /* flags register */
+       u32             cr0;
+       u32             cr3;            /* page table directory */
+       u32             cr4;
+       u32             idtr_limit;     /* idt */
+       u32             idtr_base;
+       u32             gdtr_limit;     /* gdt */
+       u32             gdtr_base;
+       u32             cs_sel;         /* cs selector */
+       u32             cs_limit;
+       u32             cs_base;
+       union vmcs_arbytes      cs_arbytes;
+       u32             ds_sel;         /* ds selector */
+       u32             ds_limit;
+       u32             ds_base;
+       union vmcs_arbytes      ds_arbytes;
+       u32             es_sel;         /* es selector */
+       u32             es_limit;
+       u32             es_base;
+       union vmcs_arbytes      es_arbytes;
+       u32             ss_sel;         /* ss selector */
+       u32             ss_limit;
+       u32             ss_base;
+       union vmcs_arbytes      ss_arbytes;
+       u32             fs_sel;         /* fs selector */
+       u32             fs_limit;
+       u32             fs_base;
+       union vmcs_arbytes      fs_arbytes;
+       u32             gs_sel;         /* gs selector */
+       u32             gs_limit;
+       u32             gs_base;
+       union vmcs_arbytes      gs_arbytes;
+       u32             tr_sel;         /* task selector */
+       u32             tr_limit;
+       u32             tr_base;
+       union vmcs_arbytes      tr_arbytes;
+       u32             ldtr_sel;       /* ldtr selector */
+       u32             ldtr_limit;
+       u32             ldtr_base;
+       union vmcs_arbytes      ldtr_arbytes;
+} vmx_assist_context_t;
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _VMX_ASSIST_H_ */
+
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/xen.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen-public/xen.h
@@ -0,0 +1,466 @@
+/******************************************************************************
+ * xen.h
+ * 
+ * Guest OS interface to Xen.
+ * 
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __XEN_PUBLIC_XEN_H__
+#define __XEN_PUBLIC_XEN_H__
+
+#if defined(__i386__)
+#include "xen-public/arch-x86_32.h"
+#elif defined(__x86_64__)
+#include "xen-public/arch-x86_64.h"
+#elif defined(__ia64__)
+#include "xen-public/arch-ia64.h"
+#else
+#error "Unsupported architecture"
+#endif
+
+/*
+ * XEN "SYSTEM CALLS" (a.k.a. HYPERCALLS).
+ */
+
+/*
+ * x86_32: EAX = vector; EBX, ECX, EDX, ESI, EDI = args 1, 2, 3, 4, 5.
+ *         EAX = return value
+ *         (argument registers may be clobbered on return)
+ * x86_64: RAX = vector; RDI, RSI, RDX, R10, R8, R9 = args 1, 2, 3, 4, 5, 6. 
+ *         RAX = return value
+ *         (argument registers not clobbered on return; RCX, R11 are)
+ */
+#define __HYPERVISOR_set_trap_table        0
+#define __HYPERVISOR_mmu_update            1
+#define __HYPERVISOR_set_gdt               2
+#define __HYPERVISOR_stack_switch          3
+#define __HYPERVISOR_set_callbacks         4
+#define __HYPERVISOR_fpu_taskswitch        5
+#define __HYPERVISOR_sched_op              6
+#define __HYPERVISOR_dom0_op               7
+#define __HYPERVISOR_set_debugreg          8
+#define __HYPERVISOR_get_debugreg          9
+#define __HYPERVISOR_update_descriptor    10
+#define __HYPERVISOR_dom_mem_op           12
+#define __HYPERVISOR_multicall            13
+#define __HYPERVISOR_update_va_mapping    14
+#define __HYPERVISOR_set_timer_op         15
+#define __HYPERVISOR_event_channel_op     16
+#define __HYPERVISOR_xen_version          17
+#define __HYPERVISOR_console_io           18
+#define __HYPERVISOR_physdev_op           19
+#define __HYPERVISOR_grant_table_op       20
+#define __HYPERVISOR_vm_assist            21
+#define __HYPERVISOR_update_va_mapping_otherdomain 22
+#define __HYPERVISOR_switch_vm86          23 /* x86/32 only */
+#define __HYPERVISOR_switch_to_user       23 /* x86/64 only */
+#define __HYPERVISOR_boot_vcpu            24
+#define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
+#define __HYPERVISOR_mmuext_op            26
+#define __HYPERVISOR_policy_op            27
+
+/* 
+ * VIRTUAL INTERRUPTS
+ * 
+ * Virtual interrupts that a guest OS may receive from Xen.
+ */
+#define VIRQ_TIMER      0  /* Timebase update, and/or requested timeout.  */
+#define VIRQ_DEBUG      1  /* Request guest to dump debug info.           */
+#define VIRQ_CONSOLE    2  /* (DOM0) bytes received on emergency console. */
+#define VIRQ_DOM_EXC    3  /* (DOM0) Exceptional event for some domain.   */
+#define VIRQ_PARITY_ERR 4  /* (DOM0) NMI parity error.                    */
+#define VIRQ_IO_ERR     5  /* (DOM0) NMI I/O error.                       */
+#define VIRQ_DEBUGGER   6  /* (DOM0) A domain has paused for debugging.   */
+#define NR_VIRQS        7
+
+/*
+ * MMU-UPDATE REQUESTS
+ * 
+ * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
+ * A foreigndom (FD) can be specified (or DOMID_SELF for none).
+ * Where the FD has some effect, it is described below.
+ * ptr[1:0] specifies the appropriate MMU_* command.
+ * 
+ * ptr[1:0] == MMU_NORMAL_PT_UPDATE:
+ * Updates an entry in a page table. If updating an L1 table, and the new
+ * table entry is valid/present, the mapped frame must belong to the FD, if
+ * an FD has been specified. If attempting to map an I/O page then the
+ * caller assumes the privilege of the FD.
+ * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
+ * FD == DOMID_XEN: Map restricted areas of Xen's heap space.
+ * ptr[:2]  -- Machine address of the page-table entry to modify.
+ * val      -- Value to write.
+ * 
+ * ptr[1:0] == MMU_MACHPHYS_UPDATE:
+ * Updates an entry in the machine->pseudo-physical mapping table.
+ * ptr[:2]  -- Machine address within the frame whose mapping to modify.
+ *             The frame must belong to the FD, if one is specified.
+ * val      -- Value to write into the mapping entry.
+ */
+#define MMU_NORMAL_PT_UPDATE     0 /* checked '*ptr = val'. ptr is MA.       */
+#define MMU_MACHPHYS_UPDATE      1 /* ptr = MA of frame to modify entry for  */
+
+/*
+ * MMU EXTENDED OPERATIONS
+ * 
+ * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
+ * A foreigndom (FD) can be specified (or DOMID_SELF for none).
+ * Where the FD has some effect, it is described below.
+ * 
+ * cmd: MMUEXT_(UN)PIN_*_TABLE
+ * mfn: Machine frame number to be (un)pinned as a p.t. page.
+ *      The frame must belong to the FD, if one is specified.
+ * 
+ * cmd: MMUEXT_NEW_BASEPTR
+ * mfn: Machine frame number of new page-table base to install in MMU.
+ * 
+ * cmd: MMUEXT_NEW_USER_BASEPTR [x86/64 only]
+ * mfn: Machine frame number of new page-table base to install in MMU
+ *      when in user space.
+ * 
+ * cmd: MMUEXT_TLB_FLUSH_LOCAL
+ * No additional arguments. Flushes local TLB.
+ * 
+ * cmd: MMUEXT_INVLPG_LOCAL
+ * linear_addr: Linear address to be flushed from the local TLB.
+ * 
+ * cmd: MMUEXT_TLB_FLUSH_MULTI
+ * vcpumask: Pointer to bitmap of VCPUs to be flushed.
+ * 
+ * cmd: MMUEXT_INVLPG_MULTI
+ * linear_addr: Linear address to be flushed.
+ * vcpumask: Pointer to bitmap of VCPUs to be flushed.
+ * 
+ * cmd: MMUEXT_TLB_FLUSH_ALL
+ * No additional arguments. Flushes all VCPUs' TLBs.
+ * 
+ * cmd: MMUEXT_INVLPG_ALL
+ * linear_addr: Linear address to be flushed from all VCPUs' TLBs.
+ * 
+ * cmd: MMUEXT_FLUSH_CACHE
+ * No additional arguments. Writes back and flushes cache contents.
+ * 
+ * cmd: MMUEXT_SET_LDT
+ * linear_addr: Linear address of LDT base (NB. must be page-aligned).
+ * nr_ents: Number of entries in LDT.
+ * 
+ * cmd: MMUEXT_REASSIGN_PAGE
+ * mfn: Machine frame number to be reassigned to the FD.
+ *      (NB. page must currently belong to the calling domain).
+ */
+#define MMUEXT_PIN_L1_TABLE      0
+#define MMUEXT_PIN_L2_TABLE      1
+#define MMUEXT_PIN_L3_TABLE      2
+#define MMUEXT_PIN_L4_TABLE      3
+#define MMUEXT_UNPIN_TABLE       4
+#define MMUEXT_NEW_BASEPTR       5
+#define MMUEXT_TLB_FLUSH_LOCAL   6
+#define MMUEXT_INVLPG_LOCAL      7
+#define MMUEXT_TLB_FLUSH_MULTI   8
+#define MMUEXT_INVLPG_MULTI      9
+#define MMUEXT_TLB_FLUSH_ALL    10
+#define MMUEXT_INVLPG_ALL       11
+#define MMUEXT_FLUSH_CACHE      12
+#define MMUEXT_SET_LDT          13
+#define MMUEXT_REASSIGN_PAGE    14
+#define MMUEXT_NEW_USER_BASEPTR 15
+
+#ifndef __ASSEMBLY__
+struct mmuext_op {
+    unsigned int cmd;
+    union {
+        /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR, REASSIGN_PAGE */
+        memory_t mfn;
+        /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
+        memory_t linear_addr;
+    };
+    union {
+        /* SET_LDT */
+        unsigned int nr_ents;
+        /* TLB_FLUSH_MULTI, INVLPG_MULTI */
+        void *vcpumask;
+    };
+};
+#endif
+
+/* These are passed as 'flags' to update_va_mapping. They can be ORed. */
+/* When specifying UVMF_MULTI, also OR in a pointer to a CPU bitmap.   */
+/* UVMF_LOCAL is merely UVMF_MULTI with a NULL bitmap pointer.         */
+#define UVMF_NONE               (0UL<<0) /* No flushing at all.   */
+#define UVMF_TLB_FLUSH          (1UL<<0) /* Flush entire TLB(s).  */
+#define UVMF_INVLPG             (2UL<<0) /* Flush only one entry. */
+#define UVMF_FLUSHTYPE_MASK     (3UL<<0)
+#define UVMF_MULTI              (0UL<<2) /* Flush subset of TLBs. */
+#define UVMF_LOCAL              (0UL<<2) /* Flush local TLB.      */
+#define UVMF_ALL                (1UL<<2) /* Flush all TLBs.       */
+
+/*
+ * Commands to HYPERVISOR_sched_op().
+ */
+#define SCHEDOP_yield           0   /* Give up the CPU voluntarily.       */
+#define SCHEDOP_block           1   /* Block until an event is received.  */
+#define SCHEDOP_shutdown        2   /* Stop executing this domain.        */
+#define SCHEDOP_vcpu_down       3   /* make target VCPU not-runnable.     */
+#define SCHEDOP_vcpu_up         4   /* make target VCPU runnable.         */
+#define SCHEDOP_cmdmask       255   /* 8-bit command. */
+#define SCHEDOP_reasonshift     8   /* 8-bit reason code. (SCHEDOP_shutdown) */
+#define SCHEDOP_vcpushift       8   /* 8-bit VCPU target. (SCHEDOP_vcpu_up|down) */
+
+/*
+ * Reason codes for SCHEDOP_shutdown. These may be interpreted by control 
+ * software to determine the appropriate action. For the most part, Xen does
+ * not care about the shutdown code (SHUTDOWN_crash excepted).
+ */
+#define SHUTDOWN_poweroff   0  /* Domain exited normally. Clean up and kill. */
+#define SHUTDOWN_reboot     1  /* Clean up, kill, and then restart.          */
+#define SHUTDOWN_suspend    2  /* Clean up, save suspend info, kill.         */
+#define SHUTDOWN_crash      3  /* Tell controller we've crashed.             */
+
+/*
+ * Commands to HYPERVISOR_console_io().
+ */
+#define CONSOLEIO_write         0
+#define CONSOLEIO_read          1
+
+/*
+ * Commands to HYPERVISOR_dom_mem_op().
+ */
+#define MEMOP_increase_reservation 0
+#define MEMOP_decrease_reservation 1
+
+/*
+ * Commands to HYPERVISOR_vm_assist().
+ */
+#define VMASST_CMD_enable                0
+#define VMASST_CMD_disable               1
+#define VMASST_TYPE_4gb_segments         0
+#define VMASST_TYPE_4gb_segments_notify  1
+#define VMASST_TYPE_writable_pagetables  2
+#define MAX_VMASST_TYPE 2
+
+#ifndef __ASSEMBLY__
+
+typedef u16 domid_t;
+
+/* Domain ids >= DOMID_FIRST_RESERVED cannot be used for ordinary domains. */
+#define DOMID_FIRST_RESERVED (0x7FF0U)
+
+/* DOMID_SELF is used in certain contexts to refer to oneself. */
+#define DOMID_SELF (0x7FF0U)
+
+/*
+ * DOMID_IO is used to restrict page-table updates to mapping I/O memory.
+ * Although no Foreign Domain need be specified to map I/O pages, DOMID_IO
+ * is useful to ensure that no mappings to the OS's own heap are accidentally
+ * installed. (e.g., in Linux this could cause havoc as reference counts
+ * aren't adjusted on the I/O-mapping code path).
+ * This only makes sense in MMUEXT_SET_FOREIGNDOM, but in that context can
+ * be specified by any calling domain.
+ */
+#define DOMID_IO   (0x7FF1U)
+
+/*
+ * DOMID_XEN is used to allow privileged domains to map restricted parts of
+ * Xen's heap space (e.g., the machine_to_phys table).
+ * This only makes sense in MMUEXT_SET_FOREIGNDOM, and is only permitted if
+ * the caller is privileged.
+ */
+#define DOMID_XEN  (0x7FF2U)
+
+/*
+ * Send an array of these to HYPERVISOR_mmu_update().
+ * NB. Both fields are declared u64, i.e. fixed 64-bit regardless of pointer size.
+ */
+typedef struct
+{
+    u64 ptr;       /* Machine address of PTE. */
+    u64 val;       /* New contents of PTE.    */
+} mmu_update_t;
+
+/*
+ * Send an array of these to HYPERVISOR_multicall().
+ * NB. The fields are natural register size for this architecture.
+ */
+typedef struct
+{
+    unsigned long op, result;
+    unsigned long args[6];
+} multicall_entry_t;
+
+/* Event channel endpoints per domain. */
+#define NR_EVENT_CHANNELS 1024
+
+/*
+ * Per-VCPU information goes here. This will be cleaned up more when Xen 
+ * actually supports multi-VCPU guests.
+ */
+typedef struct vcpu_info {
+    /*
+     * 'evtchn_upcall_pending' is written non-zero by Xen to indicate
+     * a pending notification for a particular VCPU. It is then cleared 
+     * by the guest OS /before/ checking for pending work, thus avoiding
+     * a set-and-check race. Note that the mask is only accessed by Xen
+     * on the CPU that is currently hosting the VCPU. This means that the
+     * pending and mask flags can be updated by the guest without special
+     * synchronisation (i.e., no need for the x86 LOCK prefix).
+     * This may seem suboptimal because if the pending flag is set by
+     * a different CPU then an IPI may be scheduled even when the mask
+     * is set. However, note:
+     *  1. The task of 'interrupt holdoff' is covered by the per-event-
+     *     channel mask bits. A 'noisy' event that is continually being
+     *     triggered can be masked at source at this very precise
+     *     granularity.
+     *  2. The main purpose of the per-VCPU mask is therefore to restrict
+     *     reentrant execution: whether for concurrency control, or to
+     *     prevent unbounded stack usage. Whatever the purpose, we expect
+     *     that the mask will be asserted only for short periods at a time,
+     *     and so the likelihood of a 'spurious' IPI is suitably small.
+     * The mask is read before making an event upcall to the guest: a
+     * non-zero mask therefore guarantees that the VCPU will not receive
+     * an upcall activation. The mask is cleared when the VCPU requests
+     * to block: this avoids wakeup-waiting races.
+     */
+    u8 evtchn_upcall_pending;
+    u8 evtchn_upcall_mask;
+    u32 evtchn_pending_sel;
+#ifdef __ARCH_HAS_VCPU_INFO
+    arch_vcpu_info_t arch;
+#endif
+} vcpu_info_t;
+
+typedef struct vcpu_time_info {
+    /*
+     * The following values are updated periodically (and not necessarily
+     * atomically!). The guest OS detects this because 'time_version1' is
+     * incremented just before updating these values, and 'time_version2' is
+     * incremented immediately after. See the Xen-specific Linux code for an
+     * example of how to read these values safely (arch/xen/kernel/time.c).
+     */
+    u32 time_version1;
+    u32 time_version2;
+    u64 tsc_timestamp;   /* TSC at last update of time vals.  */
+    u64 system_time;     /* Time, in nanosecs, since boot.    */
+    /*
+     * Current system time:
+     *   system_time + ((tsc - tsc_timestamp) << tsc_shift) * tsc_to_system_mul
+     * CPU frequency (Hz):
+     *   ((10^9 << 32) / tsc_to_system_mul) >> tsc_shift
+     */
+    u32 tsc_to_system_mul;
+    s8  tsc_shift;
+} vcpu_time_info_t;
+
+/*
+ * Xen/kernel shared data -- pointer provided in start_info.
+ * NB. We expect that this struct is smaller than a page.
+ */
+typedef struct shared_info {
+    vcpu_info_t vcpu_data[MAX_VIRT_CPUS];
+
+    vcpu_time_info_t vcpu_time[MAX_VIRT_CPUS];
+
+    u32 n_vcpu;
+
+    /*
+     * A domain can have up to 1024 "event channels" on which it can send
+     * and receive asynchronous event notifications. There are three classes
+     * of event that are delivered by this mechanism:
+     *  1. Bi-directional inter- and intra-domain connections. Domains must
+     *     arrange out-of-band to set up a connection (usually the setup
+     *     is initiated and organised by a privileged third party such as
+     *     software running in domain 0).
+     *  2. Physical interrupts. A domain with suitable hardware-access
+     *     privileges can bind an event-channel port to a physical interrupt
+     *     source.
+     *  3. Virtual interrupts ('events'). A domain can bind an event-channel
+     *     port to a virtual interrupt source, such as the virtual-timer
+     *     device or the emergency console.
+     * 
+     * Event channels are addressed by a "port index" between 0 and 1023.
+     * Each channel is associated with two bits of information:
+     *  1. PENDING -- notifies the domain that there is a pending notification
+     *     to be processed. This bit is cleared by the guest.
+     *  2. MASK -- if this bit is clear then a 0->1 transition of PENDING
+     *     will cause an asynchronous upcall to be scheduled. This bit is only
+     *     updated by the guest. It is read-only within Xen. If a channel
+     *     becomes pending while the channel is masked then the 'edge' is lost
+     *     (i.e., when the channel is unmasked, the guest must manually handle
+     *     pending notifications as no upcall will be scheduled by Xen).
+     * 
+     * To expedite scanning of pending notifications, any 0->1 pending
+     * transition on an unmasked channel causes a corresponding bit in a
+     * 32-bit selector to be set. Each bit in the selector covers a 32-bit
+     * word in the PENDING bitfield array.
+     */
+    u32 evtchn_pending[32];
+    u32 evtchn_mask[32];
+
+    /*
+     * Wallclock time: updated only by control software. Guests should base
+     * their gettimeofday() syscall on this wallclock-base value.
+     */
+    u32                wc_sec;          /* Secs  00:00:00 UTC, Jan 1, 1970.  */
+    u32                wc_usec;         /* Usecs 00:00:00 UTC, Jan 1, 1970.  */
+
+    arch_shared_info_t arch;
+
+} shared_info_t;
+
+/*
+ * Start-of-day memory layout for the initial domain (DOM0):
+ *  1. The domain is started within contiguous virtual-memory region.
+ *  2. The contiguous region begins and ends on an aligned 4MB boundary.
+ *  3. The region start corresponds to the load address of the OS image.
+ *     If the load address is not 4MB aligned then the address is rounded down.
+ *  4. This is the order of bootstrap elements in the initial virtual region:
+ *      a. relocated kernel image
+ *      b. initial ram disk              [mod_start, mod_len]
+ *      c. list of allocated page frames [mfn_list, nr_pages]
+ *      d. bootstrap page tables         [pt_base, CR3 (x86)]
+ *      e. start_info_t structure        [register ESI (x86)]
+ *      f. bootstrap stack               [register ESP (x86)]
+ *  5. Bootstrap elements are packed together, but each is 4kB-aligned.
+ *  6. The initial ram disk may be omitted.
+ *  7. The list of page frames forms a contiguous 'pseudo-physical' memory
+ *     layout for the domain. In particular, the bootstrap virtual-memory
+ *     region is a 1:1 mapping to the first section of the pseudo-physical map.
+ *  8. All bootstrap elements are mapped read-writable for the guest OS. The
+ *     only exception is the bootstrap page table, which is mapped read-only.
+ *  9. There is guaranteed to be at least 512kB padding after the final
+ *     bootstrap element. If necessary, the bootstrap virtual region is
+ *     extended by an extra 4MB to ensure this.
+ */
+
+#define MAX_GUEST_CMDLINE 1024
+typedef struct start_info {
+    /* THE FOLLOWING ARE FILLED IN BOTH ON INITIAL BOOT AND ON RESUME.    */
+    memory_t nr_pages;        /* Total pages allocated to this domain.    */
+    memory_t shared_info;     /* MACHINE address of shared info struct.   */
+    u32      flags;           /* SIF_xxx flags.                           */
+    u16      domain_controller_evtchn;
+    /* THE FOLLOWING ARE ONLY FILLED IN ON INITIAL BOOT (NOT RESUME).     */
+    memory_t pt_base;         /* VIRTUAL address of page directory.       */
+    memory_t nr_pt_frames;    /* Number of bootstrap p.t. frames.         */
+    memory_t mfn_list;        /* VIRTUAL address of page-frame list.      */
+    memory_t mod_start;       /* VIRTUAL address of pre-loaded module.    */
+    memory_t mod_len;         /* Size (bytes) of pre-loaded module.       */
+    s8 cmd_line[MAX_GUEST_CMDLINE];
+    memory_t store_mfn;       /* MACHINE page number of shared page.      */
+    u16      store_evtchn;    /* Event channel for store communication.   */
+} start_info_t;
+
+/* These flags are passed in the 'flags' field of start_info_t. */
+#define SIF_PRIVILEGED    (1<<0)  /* Is the domain privileged? */
+#define SIF_INITDOMAIN    (1<<1)  /* Is this the initial control domain? */
+#define SIF_BLK_BE_DOMAIN (1<<4)  /* Is this a block backend domain? */
+#define SIF_NET_BE_DOMAIN (1<<5)  /* Is this a net backend domain? */
+#define SIF_USB_BE_DOMAIN (1<<6)  /* Is this a usb backend domain? */
+/* For use in guest OSes. */
+extern shared_info_t *HYPERVISOR_shared_info;
+
+typedef u64 cpumap_t;
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __XEN_PUBLIC_XEN_H__ */
Index: linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_synch_bitops.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/asm-i386/mach-xen/xen_synch_bitops.h
@@ -0,0 +1,140 @@
+#ifndef __XEN_SYNCH_BITOPS_H__
+#define __XEN_SYNCH_BITOPS_H__
+
+/*
+ * Copyright 1992, Linus Torvalds.
+ * Heavily modified to provide guaranteed strong synchronisation
+ * when communicating with Xen or other guest OSes running on other CPUs.
+ */
+
+#include <linux/config.h>
+
+#define ADDR (*(volatile long *) addr)
+
+static __inline__ void synch_set_bit(int nr, volatile void * addr)
+{   /* Set bit 'nr' of the bitmap at 'addr' using a lock-prefixed btsl. */
+    __asm__ __volatile__ ( 
+        "lock btsl %1,%0"
+        : "=m" (ADDR) : "Ir" (nr) : "memory" ); /* NOTE(review): operand is read-modify-write; "=m" relies on the "memory" clobber. */
+}
+
+static __inline__ void synch_clear_bit(int nr, volatile void * addr)
+{   /* Clear bit 'nr' of the bitmap at 'addr' using a lock-prefixed btrl. */
+    __asm__ __volatile__ (
+        "lock btrl %1,%0"
+        : "=m" (ADDR) : "Ir" (nr) : "memory" ); /* NOTE(review): operand is read-modify-write; "=m" relies on the "memory" clobber. */
+}
+
+static __inline__ void synch_change_bit(int nr, volatile void * addr)
+{   /* Toggle bit 'nr' of the bitmap at 'addr' using a lock-prefixed btcl. */
+    __asm__ __volatile__ (
+        "lock btcl %1,%0"
+        : "=m" (ADDR) : "Ir" (nr) : "memory" ); /* NOTE(review): operand is read-modify-write; "=m" relies on the "memory" clobber. */
+}
+
+static __inline__ int synch_test_and_set_bit(int nr, volatile void * addr)
+{   /* Set bit 'nr' of the bitmap at 'addr' (lock btsl) and report its previous value. */
+    int oldbit; /* 0 if the bit was clear, -1 (all ones) if it was already set */
+    __asm__ __volatile__ (
+        "lock btsl %2,%1\n\tsbbl %0,%0" /* btsl leaves the old bit in CF; sbbl turns CF into 0 / -1 */
+        : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
+    return oldbit;
+}
+
+static __inline__ int synch_test_and_clear_bit(int nr, volatile void * addr)
+{   /* Clear bit 'nr' of the bitmap at 'addr' (lock btrl) and report its previous value. */
+    int oldbit; /* 0 if the bit was clear, -1 (all ones) if it was set */
+    __asm__ __volatile__ (
+        "lock btrl %2,%1\n\tsbbl %0,%0" /* btrl leaves the old bit in CF; sbbl turns CF into 0 / -1 */
+        : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
+    return oldbit;
+}
+
+static __inline__ int synch_test_and_change_bit(int nr, volatile void * addr)
+{   /* Toggle bit 'nr' of the bitmap at 'addr' (lock btcl) and report its previous value. */
+    int oldbit; /* 0 if the bit was clear, -1 (all ones) if it was set */
+
+    __asm__ __volatile__ (
+        "lock btcl %2,%1\n\tsbbl %0,%0" /* btcl leaves the old bit in CF; sbbl turns CF into 0 / -1 */
+        : "=r" (oldbit), "=m" (ADDR) : "Ir" (nr) : "memory");
+    return oldbit;
+}
+
+struct __synch_xchg_dummy { unsigned long a[100]; }; /* Dummy type, used only to form the "m" operand in __synch_cmpxchg(). */
+#define __synch_xg(x) ((struct __synch_xchg_dummy *)(x)) /* Cast any pointer to the dummy type. */
+
+#define synch_cmpxchg(ptr, old, new) \
+((__typeof__(*(ptr)))__synch_cmpxchg((ptr),\
+                                     (unsigned long)(old), \
+                                     (unsigned long)(new), \
+                                     sizeof(*(ptr)))) /* Operand width comes from sizeof(*(ptr)); result is cast back to the pointee type. */
+
+static inline unsigned long __synch_cmpxchg(volatile void *ptr,
+                                           unsigned long old,
+                                           unsigned long new, int size)
+{      /* Lock-prefixed compare-and-exchange on the 'size'-byte object at 'ptr'. */
+       unsigned long prev; /* eax after cmpxchg; equals 'old' iff 'new' was stored */
+       switch (size) { /* pick the cmpxchg width matching the operand size */
+       case 1:
+               __asm__ __volatile__("lock; cmpxchgb %b1,%2"
+                                    : "=a"(prev)
+                                    : "q"(new), "m"(*__synch_xg(ptr)),
+                                      "0"(old) /* "0": 'old' is preloaded into the same register as 'prev' */
+                                    : "memory");
+               return prev;
+       case 2:
+               __asm__ __volatile__("lock; cmpxchgw %w1,%2"
+                                    : "=a"(prev)
+                                    : "q"(new), "m"(*__synch_xg(ptr)),
+                                      "0"(old)
+                                    : "memory");
+               return prev;
+#ifdef CONFIG_X86_64 /* the 8-byte case exists only in this configuration */
+       case 4:
+               __asm__ __volatile__("lock; cmpxchgl %k1,%2"
+                                    : "=a"(prev)
+                                    : "q"(new), "m"(*__synch_xg(ptr)),
+                                      "0"(old)
+                                    : "memory");
+               return prev;
+       case 8:
+               __asm__ __volatile__("lock; cmpxchgq %1,%2"
+                                    : "=a"(prev)
+                                    : "q"(new), "m"(*__synch_xg(ptr)),
+                                      "0"(old)
+                                    : "memory");
+               return prev;
+#else
+       case 4:
+               __asm__ __volatile__("lock; cmpxchgl %1,%2"
+                                    : "=a"(prev)
+                                    : "q"(new), "m"(*__synch_xg(ptr)),
+                                      "0"(old)
+                                    : "memory");
+               return prev;
+#endif
+       }
+       return old; /* unsupported size: memory is untouched, yet 'old' is returned (looks like success) */
+}
+
+static __inline__ int synch_const_test_bit(int nr, const volatile void * addr)
+{   /* Plain C bit test: word index is nr >> 5, bit within the word is nr & 31. */
+    const volatile unsigned int *words = (const volatile unsigned int *) addr;
+    return (words[nr >> 5] & (1UL << (nr & 31))) != 0;
+}
+
+static __inline__ int synch_var_test_bit(int nr, volatile void * addr)
+{   /* Test bit 'nr' of the bitmap at 'addr' with btl; returns 0 if clear, -1 if set. */
+    int oldbit; /* "memory" clobber below: btl may read a word beyond the one "m" (ADDR) names */
+    __asm__ __volatile__ (
+        "btl %2,%1\n\tsbbl %0,%0" /* btl copies the bit into CF; sbbl turns CF into 0 / -1 */
+        : "=r" (oldbit) : "m" (ADDR), "Ir" (nr) : "memory" );
+    return oldbit;
+}
+
+#define synch_test_bit(nr,addr) \
+(__builtin_constant_p(nr) ? \
+ synch_const_test_bit((nr),(addr)) : \
+ synch_var_test_bit((nr),(addr))) /* compile-time-constant 'nr' uses the C version, otherwise the btl version */
+
+#endif /* __XEN_SYNCH_BITOPS_H__ */
Index: linux-2.6.12-xen0-arch/include/linux/xen/privcmd.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/linux/xen/privcmd.h
@@ -0,0 +1,90 @@
+/******************************************************************************
+ * privcmd.h
+ * 
+ * Interface to /proc/xen/privcmd.
+ * 
+ * Copyright (c) 2003-2004, K A Fraser
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __PRIVCMD_H__
+#define __PRIVCMD_H__
+
+typedef struct privcmd_hypercall /* Argument block for IOCTL_PRIVCMD_HYPERCALL. */
+{
+    unsigned long op;     /* NOTE(review): presumably the hypercall number - confirm. */
+    unsigned long arg[5]; /* Five argument slots; presumably the hypercall arguments - confirm. */
+} privcmd_hypercall_t;
+
+typedef struct privcmd_mmap_entry { /* Element type of privcmd_mmap.entry. */
+    unsigned long va;     /* NOTE(review): presumably the virtual address to map at - confirm. */
+    unsigned long mfn;    /* NOTE(review): presumably the first machine frame number - confirm. */
+    unsigned long npages; /* NOTE(review): presumably the number of pages in the range - confirm. */
+} privcmd_mmap_entry_t; 
+
+typedef struct privcmd_mmap { /* Its size is encoded in IOCTL_PRIVCMD_MMAP. */
+    int num; /* NOTE(review): presumably the number of elements at 'entry' - confirm. */
+    domid_t dom; /* target domain */
+    privcmd_mmap_entry_t *entry; /* pointer to privcmd_mmap_entry_t element(s) */
+} privcmd_mmap_t; 
+
+typedef struct privcmd_mmapbatch { /* Its size is encoded in IOCTL_PRIVCMD_MMAPBATCH. */
+    int num;     /* number of pages to populate */
+    domid_t dom; /* target domain */
+    unsigned long addr;  /* virtual address */
+    unsigned long *arr; /* array of mfns - top nibble of an entry is set on error */
+} privcmd_mmapbatch_t; 
+
+typedef struct privcmd_blkmsg /* NOTE(review): no ioctl in this header references this type - confirm it is still used. */
+{
+    unsigned long op;       /* NOTE(review): presumably an operation code - confirm. */
+    void         *buf;      /* pointer to a caller-supplied buffer */
+    int           buf_size; /* NOTE(review): presumably the size of 'buf' in bytes - confirm. */
+} privcmd_blkmsg_t;
+
+/*
+ * @cmd: IOCTL_PRIVCMD_HYPERCALL
+ * @arg: &privcmd_hypercall_t
+ * Return: Value returned from execution of the specified hypercall.
+ */
+#define IOCTL_PRIVCMD_HYPERCALL         \
+    _IOC(_IOC_NONE, 'P', 0, sizeof(privcmd_hypercall_t))
+
+/*
+ * @cmd: IOCTL_PRIVCMD_INITDOMAIN_EVTCHN
+ * @arg: n/a
+ * Return: Port associated with domain-controller end of control event channel
+ *         for the initial domain.
+ */
+#define IOCTL_PRIVCMD_INITDOMAIN_EVTCHN \
+    _IOC(_IOC_NONE, 'P', 1, 0)
+#define IOCTL_PRIVCMD_MMAP             \
+    _IOC(_IOC_NONE, 'P', 2, sizeof(privcmd_mmap_t)) /* nr 2; size field is sizeof(privcmd_mmap_t) */
+#define IOCTL_PRIVCMD_MMAPBATCH             \
+    _IOC(_IOC_NONE, 'P', 3, sizeof(privcmd_mmapbatch_t)) /* nr 3; size field is sizeof(privcmd_mmapbatch_t) */
+#define IOCTL_PRIVCMD_GET_MACH2PHYS_START_MFN \
+    _IOC(_IOC_READ, 'P', 4, sizeof(unsigned long)) /* nr 4; _IOC_READ with an unsigned long payload */
+#define IOCTL_PRIVCMD_INITDOMAIN_STORE \
+    _IOC(_IOC_READ, 'P', 5, 0) /* nr 5; _IOC_READ direction but size field 0 */
+
+#endif /* __PRIVCMD_H__ */
Index: linux-2.6.12-xen0-arch/include/linux/xen/suspend.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/linux/xen/suspend.h
@@ -0,0 +1,43 @@
+/******************************************************************************
+ * suspend.h
+ * 
+ * Copyright (c) 2003-2004, K A Fraser
+ * 
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __ASM_XEN_SUSPEND_H__
+#define __ASM_XEN_SUSPEND_H__
+
+typedef struct suspend_record_st {
+    /* A start_info_t describing the resumed domain; to be filled in before resume. */
+    start_info_t resume_info;
+    /*
+     * The number of a machine frame containing, in sequence, the number of
+     * each machine frame that contains PFN -> MFN translation table data.
+     */
+    unsigned long pfn_to_mfn_frame_list;
+    /* Number of entries in the PFN -> MFN translation table. */
+    unsigned long nr_pfns;
+} suspend_record_t;
+
+#endif /* __ASM_XEN_SUSPEND_H__ */
Index: linux-2.6.12-xen0-arch/include/linux/xen/xen_proc.h
===================================================================
--- /dev/null
+++ linux-2.6.12-xen0-arch/include/linux/xen/xen_proc.h
@@ -0,0 +1,12 @@
+#ifndef __LINUX_XEN_PROC_H__
+#define __LINUX_XEN_PROC_H__
+
+#include <linux/config.h>
+#include <linux/proc_fs.h>
+
+extern struct proc_dir_entry *create_xen_proc_entry(
+    const char *name, mode_t mode); /* Defined elsewhere; presumably creates proc entry 'name' with 'mode' - confirm. */
+extern void remove_xen_proc_entry(
+    const char *name); /* Defined elsewhere; presumably removes the entry created under 'name' - confirm. */
+
+#endif /* __LINUX_XEN_PROC_H__ */

--


_______________________________________________
Xen-merge mailing list
Xen-merge@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-merge