Here's the output from the script. The diffstat can serve as a guiding list
of the files to be converted. (Kconfig and Makefiles can be ignored.)
arch/i386/Kconfig | 718 +++++----------------
arch/i386/Makefile | 128 ---
arch/i386/kernel/Makefile | 79 +-
arch/i386/kernel/acpi/Makefile | 15
arch/i386/kernel/acpi/boot.c | 26
arch/i386/kernel/apic.c | 1201 ------------------------------------
arch/i386/kernel/cpu/Makefile | 34 -
arch/i386/kernel/cpu/common.c | 58 -
arch/i386/kernel/cpu/mtrr/Makefile | 19
arch/i386/kernel/cpu/mtrr/main.c | 629 +-----------------
arch/i386/kernel/entry.S | 358 ++++++----
arch/i386/kernel/head.S | 457 +------------
arch/i386/kernel/i386_ksyms.c | 2
arch/i386/kernel/io_apic.c | 104 ++-
arch/i386/kernel/ioport.c | 74 --
arch/i386/kernel/irq.c | 66 +
arch/i386/kernel/ldt.c | 32
arch/i386/kernel/microcode.c | 375 -----------
arch/i386/kernel/mpparse.c | 27
arch/i386/kernel/pci-dma.c | 141 ++++
arch/i386/kernel/process.c | 291 +++-----
arch/i386/kernel/quirks.c | 11
arch/i386/kernel/setup.c | 241 ++++++-
arch/i386/kernel/signal.c | 2
arch/i386/kernel/smp.c | 208 +++---
arch/i386/kernel/smpboot.c | 476 ++++++++++++--
arch/i386/kernel/time.c | 553 +++++++++++++++-
arch/i386/kernel/timers/Makefile | 16
arch/i386/kernel/timers/timer_tsc.c | 277 +-------
arch/i386/kernel/traps.c | 210 ++----
arch/i386/kernel/vsyscall.S | 4
arch/i386/mach-default/Makefile | 9
arch/i386/mm/Makefile | 22
arch/i386/mm/fault.c | 35 -
arch/i386/mm/highmem.c | 15
arch/i386/mm/hypervisor.c | 363 ++++++++++
arch/i386/mm/init.c | 131 +++
arch/i386/mm/ioremap.c | 312 ++++++---
arch/i386/mm/pgtable.c | 309 ++++++++-
arch/i386/pci/Makefile | 38 -
arch/i386/pci/irq.c | 5
41 files changed, 3673 insertions(+), 4398 deletions(-)
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/Kconfig linux-2.6-xen-sparse/arch/i386/Kconfig
--- pristine-linux-2.6.12/arch/i386/Kconfig 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/Kconfig 2005-07-28 13:17:07.000000000 -0700
@@ -3,7 +3,11 @@
# see Documentation/kbuild/kconfig-language.txt.
#
-mainmenu "Linux Kernel Configuration"
+menu "X86 Processor Configuration"
+
+config XENARCH
+ string
+ default i386
config X86
bool
@@ -33,119 +37,6 @@ config GENERIC_IOMAP
bool
default y
-source "init/Kconfig"
-
-menu "Processor type and features"
-
-choice
- prompt "Subarchitecture Type"
- default X86_PC
-
-config X86_PC
- bool "PC-compatible"
- help
- Choose this option if your computer is a standard PC or compatible.
-
-config X86_ELAN
- bool "AMD Elan"
- help
- Select this for an AMD Elan processor.
-
- Do not use this option for K6/Athlon/Opteron processors!
-
- If unsure, choose "PC-compatible" instead.
-
-config X86_VOYAGER
- bool "Voyager (NCR)"
- help
- Voyager is an MCA-based 32-way capable SMP architecture proprietary
- to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based.
-
- *** WARNING ***
-
- If you do not specifically know you have a Voyager based machine,
- say N here, otherwise the kernel you build will not be bootable.
-
-config X86_NUMAQ
- bool "NUMAQ (IBM/Sequent)"
- select DISCONTIGMEM
- select NUMA
- help
- This option is used for getting Linux to run on a (IBM/Sequent) NUMA
- multiquad box. This changes the way that processors are bootstrapped,
- and uses Clustered Logical APIC addressing mode instead of Flat Logical.
- You will need a new lynxer.elf file to flash your firmware with - send
- email to <Martin.Bligh@xxxxxxxxxx>.
-
-config X86_SUMMIT
- bool "Summit/EXA (IBM x440)"
- depends on SMP
- help
- This option is needed for IBM systems that use the Summit/EXA chipset.
- In particular, it is needed for the x440.
-
- If you don't have one of these computers, you should say N here.
-
-config X86_BIGSMP
- bool "Support for other sub-arch SMP systems with more than 8 CPUs"
- depends on SMP
- help
- This option is needed for the systems that have more than 8 CPUs
- and if the system is not of any sub-arch type above.
-
- If you don't have such a system, you should say N here.
-
-config X86_VISWS
- bool "SGI 320/540 (Visual Workstation)"
- help
- The SGI Visual Workstation series is an IA32-based workstation
- based on SGI systems chips with some legacy PC hardware attached.
-
- Say Y here to create a kernel to run on the SGI 320 or 540.
-
- A kernel compiled for the Visual Workstation will not run on PCs
- and vice versa. See <file:Documentation/sgi-visws.txt> for details.
-
-config X86_GENERICARCH
- bool "Generic architecture (Summit, bigsmp, ES7000, default)"
- depends on SMP
- help
- This option compiles in the Summit, bigsmp, ES7000, default subarchitectures.
- It is intended for a generic binary kernel.
-
-config X86_ES7000
- bool "Support for Unisys ES7000 IA32 series"
- depends on SMP
- help
- Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
- supposed to run on an IA32-based Unisys ES7000 system.
- Only choose this option if you have such a system, otherwise you
- should say N here.
-
-endchoice
-
-config ACPI_SRAT
- bool
- default y
- depends on NUMA && (X86_SUMMIT || X86_GENERICARCH)
-
-config X86_SUMMIT_NUMA
- bool
- default y
- depends on NUMA && (X86_SUMMIT || X86_GENERICARCH)
-
-config X86_CYCLONE_TIMER
- bool
- default y
- depends on X86_SUMMIT || X86_GENERICARCH
-
-config ES7000_CLUSTERED_APIC
- bool
- default y
- depends on SMP && X86_ES7000 && MPENTIUMIII
-
-if !X86_ELAN
-
choice
prompt "Processor family"
default M686
@@ -347,8 +238,6 @@ config X86_GENERIC
This is really intended for distributors who need more
generic optimizations.
-endif
-
#
# Define implied options from the CPU selection here
#
@@ -444,19 +333,21 @@ config X86_OOSTORE
default y
config HPET_TIMER
- bool "HPET Timer Support"
- help
- This enables the use of the HPET for the kernel's internal timer.
- HPET is the next generation timer replacing legacy 8254s.
- You can safely choose Y here. However, HPET will only be
- activated if the platform and the BIOS support this feature.
- Otherwise the 8254 will be used for timing services.
-
- Choose N to continue using the legacy 8254 timer.
+ bool
+ default n
+#config HPET_TIMER
+# bool "HPET Timer Support"
+# help
+# This enables the use of the HPET for the kernel's internal timer.
+# HPET is the next generation timer replacing legacy 8254s.
+# You can safely choose Y here. However, HPET will only be
+# activated if the platform and the BIOS support this feature.
+# Otherwise the 8254 will be used for timing services.
+#
+# Choose N to continue using the legacy 8254 timer.
config HPET_EMULATE_RTC
- bool "Provide RTC interrupt"
- depends on HPET_TIMER && RTC=y
+ def_bool HPET_TIMER && RTC=y
config SMP
bool "Symmetric multi-processing support"
@@ -487,6 +378,19 @@ config SMP
If you don't know what to do here, say N.
+config SMP_ALTERNATIVES
+ bool "SMP alternatives support (EXPERIMENTAL)"
+ depends on SMP && EXPERIMENTAL
+ help
+ Try to reduce the overhead of running an SMP kernel on a uniprocessor
+ host slightly by replacing certain key instruction sequences
+ according to whether we currently have more than one CPU available.
+ This should provide a noticeable boost to performance when
+ running SMP kernels on UP machines, and have negligible impact
+ when running on a true SMP host.
+
+ If unsure, say N.
+
config NR_CPUS
int "Maximum number of CPUs (2-255)"
range 2 255
@@ -534,122 +438,47 @@ config PREEMPT_BKL
Say Y here if you are building a kernel for a desktop system.
Say N if you are unsure.
-config X86_UP_APIC
- bool "Local APIC support on uniprocessors"
- depends on !SMP && !(X86_VISWS || X86_VOYAGER)
- help
- A local APIC (Advanced Programmable Interrupt Controller) is an
- integrated interrupt controller in the CPU. If you have a single-CPU
- system which has a processor with a local APIC, you can say Y here to
- enable and use it. If you say Y here even though your machine doesn't
- have a local APIC, then the kernel will still run with no slowdown at
- all. The local APIC supports CPU-generated self-interrupts (timer,
- performance counters), and the NMI watchdog which detects hard
- lockups.
-
-config X86_UP_IOAPIC
- bool "IO-APIC support on uniprocessors"
- depends on X86_UP_APIC
- help
- An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
- SMP-capable replacement for PC-style interrupt controllers. Most
- SMP systems and many recent uniprocessor systems have one.
-
- If you have a single-CPU system with an IO-APIC, you can say Y here
- to use it. If you say Y here even though your machine doesn't have
- an IO-APIC, then the kernel will still run with no slowdown at all.
-
-config X86_LOCAL_APIC
- bool
- depends on X86_UP_APIC || ((X86_VISWS || SMP) && !X86_VOYAGER)
- default y
-
-config X86_IO_APIC
- bool
- depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
- default y
-
-config X86_VISWS_APIC
- bool
- depends on X86_VISWS
- default y
-
-config X86_TSC
- bool
- depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ
- default y
-
-config X86_MCE
- bool "Machine Check Exception"
- depends on !X86_VOYAGER
- ---help---
- Machine Check Exception support allows the processor to notify the
- kernel if it detects a problem (e.g. overheating, component failure).
- The action the kernel takes depends on the severity of the problem,
- ranging from a warning message on the console, to halting the machine.
- Your processor must be a Pentium or newer to support this - check the
- flags in /proc/cpuinfo for mce. Note that some older Pentium systems
- have a design flaw which leads to false MCE events - hence MCE is
- disabled on all P5 processors, unless explicitly enabled with "mce"
- as a boot argument. Similarly, if MCE is built in and creates a
- problem on some new non-standard machine, you can boot with "nomce"
- to disable it. MCE support simply ignores non-MCE processors like
- the 386 and 486, so nearly everyone can say Y here.
-
-config X86_MCE_NONFATAL
- tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel
Pentium 4"
- depends on X86_MCE
- help
- Enabling this feature starts a timer that triggers every 5 seconds which
- will look at the machine check registers to see if anything happened.
- Non-fatal problems automatically get corrected (but still logged).
- Disable this if you don't want to see these messages.
- Seeing the messages this option prints out may be indicative of dying hardware,
- or out-of-spec (ie, overclocked) hardware.
- This option only does something on certain CPUs.
- (AMD Athlon/Duron and Intel Pentium 4)
-
-config X86_MCE_P4THERMAL
- bool "check for P4 thermal throttling interrupt."
- depends on X86_MCE && (X86_UP_APIC || SMP) && !X86_VISWS
- help
- Enabling this feature will cause a message to be printed when the P4
- enters thermal throttling.
-
-config TOSHIBA
- tristate "Toshiba Laptop support"
- ---help---
- This adds a driver to safely access the System Management Mode of
- the CPU on Toshiba portables with a genuine Toshiba BIOS. It does
- not work on models with a Phoenix BIOS. The System Management Mode
- is used to set the BIOS and power saving options on Toshiba portables.
-
- For information on utilities to make use of this driver see the
- Toshiba Linux utilities web site at:
- <http://www.buzzard.org.uk/toshiba/>.
-
- Say Y if you intend to run this kernel on a Toshiba portable.
- Say N otherwise.
-
-config I8K
- tristate "Dell laptop support"
- ---help---
- This adds a driver to safely access the System Management Mode
- of the CPU on the Dell Inspiron 8000. The System Management Mode
- is used to read cpu temperature and cooling fan status and to
- control the fans on the I8K portables.
-
- This driver has been tested only on the Inspiron 8000 but it may
- also work with other Dell laptops. You can force loading on other
- models by passing the parameter `force=1' to the module. Use at
- your own risk.
-
- For information on utilities to make use of this driver see the
- I8K Linux utilities web site at:
- <http://people.debian.org/~dz/i8k/>
-
- Say Y if you intend to run this kernel on a Dell Inspiron 8000.
- Say N otherwise.
+#config X86_TSC
+# bool
+# depends on (MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MGEODEGX1) && !X86_NUMAQ
+# default y
+
+#config X86_MCE
+# bool "Machine Check Exception"
+# depends on !X86_VOYAGER
+# ---help---
+# Machine Check Exception support allows the processor to notify the
+# kernel if it detects a problem (e.g. overheating, component failure).
+# The action the kernel takes depends on the severity of the problem,
+# ranging from a warning message on the console, to halting the machine.
+# Your processor must be a Pentium or newer to support this - check the
+# flags in /proc/cpuinfo for mce. Note that some older Pentium systems
+# have a design flaw which leads to false MCE events - hence MCE is
+# disabled on all P5 processors, unless explicitly enabled with "mce"
+# as a boot argument. Similarly, if MCE is built in and creates a
+# problem on some new non-standard machine, you can boot with "nomce"
+# to disable it. MCE support simply ignores non-MCE processors like
+# the 386 and 486, so nearly everyone can say Y here.
+
+#config X86_MCE_NONFATAL
+# tristate "Check for non-fatal errors on AMD Athlon/Duron / Intel
Pentium 4"
+# depends on X86_MCE
+# help
+# Enabling this feature starts a timer that triggers every 5 seconds which
+# will look at the machine check registers to see if anything happened.
+# Non-fatal problems automatically get corrected (but still logged).
+# Disable this if you don't want to see these messages.
+# Seeing the messages this option prints out may be indicative of dying hardware,
+# or out-of-spec (ie, overclocked) hardware.
+# This option only does something on certain CPUs.
+# (AMD Athlon/Duron and Intel Pentium 4)
+
+#config X86_MCE_P4THERMAL
+# bool "check for P4 thermal throttling interrupt."
+# depends on X86_MCE && (X86_UP_APIC || SMP)
+# help
+# Enabling this feature will cause a message to be printed when the P4
+# enters thermal throttling.
config X86_REBOOTFIXUPS
bool "Enable X86 board specific fixups for reboot"
@@ -671,6 +500,7 @@ config X86_REBOOTFIXUPS
config MICROCODE
tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
+ depends on XEN_PRIVILEGED_GUEST
---help---
If you say Y here and also to "/dev file system support" in the
'File systems' section, you will be able to update the microcode on
@@ -686,14 +516,14 @@ config MICROCODE
To compile this driver as a module, choose M here: the
module will be called microcode.
-config X86_MSR
- tristate "/dev/cpu/*/msr - Model-specific register support"
- help
- This device gives privileged processes access to the x86
- Model-Specific Registers (MSRs). It is a character device with
- major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
- MSR accesses are directed to a specific CPU on multi-processor
- systems.
+#config X86_MSR
+# tristate "/dev/cpu/*/msr - Model-specific register support"
+# help
+# This device gives privileged processes access to the x86
+# Model-Specific Registers (MSRs). It is a character device with
+# major 202 and minors 0 to 31 for /dev/cpu/0/msr to /dev/cpu/31/msr.
+# MSR accesses are directed to a specific CPU on multi-processor
+# systems.
config X86_CPUID
tristate "/dev/cpu/*/cpuid - CPU information support"
@@ -803,95 +633,57 @@ config NEED_NODE_MEMMAP_SIZE
depends on DISCONTIGMEM
default y
-config HIGHPTE
- bool "Allocate 3rd-level pagetables from highmem"
- depends on HIGHMEM4G || HIGHMEM64G
- help
- The VM uses one page table entry for each page of physical memory.
- For systems with a lot of RAM, this can be wasteful of precious
- low memory. Setting this option will put user-space page table
- entries in high memory.
-
-config MATH_EMULATION
- bool "Math emulation"
- ---help---
- Linux can emulate a math coprocessor (used for floating point
- operations) if you don't have one. 486DX and Pentium processors have
- a math coprocessor built in, 486SX and 386 do not, unless you added
- a 487DX or 387, respectively. (The messages during boot time can
- give you some hints here ["man dmesg"].) Everyone needs either a
- coprocessor or this emulation.
-
- If you don't have a math coprocessor, you need to say Y here; if you
- say Y here even though you have a coprocessor, the coprocessor will
- be used nevertheless. (This behavior can be changed with the kernel
- command line option "no387", which comes handy if your coprocessor
- is broken. Try "man bootparam" or see the documentation of your boot
- loader (lilo or loadlin) about how to pass options to the kernel at
- boot time.) This means that it is a good idea to say Y here if you
- intend to use this kernel on different machines.
-
- More information about the internals of the Linux math coprocessor
- emulation can be found in <file:arch/i386/math-emu/README>.
-
- If you are not sure, say Y; apart from resulting in a 66 KB bigger
- kernel, it won't hurt.
+#config HIGHPTE
+# bool "Allocate 3rd-level pagetables from highmem"
+# depends on HIGHMEM4G || HIGHMEM64G
+# help
+# The VM uses one page table entry for each page of physical memory.
+# For systems with a lot of RAM, this can be wasteful of precious
+# low memory. Setting this option will put user-space page table
+# entries in high memory.
config MTRR
- bool "MTRR (Memory Type Range Register) support"
- ---help---
- On Intel P6 family processors (Pentium Pro, Pentium II and later)
- the Memory Type Range Registers (MTRRs) may be used to control
- processor access to memory ranges. This is most useful if you have
- a video (VGA) card on a PCI or AGP bus. Enabling write-combining
- allows bus write transfers to be combined into a larger transfer
- before bursting over the PCI/AGP bus. This can increase performance
- of image write operations 2.5 times or more. Saying Y here creates a
- /proc/mtrr file which may be used to manipulate your processor's
- MTRRs. Typically the X server should use this.
-
- This code has a reasonably generic interface so that similar
- control registers on other processors can be easily supported
- as well:
-
- The Cyrix 6x86, 6x86MX and M II processors have Address Range
- Registers (ARRs) which provide a similar functionality to MTRRs. For
- these, the ARRs are used to emulate the MTRRs.
- The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
- MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing
- write-combining. All of these processors are supported by this code
- and it makes sense to say Y here if you have one of them.
-
- Saying Y here also fixes a problem with buggy SMP BIOSes which only
- set the MTRRs for the boot CPU and not for the secondary CPUs. This
- can lead to all sorts of problems, so it's good to say Y here.
-
- You can safely say Y even if your machine doesn't have MTRRs, you'll
- just add about 9 KB to your kernel.
-
- See <file:Documentation/mtrr.txt> for more information.
-
-config EFI
- bool "Boot from EFI support (EXPERIMENTAL)"
- depends on ACPI
- default n
- ---help---
- This enables the the kernel to boot on EFI platforms using
- system configuration information passed to it from the firmware.
- This also enables the kernel to use any EFI runtime services that are
- available (such as the EFI variable services).
-
- This option is only useful on systems that have EFI firmware
- and will result in a kernel image that is ~8k larger. In addition,
- you must use the latest ELILO loader available at
- <http://elilo.sourceforge.net> in order to take advantage of
- kernel initialization using EFI information (neither GRUB nor LILO know
- anything about EFI). However, even with this option, the resultant
- kernel should continue to boot on existing non-EFI platforms.
+ bool
+ depends on XEN_PRIVILEGED_GUEST
+ default y
+
+#config MTRR
+# bool "MTRR (Memory Type Range Register) support"
+# ---help---
+# On Intel P6 family processors (Pentium Pro, Pentium II and later)
+# the Memory Type Range Registers (MTRRs) may be used to control
+# processor access to memory ranges. This is most useful if you have
+# a video (VGA) card on a PCI or AGP bus. Enabling write-combining
+# allows bus write transfers to be combined into a larger transfer
+# before bursting over the PCI/AGP bus. This can increase performance
+# of image write operations 2.5 times or more. Saying Y here creates a
+# /proc/mtrr file which may be used to manipulate your processor's
+# MTRRs. Typically the X server should use this.
+#
+# This code has a reasonably generic interface so that similar
+# control registers on other processors can be easily supported
+# as well:
+#
+# The Cyrix 6x86, 6x86MX and M II processors have Address Range
+# Registers (ARRs) which provide a similar functionality to MTRRs. For
+# these, the ARRs are used to emulate the MTRRs.
+# The AMD K6-2 (stepping 8 and above) and K6-3 processors have two
+# MTRRs. The Centaur C6 (WinChip) has 8 MCRs, allowing
+# write-combining. All of these processors are supported by this code
+# and it makes sense to say Y here if you have one of them.
+#
+# Saying Y here also fixes a problem with buggy SMP BIOSes which only
+# set the MTRRs for the boot CPU and not for the secondary CPUs. This
+# can lead to all sorts of problems, so it's good to say Y here.
+#
+# You can safely say Y even if your machine doesn't have MTRRs, you'll
+# just add about 9 KB to your kernel.
+#
+# See <file:Documentation/mtrr.txt> for more information.
config IRQBALANCE
bool "Enable kernel irq balancing"
- depends on SMP && X86_IO_APIC
+ depends on SMP && X86_IO_APIC && !XEN
default y
help
The default yes will allow the kernel to do irq load balancing.
@@ -922,186 +714,59 @@ config REGPARM
generate incorrect output with certain kernel constructs when
-mregparm=3 is used.
-config SECCOMP
- bool "Enable seccomp to safely compute untrusted bytecode"
- depends on PROC_FS
+config X86_LOCAL_APIC
+ bool
+ depends on XEN_PRIVILEGED_GUEST && (X86_UP_APIC || ((X86_VISWS || SMP)
&& !X86_VOYAGER))
default y
- help
- This kernel feature is useful for number crunching applications
- that may need to compute untrusted bytecode during their
- execution. By using pipes or other transports made available to
- the process as file descriptors supporting the read/write
- syscalls, it's possible to isolate those applications in
- their own address space using seccomp. Once seccomp is
- enabled via /proc/<pid>/seccomp, it cannot be disabled
- and the task is only allowed to execute a few safe syscalls
- defined by each seccomp mode.
-
- If unsure, say Y. Only embedded should say N here.
-
-endmenu
-
-
-menu "Power management options (ACPI, APM)"
- depends on !X86_VOYAGER
-
-source kernel/power/Kconfig
-source "drivers/acpi/Kconfig"
+config X86_IO_APIC
+ bool
+ depends on XEN_PRIVILEGED_GUEST && (X86_UP_IOAPIC || (SMP &&
!(X86_VISWS || X86_VOYAGER)))
+ default y
-menu "APM (Advanced Power Management) BIOS Support"
-depends on PM && !X86_VISWS
+config X86_VISWS_APIC
+ bool
+ depends on X86_VISWS
+ default y
-config APM
- tristate "APM (Advanced Power Management) BIOS support"
- depends on PM
+config HOTPLUG_CPU
+ bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
+ depends on SMP && HOTPLUG && EXPERIMENTAL
---help---
- APM is a BIOS specification for saving power using several different
- techniques. This is mostly useful for battery powered laptops with
- APM compliant BIOSes. If you say Y here, the system time will be
- reset after a RESUME operation, the /proc/apm device will provide
- battery status information, and user-space programs will receive
- notification of APM "events" (e.g. battery status change).
-
- If you select "Y" here, you can disable actual use of the APM
- BIOS by passing the "apm=off" option to the kernel at boot time.
-
- Note that the APM support is almost completely disabled for
- machines with more than one CPU.
-
- In order to use APM, you will need supporting software. For location
- and more information, read <file:Documentation/pm.txt> and the
- Battery Powered Linux mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>.
+ Say Y here to experiment with turning CPUs off and on. CPUs
+ can be controlled through /sys/devices/system/cpu.
- This driver does not spin down disk drives (see the hdparm(8)
- manpage ("man 8 hdparm") for that), and it doesn't turn off
- VESA-compliant "green" monitors.
-
- This driver does not support the TI 4000M TravelMate and the ACER
- 486/DX4/75 because they don't have compliant BIOSes. Many "green"
- desktop machines also don't have compliant BIOSes, and this driver
- may cause those machines to panic during the boot phase.
-
- Generally, if you don't have a battery in your machine, there isn't
- much point in using this driver and you should say N. If you get
- random kernel OOPSes or reboots that don't seem to be related to
- anything, try disabling/enabling this option (or disabling/enabling
- APM in your BIOS).
-
- Some other things you should try when experiencing seemingly random,
- "weird" problems:
-
- 1) make sure that you have enough swap space and that it is
- enabled.
- 2) pass the "no-hlt" option to the kernel
- 3) switch on floating point emulation in the kernel and pass
- the "no387" option to the kernel
- 4) pass the "floppy=nodma" option to the kernel
- 5) pass the "mem=4M" option to the kernel (thereby disabling
- all but the first 4 MB of RAM)
- 6) make sure that the CPU is not over clocked.
- 7) read the sig11 FAQ at <http://www.bitwizard.nl/sig11/>
- 8) disable the cache from your BIOS settings
- 9) install a fan for the video card or exchange video RAM
- 10) install a better fan for the CPU
- 11) exchange RAM chips
- 12) exchange the motherboard.
+ Say N.
- To compile this driver as a module, choose M here: the
- module will be called apm.
-config APM_IGNORE_USER_SUSPEND
- bool "Ignore USER SUSPEND"
- depends on APM
- help
- This option will ignore USER SUSPEND requests. On machines with a
- compliant APM BIOS, you want to say N. However, on the NEC Versa M
- series notebooks, it is necessary to say Y because of a BIOS bug.
-
-config APM_DO_ENABLE
- bool "Enable PM at boot time"
- depends on APM
- ---help---
- Enable APM features at boot time. From page 36 of the APM BIOS
- specification: "When disabled, the APM BIOS does not automatically
- power manage devices, enter the Standby State, enter the Suspend
- State, or take power saving steps in response to CPU Idle calls."
- This driver will make CPU Idle calls when Linux is idle (unless this
- feature is turned off -- see "Do CPU IDLE calls", below). This
- should always save battery power, but more complicated APM features
- will be dependent on your BIOS implementation. You may need to turn
- this option off if your computer hangs at boot time when using APM
- support, or if it beeps continuously instead of suspending. Turn
- this off if you have a NEC UltraLite Versa 33/C or a Toshiba
- T400CDT. This is off by default since most machines do fine without
- this feature.
-
-config APM_CPU_IDLE
- bool "Make CPU Idle calls when idle"
- depends on APM
- help
- Enable calls to APM CPU Idle/CPU Busy inside the kernel's idle loop.
- On some machines, this can activate improved power savings, such as
- a slowed CPU clock rate, when the machine is idle. These idle calls
- are made after the idle loop has run for some length of time (e.g.,
- 333 mS). On some machines, this will cause a hang at boot time or
- whenever the CPU becomes idle. (On machines with more than one CPU,
- this option does nothing.)
-
-config APM_DISPLAY_BLANK
- bool "Enable console blanking using APM"
- depends on APM
- help
- Enable console blanking using the APM. Some laptops can use this to
- turn off the LCD backlight when the screen blanker of the Linux
- virtual console blanks the screen. Note that this is only used by
- the virtual console screen blanker, and won't turn off the backlight
- when using the X Window system. This also doesn't have anything to
- do with your VESA-compliant power-saving monitor. Further, this
- option doesn't work for all laptops -- it might not turn off your
- backlight at all, or it might print a lot of errors to the console,
- especially if you are using gpm.
-
-config APM_RTC_IS_GMT
- bool "RTC stores time in GMT"
- depends on APM
- help
- Say Y here if your RTC (Real Time Clock a.k.a. hardware clock)
- stores the time in GMT (Greenwich Mean Time). Say N if your RTC
- stores localtime.
-
- It is in fact recommended to store GMT in your RTC, because then you
- don't have to worry about daylight savings time changes. The only
- reason not to use GMT in your RTC is if you also run a broken OS
- that doesn't understand GMT.
-
-config APM_ALLOW_INTS
- bool "Allow interrupts during APM BIOS calls"
- depends on APM
- help
- Normally we disable external interrupts while we are making calls to
- the APM BIOS as a measure to lessen the effects of a badly behaving
- BIOS implementation. The BIOS should reenable interrupts if it
- needs to. Unfortunately, some BIOSes do not -- especially those in
- many of the newer IBM Thinkpads. If you experience hangs when you
- suspend, try setting this to Y. Otherwise, say N.
-
-config APM_REAL_MODE_POWER_OFF
- bool "Use real mode APM BIOS call to power off"
- depends on APM
- help
- Use real mode APM BIOS calls to switch off the computer. This is
- a work-around for a number of buggy BIOSes. Switch this option on if
- your computer crashes instead of powering off properly.
+if XEN_PHYSDEV_ACCESS
-endmenu
+menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)"
-source "arch/i386/kernel/cpu/cpufreq/Kconfig"
+config X86_UP_APIC
+ bool "Local APIC support on uniprocessors"
+ depends on !SMP && !(X86_VISWS || X86_VOYAGER)
+ help
+ A local APIC (Advanced Programmable Interrupt Controller) is an
+ integrated interrupt controller in the CPU. If you have a single-CPU
+ system which has a processor with a local APIC, you can say Y here to
+ enable and use it. If you say Y here even though your machine doesn't
+ have a local APIC, then the kernel will still run with no slowdown at
+ all. The local APIC supports CPU-generated self-interrupts (timer,
+ performance counters), and the NMI watchdog which detects hard
+ lockups.
-endmenu
+config X86_UP_IOAPIC
+ bool "IO-APIC support on uniprocessors"
+ depends on X86_UP_APIC
+ help
+ An IO-APIC (I/O Advanced Programmable Interrupt Controller) is an
+ SMP-capable replacement for PC-style interrupt controllers. Most
+ SMP systems and many recent uniprocessor systems have one.
-menu "Bus options (PCI, PCMCIA, EISA, MCA, ISA)"
+ If you have a single-CPU system with an IO-APIC, you can say Y here
+ to use it. If you say Y here even though your machine doesn't have
+ an IO-APIC, then the kernel will still run with no slowdown at all.
config PCI
bool "PCI support" if !X86_VISWS
@@ -1232,25 +897,7 @@ source "drivers/pci/hotplug/Kconfig"
endmenu
-menu "Executable file formats"
-
-source "fs/Kconfig.binfmt"
-
-endmenu
-
-source "drivers/Kconfig"
-
-source "fs/Kconfig"
-
-source "arch/i386/oprofile/Kconfig"
-
-source "arch/i386/Kconfig.debug"
-
-source "security/Kconfig"
-
-source "crypto/Kconfig"
-
-source "lib/Kconfig"
+endif
#
# Use the generic interrupt handling code in kernel/irq/:
@@ -1268,10 +915,10 @@ config X86_SMP
depends on SMP && !X86_VOYAGER
default y
-config X86_HT
- bool
- depends on SMP && !(X86_VISWS || X86_VOYAGER)
- default y
+#config X86_HT
+# bool
+# depends on SMP && !(X86_VISWS || X86_VOYAGER)
+# default y
config X86_BIOS_REBOOT
bool
@@ -1287,3 +934,22 @@ config PC
bool
depends on X86 && !EMBEDDED
default y
+
+config SECCOMP
+ bool "Enable seccomp to safely compute untrusted bytecode"
+ depends on PROC_FS
+ default y
+ help
+ This kernel feature is useful for number crunching applications
+ that may need to compute untrusted bytecode during their
+ execution. By using pipes or other transports made available to
+ the process as file descriptors supporting the read/write
+ syscalls, it's possible to isolate those applications in
+ their own address space using seccomp. Once seccomp is
+ enabled via /proc/<pid>/seccomp, it cannot be disabled
+ and the task is only allowed to execute a few safe syscalls
+ defined by each seccomp mode.
+
+ If unsure, say Y. Only embedded should say N here.
+
+endmenu
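
One Kconfig addition worth flagging is SMP_ALTERNATIVES, which patches
SMP-only instruction sequences down to cheaper UP equivalents when only
one CPU is online. Roughly, each patch site records an address plus the
two byte sequences, and every site is rewritten in place when the online
CPU count changes. A minimal sketch of that idea in C (the names here
are illustrative only, not the symbols the sparse tree actually uses):

	#include <string.h>

	/* One patchable site: where it lives and what to write there. */
	struct smp_alt_site {
		unsigned char *addr;	/* start of the patchable sequence */
		unsigned char smp[8];	/* bytes used when >1 CPU is online */
		unsigned char up[8];	/* cheaper uniprocessor bytes */
		unsigned int len;	/* sequence length, <= 8 here */
	};

	/* Rewrite every site for the current mode (smp != 0 means more
	 * than one CPU is online). */
	static void apply_smp_alternatives(struct smp_alt_site *site,
					   unsigned int count, int smp)
	{
		unsigned int i;

		for (i = 0; i < count; i++)
			memcpy(site[i].addr,
			       smp ? site[i].smp : site[i].up,
			       site[i].len);
	}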
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/acpi/boot.c linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot.c
--- pristine-linux-2.6.12/arch/i386/kernel/acpi/boot.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/acpi/boot.c 2005-07-28 13:17:07.000000000 -0700
@@ -36,6 +36,11 @@
#include <asm/io.h>
#include <asm/irq.h>
#include <asm/mpspec.h>
+#ifdef CONFIG_XEN
+#include <asm/fixmap.h>
+#endif
+
+void (*pm_power_off)(void) = NULL;
#ifdef CONFIG_X86_64
@@ -100,7 +105,7 @@ EXPORT_SYMBOL(x86_acpiid_to_apicid);
*/
enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
-#ifdef CONFIG_X86_64
+#if defined(CONFIG_X86_64) && !defined(CONFIG_XEN)
/* rely on all ACPI tables being in the direct mapping */
char *__acpi_map_table(unsigned long phys_addr, unsigned long size)
@@ -133,8 +138,10 @@ char *__acpi_map_table(unsigned long phy
unsigned long base, offset, mapped_size;
int idx;
+#ifndef CONFIG_XEN
if (phys + size < 8*1024*1024)
return __va(phys);
+#endif
offset = phys & (PAGE_SIZE - 1);
mapped_size = PAGE_SIZE - offset;
@@ -462,18 +469,6 @@ unsigned int acpi_register_gsi(u32 gsi,
unsigned int irq;
unsigned int plat_gsi = gsi;
-#ifdef CONFIG_PCI
- /*
- * Make sure all (legacy) PCI IRQs are set as level-triggered.
- */
- if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
- extern void eisa_set_level_irq(unsigned int irq);
-
- if (edge_level == ACPI_LEVEL_SENSITIVE)
- eisa_set_level_irq(gsi);
- }
-#endif
-
#ifdef CONFIG_X86_IO_APIC
if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
plat_gsi = mp_register_gsi(gsi, edge_level, active_high_low);
@@ -513,13 +508,14 @@ acpi_scan_rsdp (
{
unsigned long offset = 0;
unsigned long sig_len = sizeof("RSD PTR ") - 1;
+ unsigned long vstart = (unsigned long)isa_bus_to_virt(start);
/*
* Scan all 16-byte boundaries of the physical memory region for the
* RSDP signature.
*/
for (offset = 0; offset < length; offset += 16) {
- if (strncmp((char *) (start + offset), "RSD PTR ", sig_len))
+ if (strncmp((char *) (vstart + offset), "RSD PTR ", sig_len))
continue;
return (start + offset);
}
@@ -652,6 +648,8 @@ acpi_find_rsdp (void)
if (!rsdp_phys)
rsdp_phys = acpi_scan_rsdp (0xE0000, 0x20000);
+ set_fixmap(FIX_ACPI_RSDP_PAGE, rsdp_phys);
+
return rsdp_phys;
}
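
The acpi_scan_rsdp() hunk above is the subtle part of this file: the
signature scan now compares through isa_bus_to_virt() instead of
dereferencing the physical address directly (presumably because a Xen
guest cannot assume low physical memory is mapped 1:1), while the
function still returns the physical address of the hit. Condensed from
the hunk, the fixed loop amounts to:

	/* Scan a physical range for the ACPI RSDP signature.  Compare
	 * through a virtual mapping of the ISA region, but return the
	 * physical address.  The RSDP always sits on a 16-byte
	 * boundary. */
	static unsigned long scan_rsdp(unsigned long start,
				       unsigned long length)
	{
		unsigned long sig_len = sizeof("RSD PTR ") - 1;
		unsigned long vstart =
			(unsigned long)isa_bus_to_virt(start);
		unsigned long offset;

		for (offset = 0; offset < length; offset += 16)
			if (!strncmp((char *)(vstart + offset),
				     "RSD PTR ", sig_len))
				return start + offset;
		return 0;
	}

acpi_find_rsdp() then publishes the result through
set_fixmap(FIX_ACPI_RSDP_PAGE, rsdp_phys) so it stays reachable later.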
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/acpi/Makefile linux-2.6-xen-sparse/arch/i386/kernel/acpi/Makefile
--- pristine-linux-2.6.12/arch/i386/kernel/acpi/Makefile 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/acpi/Makefile 2005-07-28 13:17:07.000000000 -0700
@@ -1,4 +1,13 @@
-obj-$(CONFIG_ACPI_BOOT) := boot.o
-obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o
-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
+obj-$(CONFIG_ACPI_BOOT) := boot.o
+c-obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o
+c-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+ @ln -fsn $(srctree)/arch/i386/kernel/acpi/$(notdir $@) $@
+
+obj-y += $(c-obj-y) $(s-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
+clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link))
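
The Makefile change above is the pattern repeated across the sparse
tree: sources that build unmodified move from obj-y to c-obj-y and get
symlinked back from the pristine tree at build time, so the sparse tree
only carries files that actually changed. The apic.c diff that follows
is the opposite extreme: with Xen owning the local APIC, roughly 1200
lines of native setup, suspend/resume, and timer-calibration code go
away, leaving little beyond a stub like this (lifted from the first
hunk below):

	/* With no real local APIC to probe under Xen, the broadcast ID
	 * is pinned to the integrated-APIC value; the deleted version
	 * read the APIC version register to choose between 0xff and
	 * the 82489DX's 0xf. */
	int get_physical_broadcast(void)
	{
		return 0xff;
	}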
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/apic.c linux-2.6-xen-sparse/arch/i386/kernel/apic.c
--- pristine-linux-2.6.12/arch/i386/kernel/apic.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/apic.c 2005-07-28 13:17:07.000000000 -0700
@@ -44,8 +44,10 @@
*/
int apic_verbosity;
-
-static void apic_pm_activate(void);
+int get_physical_broadcast(void)
+{
+ return 0xff;
+}
/*
* 'what should we do if we get a hw irq event on an illegal vector'.
@@ -65,1212 +67,17 @@ void ack_bad_irq(unsigned int irq)
ack_APIC_irq();
}
-void __init apic_intr_init(void)
-{
-#ifdef CONFIG_SMP
- smp_intr_init();
-#endif
- /* self generated IPI for local APIC timer */
- set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
- /* IPI vectors for APIC spurious and error interrupts */
- set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
- set_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
- /* thermal monitor LVT interrupt */
-#ifdef CONFIG_X86_MCE_P4THERMAL
- set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-}
-
-/* Using APIC to generate smp_local_timer_interrupt? */
-int using_apic_timer = 0;
-
-static DEFINE_PER_CPU(int, prof_multiplier) = 1;
-static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
-static DEFINE_PER_CPU(int, prof_counter) = 1;
-
-static int enabled_via_apicbase;
-
-void enable_NMI_through_LVT0 (void * dummy)
-{
- unsigned int v, ver;
-
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
- v = APIC_DM_NMI; /* unmask and set to NMI */
- if (!APIC_INTEGRATED(ver)) /* 82489DX */
- v |= APIC_LVT_LEVEL_TRIGGER;
- apic_write_around(APIC_LVT0, v);
-}
-
-int get_physical_broadcast(void)
-{
- unsigned int lvr, version;
- lvr = apic_read(APIC_LVR);
- version = GET_APIC_VERSION(lvr);
- if (!APIC_INTEGRATED(version) || version >= 0x14)
- return 0xff;
- else
- return 0xf;
-}
-
-int get_maxlvt(void)
-{
- unsigned int v, ver, maxlvt;
-
- v = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(v);
- /* 82489DXs do not report # of LVT entries. */
- maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2;
- return maxlvt;
-}
-
-void clear_local_APIC(void)
-{
- int maxlvt;
- unsigned long v;
-
- maxlvt = get_maxlvt();
-
- /*
- * Masking an LVT entry on a P6 can trigger a local APIC error
- * if the vector is zero. Mask LVTERR first to prevent this.
- */
- if (maxlvt >= 3) {
- v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
- apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
- }
- /*
- * Careful: we have to set masks only first to deassert
- * any level-triggered sources.
- */
- v = apic_read(APIC_LVTT);
- apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
- v = apic_read(APIC_LVT0);
- apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
- v = apic_read(APIC_LVT1);
- apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
- if (maxlvt >= 4) {
- v = apic_read(APIC_LVTPC);
- apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
- }
-
-/* lets not touch this if we didn't frob it */
-#ifdef CONFIG_X86_MCE_P4THERMAL
- if (maxlvt >= 5) {
- v = apic_read(APIC_LVTTHMR);
- apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
- }
-#endif
- /*
- * Clean APIC state for other OSs:
- */
- apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
- apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
- apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
- if (maxlvt >= 3)
- apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
- if (maxlvt >= 4)
- apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
-
-#ifdef CONFIG_X86_MCE_P4THERMAL
- if (maxlvt >= 5)
- apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
-#endif
- v = GET_APIC_VERSION(apic_read(APIC_LVR));
- if (APIC_INTEGRATED(v)) { /* !82489DX */
- if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP.
*/
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- }
-}
-
-void __init connect_bsp_APIC(void)
-{
- if (pic_mode) {
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
- /*
- * PIC mode, enable APIC mode in the IMCR, i.e.
- * connect BSP's local APIC to INT and NMI lines.
- */
- apic_printk(APIC_VERBOSE, "leaving PIC mode, "
- "enabling APIC mode.\n");
- outb(0x70, 0x22);
- outb(0x01, 0x23);
- }
- enable_apic_mode();
-}
-
-void disconnect_bsp_APIC(void)
-{
- if (pic_mode) {
- /*
- * Put the board back into PIC mode (has an effect
- * only on certain older boards). Note that APIC
- * interrupts, including IPIs, won't work beyond
- * this point! The only exception are INIT IPIs.
- */
- apic_printk(APIC_VERBOSE, "disabling APIC mode, "
- "entering PIC mode.\n");
- outb(0x70, 0x22);
- outb(0x00, 0x23);
- }
-}
-
-void disable_local_APIC(void)
-{
- unsigned long value;
-
- clear_local_APIC();
-
- /*
- * Disable APIC (implies clearing of registers
- * for 82489DX!).
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_SPIV_APIC_ENABLED;
- apic_write_around(APIC_SPIV, value);
-
- if (enabled_via_apicbase) {
- unsigned int l, h;
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_ENABLE;
- wrmsr(MSR_IA32_APICBASE, l, h);
- }
-}
-
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
- unsigned int reg0, reg1;
-
- /*
- * The version register is read-only in a real APIC.
- */
- reg0 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
- apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
- reg1 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
- /*
- * The two version reads above should print the same
- * numbers. If the second one is different, then we
- * poke at a non-APIC.
- */
- if (reg1 != reg0)
- return 0;
-
- /*
- * Check if the version looks reasonably.
- */
- reg1 = GET_APIC_VERSION(reg0);
- if (reg1 == 0x00 || reg1 == 0xff)
- return 0;
- reg1 = get_maxlvt();
- if (reg1 < 0x02 || reg1 == 0xff)
- return 0;
-
- /*
- * The ID register is read/write in a real APIC.
- */
- reg0 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
-
- /*
- * The next two are just to see if we have sane values.
- * They're only really relevant if we're in Virtual Wire
- * compatibility mode, but most boxes are anymore.
- */
- reg0 = apic_read(APIC_LVT0);
- apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
- reg1 = apic_read(APIC_LVT1);
- apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
- return 1;
-}
-
-void __init sync_Arb_IDs(void)
-{
- /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
- unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
- if (ver >= 0x14) /* P4 or higher */
- return;
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
- apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
- | APIC_DM_INIT);
-}
-
-extern void __error_in_apic_c (void);
-
-/*
- * An initial setup of the virtual wire mode.
- */
-void __init init_bsp_APIC(void)
-{
- unsigned long value, ver;
-
- /*
- * Don't do the setup now if we have a SMP BIOS as the
- * through-I/O-APIC virtual wire mode might be active.
- */
- if (smp_found_config || !cpu_has_apic)
- return;
-
- value = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(value);
-
- /*
- * Do not trust the local APIC being empty at bootup.
- */
- clear_local_APIC();
-
- /*
- * Enable APIC.
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- value |= APIC_SPIV_APIC_ENABLED;
-
- /* This bit is reserved on P4/Xeon and should be cleared */
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15))
- value &= ~APIC_SPIV_FOCUS_DISABLED;
- else
- value |= APIC_SPIV_FOCUS_DISABLED;
- value |= SPURIOUS_APIC_VECTOR;
- apic_write_around(APIC_SPIV, value);
-
- /*
- * Set up the virtual wire mode.
- */
- apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
- value = APIC_DM_NMI;
- if (!APIC_INTEGRATED(ver)) /* 82489DX */
- value |= APIC_LVT_LEVEL_TRIGGER;
- apic_write_around(APIC_LVT1, value);
-}
-
-void __init setup_local_APIC (void)
-{
- unsigned long oldvalue, value, ver, maxlvt;
-
- /* Pound the ESR really hard over the head with a big hammer - mbligh */
- if (esr_disable) {
- apic_write(APIC_ESR, 0);
- apic_write(APIC_ESR, 0);
- apic_write(APIC_ESR, 0);
- apic_write(APIC_ESR, 0);
- }
-
- value = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(value);
-
- if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f)
- __error_in_apic_c();
-
- /*
- * Double-check whether this APIC is really registered.
- */
- if (!apic_id_registered())
- BUG();
-
- /*
- * Intel recommends to set DFR, LDR and TPR before enabling
- * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
- * document number 292116). So here it goes...
- */
- init_apic_ldr();
-
- /*
- * Set Task Priority to 'accept all'. We never change this
- * later on.
- */
- value = apic_read(APIC_TASKPRI);
- value &= ~APIC_TPRI_MASK;
- apic_write_around(APIC_TASKPRI, value);
-
- /*
- * Now that we are all set up, enable the APIC
- */
- value = apic_read(APIC_SPIV);
- value &= ~APIC_VECTOR_MASK;
- /*
- * Enable APIC
- */
- value |= APIC_SPIV_APIC_ENABLED;
-
- /*
- * Some unknown Intel IO/APIC (or APIC) errata is biting us with
- * certain networking cards. If high frequency interrupts are
- * happening on a particular IOAPIC pin, plus the IOAPIC routing
- * entry is masked/unmasked at a high rate as well then sooner or
- * later IOAPIC line gets 'stuck', no more interrupts are received
- * from the device. If focus CPU is disabled then the hang goes
- * away, oh well :-(
- *
- * [ This bug can be reproduced easily with a level-triggered
- * PCI Ne2000 networking cards and PII/PIII processors, dual
- * BX chipset. ]
- */
- /*
- * Actually disabling the focus CPU check just makes the hang less
- * frequent as it makes the interrupt distributon model be more
- * like LRU than MRU (the short-term load is more even across CPUs).
- * See also the comment in end_level_ioapic_irq(). --macro
- */
-#if 1
- /* Enable focus processor (bit==0) */
- value &= ~APIC_SPIV_FOCUS_DISABLED;
-#else
- /* Disable focus processor (bit==1) */
- value |= APIC_SPIV_FOCUS_DISABLED;
-#endif
- /*
- * Set spurious IRQ vector
- */
- value |= SPURIOUS_APIC_VECTOR;
- apic_write_around(APIC_SPIV, value);
-
- /*
- * Set up LVT0, LVT1:
- *
- * set up through-local-APIC on the BP's LINT0. This is not
- * strictly necessery in pure symmetric-IO mode, but sometimes
- * we delegate interrupts to the 8259A.
- */
- /*
- * TODO: set up through-local-APIC from through-I/O-APIC? --macro
- */
- value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && (pic_mode || !value)) {
- value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
- smp_processor_id());
- } else {
- value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
- smp_processor_id());
- }
- apic_write_around(APIC_LVT0, value);
-
- /*
- * only the BP should see the LINT1 NMI signal, obviously.
- */
- if (!smp_processor_id())
- value = APIC_DM_NMI;
- else
- value = APIC_DM_NMI | APIC_LVT_MASKED;
- if (!APIC_INTEGRATED(ver)) /* 82489DX */
- value |= APIC_LVT_LEVEL_TRIGGER;
- apic_write_around(APIC_LVT1, value);
-
- if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */
- maxlvt = get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- oldvalue = apic_read(APIC_ESR);
-
- value = ERROR_APIC_VECTOR; // enables sending errors
- apic_write_around(APIC_LVTERR, value);
- /*
- * spec says clear errors after enabling vector.
- */
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
- value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08lx after: 0x%08lx\n",
- oldvalue, value);
- } else {
- if (esr_disable)
- /*
- * Something untraceble is creating bad interrupts on
- * secondary quads ... for the moment, just leave the
- * ESR disabled - we can't do anything useful with the
- * errors anyway - mbligh
- */
- printk("Leaving ESR disabled.\n");
- else
- printk("No ESR for 82489DX.\n");
- }
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- setup_apic_nmi_watchdog();
- apic_pm_activate();
-}
-
-/*
- * If Linux enabled the LAPIC against the BIOS default
- * disable it down before re-entering the BIOS on shutdown.
- * Otherwise the BIOS may get confused and not power-off.
- */
-void lapic_shutdown(void)
-{
- if (!cpu_has_apic || !enabled_via_apicbase)
- return;
-
- local_irq_disable();
- disable_local_APIC();
- local_irq_enable();
-}
-
-#ifdef CONFIG_PM
-
-static struct {
- int active;
- /* r/w apic fields */
- unsigned int apic_id;
- unsigned int apic_taskpri;
- unsigned int apic_ldr;
- unsigned int apic_dfr;
- unsigned int apic_spiv;
- unsigned int apic_lvtt;
- unsigned int apic_lvtpc;
- unsigned int apic_lvt0;
- unsigned int apic_lvt1;
- unsigned int apic_lvterr;
- unsigned int apic_tmict;
- unsigned int apic_tdcr;
- unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
- unsigned long flags;
-
- if (!apic_pm_state.active)
- return 0;
-
- apic_pm_state.apic_id = apic_read(APIC_ID);
- apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
- apic_pm_state.apic_ldr = apic_read(APIC_LDR);
- apic_pm_state.apic_dfr = apic_read(APIC_DFR);
- apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
- apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
- apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
- apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
- apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
- apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
- apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
- apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
- apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-
- local_irq_save(flags);
- disable_local_APIC();
- local_irq_restore(flags);
- return 0;
-}
-
-static int lapic_resume(struct sys_device *dev)
-{
- unsigned int l, h;
- unsigned long flags;
-
- if (!apic_pm_state.active)
- return 0;
-
- local_irq_save(flags);
-
- /*
- * Make sure the APICBASE points to the right address
- *
- * FIXME! This will be wrong if we ever support suspend on
- * SMP! We'll need to do this as part of the CPU restore!
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
- wrmsr(MSR_IA32_APICBASE, l, h);
-
- apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
- apic_write(APIC_ID, apic_pm_state.apic_id);
- apic_write(APIC_DFR, apic_pm_state.apic_dfr);
- apic_write(APIC_LDR, apic_pm_state.apic_ldr);
- apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
- apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
- apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
- apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
- apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
- apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
- apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
- apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
- apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- local_irq_restore(flags);
- return 0;
-}
-
-/*
- * This device has no shutdown method - fully functioning local APICs
- * are needed on every CPU up until machine_halt/restart/poweroff.
- */
-
-static struct sysdev_class lapic_sysclass = {
- set_kset_name("lapic"),
- .resume = lapic_resume,
- .suspend = lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
- .id = 0,
- .cls = &lapic_sysclass,
-};
-
-static void __init apic_pm_activate(void)
-{
- apic_pm_state.active = 1;
-}
-
-static int __init init_lapic_sysfs(void)
-{
- int error;
-
- if (!cpu_has_apic)
- return 0;
- /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-
- error = sysdev_class_register(&lapic_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic);
- return error;
-}
-device_initcall(init_lapic_sysfs);
-
-#else /* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif /* CONFIG_PM */
-
-/*
- * Detect and enable local APICs on non-SMP boards.
- * Original code written by Keir Fraser.
- */
-
-/*
- * Knob to control our willingness to enable the local APIC.
- */
-int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
-
-static int __init lapic_disable(char *str)
-{
- enable_local_apic = -1;
- clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
- return 0;
-}
-__setup("nolapic", lapic_disable);
-
-static int __init lapic_enable(char *str)
-{
- enable_local_apic = 1;
- return 0;
-}
-__setup("lapic", lapic_enable);
-
-static int __init apic_set_verbosity(char *str)
-{
- if (strcmp("debug", str) == 0)
- apic_verbosity = APIC_DEBUG;
- else if (strcmp("verbose", str) == 0)
- apic_verbosity = APIC_VERBOSE;
- else
- printk(KERN_WARNING "APIC Verbosity level %s not recognised"
- " use apic=verbose or apic=debug", str);
-
- return 0;
-}
-
-__setup("apic=", apic_set_verbosity);
-
-static int __init detect_init_APIC (void)
-{
- u32 h, l, features;
- extern void get_cpu_vendor(struct cpuinfo_x86*);
-
- /* Disabled by kernel option? */
- if (enable_local_apic < 0)
- return -1;
-
- /* Workaround for us being called before identify_cpu(). */
- get_cpu_vendor(&boot_cpu_data);
-
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) ||
- (boot_cpu_data.x86 == 15))
- break;
- goto no_apic;
- case X86_VENDOR_INTEL:
- if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 ||
- (boot_cpu_data.x86 == 5 && cpu_has_apic))
- break;
- goto no_apic;
- default:
- goto no_apic;
- }
-
- if (!cpu_has_apic) {
- /*
- * Over-ride BIOS and try to enable the local
- * APIC only if "lapic" specified.
- */
- if (enable_local_apic <= 0) {
- printk("Local APIC disabled by BIOS -- "
- "you can enable it with \"lapic\"\n");
- return -1;
- }
- /*
- * Some BIOSes disable the local APIC in the
- * APIC_BASE MSR. This can only be done in
- * software for Intel P6 or later and AMD K7
- * (Model > 1) or later.
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- printk("Local APIC disabled by BIOS -- reenabling.\n");
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
- wrmsr(MSR_IA32_APICBASE, l, h);
- enabled_via_apicbase = 1;
- }
- }
- /*
- * The APIC feature bit should now be enabled
- * in `cpuid'
- */
- features = cpuid_edx(1);
- if (!(features & (1 << X86_FEATURE_APIC))) {
- printk("Could not enable APIC!\n");
- return -1;
- }
- set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /* The BIOS may have set up the APIC at some other address */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
- if (nmi_watchdog != NMI_NONE)
- nmi_watchdog = NMI_LOCAL_APIC;
-
- printk("Found and enabled local APIC!\n");
-
- apic_pm_activate();
-
- return 0;
-
-no_apic:
- printk("No local APIC present or hardware disabled\n");
- return -1;
-}
-
-void __init init_apic_mappings(void)
-{
- unsigned long apic_phys;
-
- /*
- * If no local APIC can be found then set up a fake all
- * zeroes page to simulate the local APIC and another
- * one for the IO-APIC.
- */
- if (!smp_found_config && detect_init_APIC()) {
- apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
- apic_phys = __pa(apic_phys);
- } else
- apic_phys = mp_lapic_addr;
-
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
- printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
- apic_phys);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-
-#ifdef CONFIG_X86_IO_APIC
- {
- unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
- int i;
-
- for (i = 0; i < nr_ioapics; i++) {
- if (smp_found_config) {
- ioapic_phys = mp_ioapics[i].mpc_apicaddr;
- if (!ioapic_phys) {
- printk(KERN_ERR
- "WARNING: bogus zero IO-APIC "
- "address found in MPTABLE, "
- "disabling IO/APIC support!\n");
- smp_found_config = 0;
- skip_ioapic_setup = 1;
- goto fake_ioapic_page;
- }
- } else {
-fake_ioapic_page:
- ioapic_phys = (unsigned long)
- alloc_bootmem_pages(PAGE_SIZE);
- ioapic_phys = __pa(ioapic_phys);
- }
- set_fixmap_nocache(idx, ioapic_phys);
- printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
- __fix_to_virt(idx), ioapic_phys);
- idx++;
- }
- }
-#endif
-}
-
-/*
- * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts
- * per second. We assume that the caller has already set up the local
- * APIC.
- *
- * The APIC timer is not exactly sync with the external timer chip, it
- * closely follows bus clocks.
- */
-
-/*
- * The timer chip is already set up at HZ interrupts per second here,
- * but we do not accept timer interrupts yet. We only allow the BP
- * to calibrate.
- */
-static unsigned int __init get_8254_timer_count(void)
-{
- extern spinlock_t i8253_lock;
- unsigned long flags;
-
- unsigned int count;
-
- spin_lock_irqsave(&i8253_lock, flags);
-
- outb_p(0x00, PIT_MODE);
- count = inb_p(PIT_CH0);
- count |= inb_p(PIT_CH0) << 8;
-
- spin_unlock_irqrestore(&i8253_lock, flags);
-
- return count;
-}
-
-/* next tick in 8254 can be caught by catching timer wraparound */
-static void __init wait_8254_wraparound(void)
-{
- unsigned int curr_count, prev_count;
-
- curr_count = get_8254_timer_count();
- do {
- prev_count = curr_count;
- curr_count = get_8254_timer_count();
-
- /* workaround for broken Mercury/Neptune */
- if (prev_count >= curr_count + 0x100)
- curr_count = get_8254_timer_count();
-
- } while (prev_count >= curr_count);
-}
-
-/*
- * Default initialization for 8254 timers. If we use other timers like HPET,
- * we override this later
- */
-void (*wait_timer_tick)(void) __initdata = wait_8254_wraparound;
-
-/*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
- */
-
-#define APIC_DIVISOR 16
-
-static void __setup_APIC_LVTT(unsigned int clocks)
-{
- unsigned int lvtt_value, tmp_value, ver;
-
- ver = GET_APIC_VERSION(apic_read(APIC_LVR));
- lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
- if (!APIC_INTEGRATED(ver))
- lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
- apic_write_around(APIC_LVTT, lvtt_value);
-
- /*
- * Divide PICLK by 16
- */
- tmp_value = apic_read(APIC_TDCR);
- apic_write_around(APIC_TDCR, (tmp_value
- & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
- | APIC_TDR_DIV_16);
-
- apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
-}
-
-static void __init setup_APIC_timer(unsigned int clocks)
-{
- unsigned long flags;
-
- local_irq_save(flags);
-
- /*
- * Wait for IRQ0's slice:
- */
- wait_timer_tick();
-
- __setup_APIC_LVTT(clocks);
-
- local_irq_restore(flags);
-}
-
-/*
- * In this function we calibrate APIC bus clocks to the external
- * timer. Unfortunately we cannot use jiffies and the timer irq
- * to calibrate, since some later bootup code depends on getting
- * the first irq? Ugh.
- *
- * We want to do the calibration only once since we
- * want to have local timer irqs syncron. CPUs connected
- * by the same APIC bus have the very same bus frequency.
- * And we want to have irqs off anyways, no accidental
- * APIC irq that way.
- */
-
-static int __init calibrate_APIC_clock(void)
-{
- unsigned long long t1 = 0, t2 = 0;
- long tt1, tt2;
- long result;
- int i;
- const int LOOPS = HZ/10;
-
- apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n");
-
- /*
- * Put whatever arbitrary (but long enough) timeout
- * value into the APIC clock, we just want to get the
- * counter running for calibration.
- */
- __setup_APIC_LVTT(1000000000);
-
- /*
- * The timer chip counts down to zero. Let's wait
- * for a wraparound to start exact measurement:
- * (the current tick might have been already half done)
- */
-
- wait_timer_tick();
-
- /*
- * We wrapped around just now. Let's start:
- */
- if (cpu_has_tsc)
- rdtscll(t1);
- tt1 = apic_read(APIC_TMCCT);
-
- /*
- * Let's wait LOOPS wraprounds:
- */
- for (i = 0; i < LOOPS; i++)
- wait_timer_tick();
-
- tt2 = apic_read(APIC_TMCCT);
- if (cpu_has_tsc)
- rdtscll(t2);
-
- /*
- * The APIC bus clock counter is 32 bits only, it
- * might have overflown, but note that we use signed
- * longs, thus no extra care needed.
- *
- * underflown to be exact, as the timer counts down ;)
- */
-
- result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
-
- if (cpu_has_tsc)
- apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
- "%ld.%04ld MHz.\n",
- ((long)(t2-t1)/LOOPS)/(1000000/HZ),
- ((long)(t2-t1)/LOOPS)%(1000000/HZ));
-
- apic_printk(APIC_VERBOSE, "..... host bus clock speed is "
- "%ld.%04ld MHz.\n",
- result/(1000000/HZ),
- result%(1000000/HZ));
-
- return result;
-}
-
-static unsigned int calibration_result;
-
-void __init setup_boot_APIC_clock(void)
-{
- apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
- using_apic_timer = 1;
-
- local_irq_disable();
-
- calibration_result = calibrate_APIC_clock();
- /*
- * Now set up the timer for real.
- */
- setup_APIC_timer(calibration_result);
-
- local_irq_enable();
-}
-
-void __init setup_secondary_APIC_clock(void)
-{
- setup_APIC_timer(calibration_result);
-}
-
-void __init disable_APIC_timer(void)
-{
- if (using_apic_timer) {
- unsigned long v;
-
- v = apic_read(APIC_LVTT);
- apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
- }
-}
-
-void enable_APIC_timer(void)
-{
- if (using_apic_timer) {
- unsigned long v;
-
- v = apic_read(APIC_LVTT);
- apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED);
- }
-}
-
-/*
- * the frequency of the profiling timer can be changed
- * by writing a multiplier value into /proc/profile.
- */
-int setup_profiling_timer(unsigned int multiplier)
-{
- int i;
-
- /*
- * Sanity check. [at least 500 APIC cycles should be
- * between APIC interrupts as a rule of thumb, to avoid
- * irqs flooding us]
- */
- if ( (!multiplier) || (calibration_result/multiplier < 500))
- return -EINVAL;
-
- /*
- * Set the new multiplier for each CPU. CPUs don't start using the
- * new values until the next timer interrupt in which they do process
- * accounting. At that time they also adjust their APIC timers
- * accordingly.
- */
- for (i = 0; i < NR_CPUS; ++i)
- per_cpu(prof_multiplier, i) = multiplier;
-
- return 0;
-}
-
-#undef APIC_DIVISOR
-
-/*
- * Local timer interrupt handler. It does both profiling and
- * process statistics/rescheduling.
- *
- * We do profiling in every local tick, statistics/rescheduling
- * happen only every 'profiling multiplier' ticks. The default
- * multiplier is 1 and it can be changed by writing the new multiplier
- * value into /proc/profile.
- */
-
-inline void smp_local_timer_interrupt(struct pt_regs * regs)
-{
- int cpu = smp_processor_id();
-
- profile_tick(CPU_PROFILING, regs);
- if (--per_cpu(prof_counter, cpu) <= 0) {
- /*
- * The multiplier may have changed since the last time we got
- * to this point as a result of the user writing to
- * /proc/profile. In this case we need to adjust the APIC
- * timer accordingly.
- *
- * Interrupts are already masked off at this point.
- */
- per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
- if (per_cpu(prof_counter, cpu) !=
- per_cpu(prof_old_multiplier, cpu)) {
- __setup_APIC_LVTT(
- calibration_result/
- per_cpu(prof_counter, cpu));
- per_cpu(prof_old_multiplier, cpu) =
- per_cpu(prof_counter, cpu);
- }
-
-#ifdef CONFIG_SMP
- update_process_times(user_mode(regs));
-#endif
- }
-
- /*
- * We take the 'long' return path, and there every subsystem
- * grabs the apropriate locks (kernel lock/ irq lock).
- *
- * we might want to decouple profiling from the 'long path',
- * and do the profiling totally in assembly.
- *
- * Currently this isn't too much of an issue (performance wise),
- * we can take more than 100K local irqs per second on a 100 MHz P5.
- */
-}
-
-/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- * interrupt as well. Thus we cannot inline the local irq ... ]
- */
-
-fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
-{
- int cpu = smp_processor_id();
-
- /*
- * the NMI deadlock-detector uses this.
- */
- per_cpu(irq_stat, cpu).apic_timer_irqs++;
-
- /*
- * NOTE! We'd better ACK the irq immediately,
- * because timer handling can be slow.
- */
- ack_APIC_irq();
- /*
- * update_process_times() expects us to have done irq_enter().
- * Besides, if we don't timer interrupts ignore the global
- * interrupt lock, which is the WrongThing (tm) to do.
- */
- irq_enter();
- smp_local_timer_interrupt(regs);
- irq_exit();
-}
-
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
- */
-fastcall void smp_spurious_interrupt(struct pt_regs *regs)
-{
- unsigned long v;
-
- irq_enter();
- /*
- * Check if this really is a spurious interrupt and ACK it
- * if it is a vectored one. Just in case...
- * Spurious interrupts should not be ACKed.
- */
- v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
- if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
- ack_APIC_irq();
-
- /* see sw-dev-man vol 3, chapter 7.4.13.5 */
- printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never
happen.\n",
- smp_processor_id());
- irq_exit();
-}
-
-/*
- * This interrupt should never happen with our APIC/SMP architecture
- */
-
-fastcall void smp_error_interrupt(struct pt_regs *regs)
-{
- unsigned long v, v1;
-
- irq_enter();
- /* First tickle the hardware, only then report what went on. -- REW */
- v = apic_read(APIC_ESR);
- apic_write(APIC_ESR, 0);
- v1 = apic_read(APIC_ESR);
- ack_APIC_irq();
- atomic_inc(&irq_err_count);
-
- /* Here is what the APIC error bits mean:
- 0: Send CS error
- 1: Receive CS error
- 2: Send accept error
- 3: Receive accept error
- 4: Reserved
- 5: Send illegal vector
- 6: Received illegal vector
- 7: Illegal register address
- */
- printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
- smp_processor_id(), v , v1);
- irq_exit();
-}
-
/*
* This initializes the IO-APIC and APIC hardware if this is
* a UP kernel.
*/
int __init APIC_init_uniprocessor (void)
{
- if (enable_local_apic < 0)
- clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability);
-
- if (!smp_found_config && !cpu_has_apic)
- return -1;
-
- /*
- * Complain if the BIOS pretends there is one.
- */
- if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_physical_apicid);
- return -1;
- }
-
- verify_local_APIC();
-
- connect_bsp_APIC();
-
- phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
-
- setup_local_APIC();
-
#ifdef CONFIG_X86_IO_APIC
if (smp_found_config)
if (!skip_ioapic_setup && nr_ioapics)
setup_IO_APIC();
#endif
- setup_boot_APIC_clock();
return 0;
}
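
The whole local-APIC path above (detection, fixmap mapping, LVTT setup, 8254-based calibration, and the timer/spurious/error interrupt handlers) is simply deleted: a Xen guest never programs the APIC; the hypervisor owns it and delivers timer events through event channels instead. For reference, the deleted calibrate_APIC_clock() boils down to this arithmetic (a restatement of the removed code, with APIC_DIVISOR = 16 and LOOPS = HZ/10):

    /* APIC bus clocks per timer tick; the counter counts down, so tt1 > tt2 */
    static long apic_bus_clocks_per_tick(long tt1, long tt2, long loops)
    {
            return (tt1 - tt2) * 16 / loops;   /* == 'result' in the old code */
    }
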
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/common.c linux-2.6-xen-sparse/arch/i386/kernel/cpu/common.c
--- pristine-linux-2.6.12/arch/i386/kernel/cpu/common.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/common.c 2005-07-28 13:17:07.000000000 -0700
@@ -15,6 +15,7 @@
#include <asm/apic.h>
#include <mach_apic.h>
#endif
+#include <asm-xen/hypervisor.h>
#include "cpu.h"
@@ -32,6 +33,8 @@ struct cpu_dev * cpu_devs[X86_VENDOR_NUM
extern void mcheck_init(struct cpuinfo_x86 *c);
+extern void machine_specific_modify_cpu_capabilities(struct cpuinfo_x86 *c);
+
extern int disable_pse;
static void default_init(struct cpuinfo_x86 * c)
@@ -409,6 +412,8 @@ void __init identify_cpu(struct cpuinfo_
c->x86_vendor, c->x86_model);
}
+ machine_specific_modify_cpu_capabilities(c);
+
/* Now the feature flags better reflect actual CPU features! */
printk(KERN_DEBUG "CPU: After all inits, caps:");
@@ -554,6 +559,24 @@ void __init early_cpu_init(void)
disable_pse = 1;
#endif
}
+
+void __init cpu_gdt_init(struct Xgt_desc_struct *gdt_descr)
+{
+ unsigned long frames[16];
+ unsigned long va;
+ int f;
+
+ for (va = gdt_descr->address, f = 0;
+ va < gdt_descr->address + gdt_descr->size;
+ va += PAGE_SIZE, f++) {
+ frames[f] = virt_to_machine(va) >> PAGE_SHIFT;
+ make_page_readonly((void *)va);
+ }
+ if (HYPERVISOR_set_gdt(frames, gdt_descr->size / 8))
+ BUG();
+ lgdt_finish();
+}
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -565,7 +588,6 @@ void __init cpu_init (void)
int cpu = smp_processor_id();
struct tss_struct * t = &per_cpu(init_tss, cpu);
struct thread_struct *thread = &current->thread;
- __u32 stk16_off = (__u32)&per_cpu(cpu_16bit_stack, cpu);
if (cpu_test_and_set(cpu, cpu_initialized)) {
printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
@@ -573,7 +595,7 @@ void __init cpu_init (void)
}
printk(KERN_INFO "Initializing CPU#%d\n", cpu);
- if (cpu_has_vme || cpu_has_tsc || cpu_has_de)
+ if (cpu_has_vme || cpu_has_de)
clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
if (tsc_disable && cpu_has_tsc) {
printk(KERN_NOTICE "Disabling TSC...\n");
@@ -583,30 +605,12 @@ void __init cpu_init (void)
}
/*
- * Initialize the per-CPU GDT with the boot GDT,
- * and set up the GDT descriptor:
- */
- memcpy(&per_cpu(cpu_gdt_table, cpu), cpu_gdt_table,
- GDT_SIZE);
-
- /* Set up GDT entry for 16bit stack */
- *(__u64 *)&(per_cpu(cpu_gdt_table, cpu)[GDT_ENTRY_ESPFIX_SS]) |=
- ((((__u64)stk16_off) << 16) & 0x000000ffffff0000ULL) |
- ((((__u64)stk16_off) << 32) & 0xff00000000000000ULL) |
- (CPU_16BIT_STACK_SIZE - 1);
-
- cpu_gdt_descr[cpu].size = GDT_SIZE - 1;
- cpu_gdt_descr[cpu].address =
- (unsigned long)&per_cpu(cpu_gdt_table, cpu);
-
- /*
* Set up the per-thread TLS descriptor cache:
*/
- memcpy(thread->tls_array, &per_cpu(cpu_gdt_table, cpu),
- GDT_ENTRY_TLS_ENTRIES * 8);
+ memcpy(thread->tls_array, &get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN],
+ GDT_ENTRY_TLS_ENTRIES * 8);
- __asm__ __volatile__("lgdt %0" : : "m" (cpu_gdt_descr[cpu]));
- __asm__ __volatile__("lidt %0" : : "m" (idt_descr));
+ cpu_gdt_init(&cpu_gdt_descr[cpu]);
/*
* Delete NT
@@ -623,19 +627,15 @@ void __init cpu_init (void)
enter_lazy_tlb(&init_mm, current);
load_esp0(t, thread);
- set_tss_desc(cpu,t);
- load_TR_desc();
- load_LDT(&init_mm.context);
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
+ load_LDT(&init_mm.context);
/* Clear %fs and %gs. */
asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
/* Clear all 6 debug registers: */
-#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) );
+#define CD(register) HYPERVISOR_set_debugreg(register, 0)
CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
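
cpu_gdt_init() is the paravirtual replacement for the raw lgdt: Xen insists that every page backing a guest GDT be mapped read-only, and the table is handed over as a list of machine frame numbers rather than a virtual address. For the common single-page case the hunk above condenses to the sketch below (the function names are all from the patch itself; xen_load_gdt_page is just an illustrative wrapper name):

    static void xen_load_gdt_page(struct Xgt_desc_struct *gdt_descr)
    {
            unsigned long va    = gdt_descr->address;
            unsigned long frame = virt_to_machine(va) >> PAGE_SHIFT;

            make_page_readonly((void *)va);  /* Xen rejects writable GDT pages */
            if (HYPERVISOR_set_gdt(&frame, gdt_descr->size / 8)) /* entry count */
                    BUG();
            lgdt_finish();                   /* reload the segment registers */
    }

The same theme shows up at the bottom of cpu_init(): mov to %dbN is privileged, so clearing the debug registers becomes HYPERVISOR_set_debugreg() calls.
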
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/Makefile linux-2.6-xen-sparse/arch/i386/kernel/cpu/Makefile
--- pristine-linux-2.6.12/arch/i386/kernel/cpu/Makefile 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/Makefile 2005-07-28 13:17:07.000000000 -0700
@@ -2,18 +2,30 @@
# Makefile for x86-compatible CPU details and quirks
#
-obj-y := common.o proc.o
+CFLAGS += -Iarch/i386/kernel/cpu
-obj-y += amd.o
-obj-y += cyrix.o
-obj-y += centaur.o
-obj-y += transmeta.o
-obj-y += intel.o intel_cacheinfo.o
-obj-y += rise.o
-obj-y += nexgen.o
-obj-y += umc.o
+obj-y := common.o
+c-obj-y += proc.o
-obj-$(CONFIG_X86_MCE) += mcheck/
+c-obj-y += amd.o
+c-obj-y += cyrix.o
+c-obj-y += centaur.o
+c-obj-y += transmeta.o
+c-obj-y += intel.o intel_cacheinfo.o
+c-obj-y += rise.o
+c-obj-y += nexgen.o
+c-obj-y += umc.o
+
+#obj-$(CONFIG_X86_MCE) += ../../../../i386/kernel/cpu/mcheck/
obj-$(CONFIG_MTRR) += mtrr/
-obj-$(CONFIG_CPU_FREQ) += cpufreq/
+#obj-$(CONFIG_CPU_FREQ) += ../../../../i386/kernel/cpu/cpufreq/
+
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+ @ln -fsn $(srctree)/arch/i386/kernel/cpu/$(notdir $@) $@
+
+obj-y += $(c-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/main.c linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/main.c
--- pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/main.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/main.c 2005-07-28 13:17:07.000000000 -0700
@@ -1,116 +1,46 @@
-/* Generic MTRR (Memory Type Range Register) driver.
-
- Copyright (C) 1997-2000 Richard Gooch
- Copyright (c) 2002 Patrick Mochel
-
- This library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Library General Public
- License as published by the Free Software Foundation; either
- version 2 of the License, or (at your option) any later version.
-
- This library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Library General Public License for more details.
-
- You should have received a copy of the GNU Library General Public
- License along with this library; if not, write to the Free
- Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
- Richard Gooch may be reached by email at rgooch@xxxxxxxxxxxxx
- The postal address is:
- Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
-
- Source: "Pentium Pro Family Developer's Manual, Volume 3:
- Operating System Writer's Guide" (Intel document number 242692),
- section 11.11.7
-
- This was cleaned and made readable by Patrick Mochel <mochel@xxxxxxxx>
- on 6-7 March 2002.
- Source: Intel Architecture Software Developers Manual, Volume 3:
- System Programming Guide; Section 9.11. (1997 edition - PPro).
-*/
-
-#include <linux/module.h>
#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
+#include <linux/proc_fs.h>
+#include <linux/ctype.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <asm/uaccess.h>
#include <asm/mtrr.h>
-
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-#include <asm/msr.h>
#include "mtrr.h"
-#define MTRR_VERSION "2.0 (20020519)"
-
-u32 num_var_ranges = 0;
-
-unsigned int *usage_table;
-static DECLARE_MUTEX(main_lock);
-
-u32 size_or_mask, size_and_mask;
-
-static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {};
-
-struct mtrr_ops * mtrr_if = NULL;
-
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type);
+void generic_get_mtrr(unsigned int reg, unsigned long *base,
+ unsigned int *size, mtrr_type * type)
+{
+ dom0_op_t op;
-extern int arr3_protected;
+ op.cmd = DOM0_READ_MEMTYPE;
+ op.u.read_memtype.reg = reg;
+ (void)HYPERVISOR_dom0_op(&op);
-void set_mtrr_ops(struct mtrr_ops * ops)
-{
- if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
- mtrr_ops[ops->vendor] = ops;
+ *size = op.u.read_memtype.nr_pfns;
+ *base = op.u.read_memtype.pfn;
+ *type = op.u.read_memtype.type;
}
-/* Returns non-zero if we have the write-combining memory type */
-static int have_wrcomb(void)
-{
- struct pci_dev *dev;
- u8 rev;
-
- if ((dev = pci_get_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) {
- /* ServerWorks LE chipsets < rev 6 have problems with write-combining
- Don't allow it and leave room for other chipsets to be tagged */
- if (dev->vendor == PCI_VENDOR_ID_SERVERWORKS &&
- dev->device == PCI_DEVICE_ID_SERVERWORKS_LE) {
- pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev);
- if (rev <= 5) {
- printk(KERN_INFO "mtrr: Serverworks LE rev < 6
detected. Write-combining disabled.\n");
- pci_dev_put(dev);
- return 0;
- }
- }
- /* Intel 450NX errata # 23. Non ascending cacheline evictions to
- write combining memory may resulting in data corruption */
- if (dev->vendor == PCI_VENDOR_ID_INTEL &&
- dev->device == PCI_DEVICE_ID_INTEL_82451NX) {
- printk(KERN_INFO "mtrr: Intel 450NX MMC detected.
Write-combining disabled.\n");
- pci_dev_put(dev);
- return 0;
- }
- pci_dev_put(dev);
- }
- return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0);
-}
+struct mtrr_ops generic_mtrr_ops = {
+ .use_intel_if = 1,
+ .get = generic_get_mtrr,
+};
+
+struct mtrr_ops *mtrr_if = &generic_mtrr_ops;
+unsigned int num_var_ranges;
+unsigned int *usage_table;
-/* This function returns the number of variable MTRRs */
static void __init set_num_var_ranges(void)
{
- unsigned long config = 0, dummy;
+ dom0_op_t op;
- if (use_intel()) {
- rdmsr(MTRRcap_MSR, config, dummy);
- } else if (is_cpu(AMD))
- config = 2;
- else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
- config = 8;
- num_var_ranges = config & 0xff;
+ for (num_var_ranges = 0; ; num_var_ranges++) {
+ op.cmd = DOM0_READ_MEMTYPE;
+ op.u.read_memtype.reg = num_var_ranges;
+ if (HYPERVISOR_dom0_op(&op) != 0)
+ break;
+ }
}
static void __init init_table(void)
@@ -124,293 +54,28 @@ static void __init init_table(void)
return;
}
for (i = 0; i < max; i++)
- usage_table[i] = 1;
-}
-
-struct set_mtrr_data {
- atomic_t count;
- atomic_t gate;
- unsigned long smp_base;
- unsigned long smp_size;
- unsigned int smp_reg;
- mtrr_type smp_type;
-};
-
-#ifdef CONFIG_SMP
-
-static void ipi_handler(void *info)
-/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
- [RETURNS] Nothing.
-*/
-{
- struct set_mtrr_data *data = info;
- unsigned long flags;
-
- local_irq_save(flags);
-
- atomic_dec(&data->count);
- while(!atomic_read(&data->gate))
- cpu_relax();
-
- /* The master has cleared me to execute */
- if (data->smp_reg != ~0U)
- mtrr_if->set(data->smp_reg, data->smp_base,
- data->smp_size, data->smp_type);
- else
- mtrr_if->set_all();
-
- atomic_dec(&data->count);
- while(atomic_read(&data->gate))
- cpu_relax();
-
- atomic_dec(&data->count);
- local_irq_restore(flags);
+ usage_table[i] = 0;
}
-#endif
-
-/**
- * set_mtrr - update mtrrs on all processors
- * @reg: mtrr in question
- * @base: mtrr base
- * @size: mtrr size
- * @type: mtrr type
- *
- * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
- *
- * 1. Send IPI to do the following:
- * 2. Disable Interrupts
- * 3. Wait for all procs to do so
- * 4. Enter no-fill cache mode
- * 5. Flush caches
- * 6. Clear PGE bit
- * 7. Flush all TLBs
- * 8. Disable all range registers
- * 9. Update the MTRRs
- * 10. Enable all range registers
- * 11. Flush all TLBs and caches again
- * 12. Enter normal cache mode and reenable caching
- * 13. Set PGE
- * 14. Wait for buddies to catch up
- * 15. Enable interrupts.
- *
- * What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
- * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
- * Meanwhile, they are waiting for that flag to be set. Once it's set, each
- * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it
- * differently, so we call mtrr_if->set() callback and let them take care of it.
- * When they're done, they again decrement data->count and wait for data.gate to
- * be reset.
- * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag.
- * Everyone then enables interrupts and we all continue on.
- *
- * Note that the mechanism is the same for UP systems, too; all the SMP stuff
- * becomes nops.
- */
-static void set_mtrr(unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
-{
- struct set_mtrr_data data;
- unsigned long flags;
-
- data.smp_reg = reg;
- data.smp_base = base;
- data.smp_size = size;
- data.smp_type = type;
- atomic_set(&data.count, num_booting_cpus() - 1);
- atomic_set(&data.gate,0);
-
- /* Start the ball rolling on other CPUs */
- if (smp_call_function(ipi_handler, &data, 1, 0) != 0)
- panic("mtrr: timed out waiting for other CPUs\n");
-
- local_irq_save(flags);
-
- while(atomic_read(&data.count))
- cpu_relax();
-
- /* ok, reset count and toggle gate */
- atomic_set(&data.count, num_booting_cpus() - 1);
- atomic_set(&data.gate,1);
-
- /* do our MTRR business */
-
- /* HACK!
- * We use this same function to initialize the mtrrs on boot.
- * The state of the boot cpu's mtrrs has been saved, and we want
- * to replicate across all the APs.
- * If we're doing that @reg is set to something special...
- */
- if (reg != ~0U)
- mtrr_if->set(reg,base,size,type);
-
- /* wait for the others */
- while(atomic_read(&data.count))
- cpu_relax();
-
- atomic_set(&data.count, num_booting_cpus() - 1);
- atomic_set(&data.gate,0);
-
- /*
- * Wait here for everyone to have seen the gate change
- * So we're the last ones to touch 'data'
- */
- while(atomic_read(&data.count))
- cpu_relax();
-
- local_irq_restore(flags);
-}
-
-/**
- * mtrr_add_page - Add a memory type region
- * @base: Physical base address of region in pages (4 KB)
- * @size: Physical size of region in pages (4 KB)
- * @type: Type of MTRR desired
- * @increment: If this is true do usage counting on the region
- *
- * Memory type region registers control the caching on newer Intel and
- * non Intel processors. This function allows drivers to request an
- * MTRR is added. The details and hardware specifics of each processor's
- * implementation are hidden from the caller, but nevertheless the
- * caller should expect to need to provide a power of two size on an
- * equivalent power of two boundary.
- *
- * If the region cannot be added either because all regions are in use
- * or the CPU cannot support it a negative value is returned. On success
- * the register number for this entry is returned, but should be treated
- * as a cookie only.
- *
- * On a multiprocessor machine the changes are made to all processors.
- * This is required on x86 by the Intel processors.
- *
- * The available types are
- *
- * %MTRR_TYPE_UNCACHABLE - No caching
- *
- * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
- *
- * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
- *
- * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
- *
- * BUGS: Needs a quiet flag for the cases where drivers do not mind
- * failures and do not wish system log messages to be sent.
- */
-
int mtrr_add_page(unsigned long base, unsigned long size,
unsigned int type, char increment)
{
- int i;
- mtrr_type ltype;
- unsigned long lbase;
- unsigned int lsize;
int error;
+ dom0_op_t op;
- if (!mtrr_if)
- return -ENXIO;
-
- if ((error = mtrr_if->validate_add_page(base,size,type)))
+ op.cmd = DOM0_ADD_MEMTYPE;
+ op.u.add_memtype.pfn = base;
+ op.u.add_memtype.nr_pfns = size;
+ op.u.add_memtype.type = type;
+ if ((error = HYPERVISOR_dom0_op(&op)))
return error;
- if (type >= MTRR_NUM_TYPES) {
- printk(KERN_WARNING "mtrr: type: %u invalid\n", type);
- return -EINVAL;
- }
-
- /* If the type is WC, check that this processor supports it */
- if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
- printk(KERN_WARNING
- "mtrr: your processor doesn't support
write-combining\n");
- return -ENOSYS;
- }
-
- if (base & size_or_mask || size & size_or_mask) {
- printk(KERN_WARNING "mtrr: base or size exceeds the MTRR
width\n");
- return -EINVAL;
- }
+ if (increment)
+ ++usage_table[op.u.add_memtype.reg];
- error = -EINVAL;
-
- /* Search for existing MTRR */
- down(&main_lock);
- for (i = 0; i < num_var_ranges; ++i) {
- mtrr_if->get(i, &lbase, &lsize, &ltype);
- if (base >= lbase + lsize)
- continue;
- if ((base < lbase) && (base + size <= lbase))
- continue;
- /* At this point we know there is some kind of overlap/enclosure */
- if ((base < lbase) || (base + size > lbase + lsize)) {
- printk(KERN_WARNING
- "mtrr: 0x%lx000,0x%lx000 overlaps existing"
- " 0x%lx000,0x%x000\n", base, size, lbase,
- lsize);
- goto out;
- }
- /* New region is enclosed by an existing region */
- if (ltype != type) {
- if (type == MTRR_TYPE_UNCACHABLE)
- continue;
- printk (KERN_WARNING "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
- base, size, mtrr_attrib_to_str(ltype),
- mtrr_attrib_to_str(type));
- goto out;
- }
- if (increment)
- ++usage_table[i];
- error = i;
- goto out;
- }
- /* Search for an empty MTRR */
- i = mtrr_if->get_free_region(base, size);
- if (i >= 0) {
- set_mtrr(i, base, size, type);
- usage_table[i] = 1;
- } else
- printk(KERN_INFO "mtrr: no more MTRRs available\n");
- error = i;
- out:
- up(&main_lock);
- return error;
+ return op.u.add_memtype.reg;
}
-/**
- * mtrr_add - Add a memory type region
- * @base: Physical base address of region
- * @size: Physical size of region
- * @type: Type of MTRR desired
- * @increment: If this is true do usage counting on the region
- *
- * Memory type region registers control the caching on newer Intel and
- * non Intel processors. This function allows drivers to request an
- * MTRR is added. The details and hardware specifics of each processor's
- * implementation are hidden from the caller, but nevertheless the
- * caller should expect to need to provide a power of two size on an
- * equivalent power of two boundary.
- *
- * If the region cannot be added either because all regions are in use
- * or the CPU cannot support it a negative value is returned. On success
- * the register number for this entry is returned, but should be treated
- * as a cookie only.
- *
- * On a multiprocessor machine the changes are made to all processors.
- * This is required on x86 by the Intel processors.
- *
- * The available types are
- *
- * %MTRR_TYPE_UNCACHABLE - No caching
- *
- * %MTRR_TYPE_WRBACK - Write data back in bursts whenever
- *
- * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
- *
- * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
- *
- * BUGS: Needs a quiet flag for the cases where drivers do not mind
- * failures and do not wish system log messages to be sent.
- */
-
int
mtrr_add(unsigned long base, unsigned long size, unsigned int type,
char increment)
@@ -424,21 +89,6 @@ mtrr_add(unsigned long base, unsigned lo
increment);
}
-/**
- * mtrr_del_page - delete a memory type region
- * @reg: Register returned by mtrr_add
- * @base: Physical base address
- * @size: Size of region
- *
- * If register is supplied then base and size are ignored. This is
- * how drivers should call it.
- *
- * Releases an MTRR region. If the usage count drops to zero the
- * register is freed and the region returns to default state.
- * On success the register is returned, on failure a negative error
- * code.
- */
-
int mtrr_del_page(int reg, unsigned long base, unsigned long size)
{
int i, max;
@@ -446,12 +96,9 @@ int mtrr_del_page(int reg, unsigned long
unsigned long lbase;
unsigned int lsize;
int error = -EINVAL;
-
- if (!mtrr_if)
- return -ENXIO;
+ dom0_op_t op;
max = num_var_ranges;
- down(&main_lock);
if (reg < 0) {
/* Search for existing MTRR */
for (i = 0; i < max; ++i) {
@@ -467,46 +114,20 @@ int mtrr_del_page(int reg, unsigned long
goto out;
}
}
- if (reg >= max) {
- printk(KERN_WARNING "mtrr: register: %d too big\n", reg);
- goto out;
- }
- if (is_cpu(CYRIX) && !use_intel()) {
- if ((reg == 3) && arr3_protected) {
- printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n");
- goto out;
- }
- }
- mtrr_if->get(reg, &lbase, &lsize, &ltype);
- if (lsize < 1) {
- printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg);
- goto out;
- }
if (usage_table[reg] < 1) {
printk(KERN_WARNING "mtrr: reg: %d has count=0\n", reg);
goto out;
}
- if (--usage_table[reg] < 1)
- set_mtrr(reg, 0, 0, 0);
+ if (--usage_table[reg] < 1) {
+ op.cmd = DOM0_DEL_MEMTYPE;
+ op.u.del_memtype.handle = 0;
+ op.u.add_memtype.reg = reg;
+ (void)HYPERVISOR_dom0_op(&op);
+ }
error = reg;
out:
- up(&main_lock);
return error;
}
-/**
- * mtrr_del - delete a memory type region
- * @reg: Register returned by mtrr_add
- * @base: Physical base address
- * @size: Size of region
- *
- * If register is supplied then base and size are ignored. This is
- * how drivers should call it.
- *
- * Releases an MTRR region. If the usage count drops to zero the
- * register is freed and the region returns to default state.
- * On success the register is returned, on failure a negative error
- * code.
- */
int
mtrr_del(int reg, unsigned long base, unsigned long size)
@@ -522,157 +143,23 @@ mtrr_del(int reg, unsigned long base, un
EXPORT_SYMBOL(mtrr_add);
EXPORT_SYMBOL(mtrr_del);
-/* HACK ALERT!
- * These should be called implicitly, but we can't yet until all the initcall
- * stuff is done...
- */
-extern void amd_init_mtrr(void);
-extern void cyrix_init_mtrr(void);
-extern void centaur_init_mtrr(void);
-
-static void __init init_ifs(void)
-{
- amd_init_mtrr();
- cyrix_init_mtrr();
- centaur_init_mtrr();
-}
-
-static void __init init_other_cpus(void)
+static int __init mtrr_init(void)
{
- if (use_intel())
- get_mtrr_state();
-
- /* bring up the other processors */
- set_mtrr(~0U,0,0,0);
-
- if (use_intel()) {
- finalize_mtrr_state();
- mtrr_state_warn();
- }
-}
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ if (!(xen_start_info.flags & SIF_PRIVILEGED))
+ return -ENODEV;
-struct mtrr_value {
- mtrr_type ltype;
- unsigned long lbase;
- unsigned int lsize;
-};
+ if ((!cpu_has(c, X86_FEATURE_MTRR)) &&
+ (!cpu_has(c, X86_FEATURE_K6_MTRR)) &&
+ (!cpu_has(c, X86_FEATURE_CYRIX_ARR)) &&
+ (!cpu_has(c, X86_FEATURE_CENTAUR_MCR)))
+ return -ENODEV;
-static struct mtrr_value * mtrr_state;
+ set_num_var_ranges();
+ init_table();
-static int mtrr_save(struct sys_device * sysdev, u32 state)
-{
- int i;
- int size = num_var_ranges * sizeof(struct mtrr_value);
-
- mtrr_state = kmalloc(size,GFP_ATOMIC);
- if (mtrr_state)
- memset(mtrr_state,0,size);
- else
- return -ENOMEM;
-
- for (i = 0; i < num_var_ranges; i++) {
- mtrr_if->get(i,
- &mtrr_state[i].lbase,
- &mtrr_state[i].lsize,
- &mtrr_state[i].ltype);
- }
return 0;
}
-static int mtrr_restore(struct sys_device * sysdev)
-{
- int i;
-
- for (i = 0; i < num_var_ranges; i++) {
- if (mtrr_state[i].lsize)
- set_mtrr(i,
- mtrr_state[i].lbase,
- mtrr_state[i].lsize,
- mtrr_state[i].ltype);
- }
- kfree(mtrr_state);
- return 0;
-}
-
-
-
-static struct sysdev_driver mtrr_sysdev_driver = {
- .suspend = mtrr_save,
- .resume = mtrr_restore,
-};
-
-
-/**
- * mtrr_init - initialize mtrrs on the boot CPU
- *
- * This needs to be called early; before any of the other CPUs are
- * initialized (i.e. before smp_init()).
- *
- */
-static int __init mtrr_init(void)
-{
- init_ifs();
-
- if (cpu_has_mtrr) {
- mtrr_if = &generic_mtrr_ops;
- size_or_mask = 0xff000000; /* 36 bits */
- size_and_mask = 0x00f00000;
-
- /* This is an AMD specific MSR, but we assume(hope?) that
- Intel will implement it to when they extend the address
- bus of the Xeon. */
- if (cpuid_eax(0x80000000) >= 0x80000008) {
- u32 phys_addr;
- phys_addr = cpuid_eax(0x80000008) & 0xff;
- size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1);
- size_and_mask = ~size_or_mask & 0xfff00000;
- } else if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR &&
- boot_cpu_data.x86 == 6) {
- /* VIA C* family have Intel style MTRRs, but
- don't support PAE */
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- } else {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (cpu_has_k6_mtrr) {
- /* Pre-Athlon (K6) AMD CPU MTRRs */
- mtrr_if = mtrr_ops[X86_VENDOR_AMD];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CENTAUR:
- if (cpu_has_centaur_mcr) {
- mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- case X86_VENDOR_CYRIX:
- if (cpu_has_cyrix_arr) {
- mtrr_if = mtrr_ops[X86_VENDOR_CYRIX];
- size_or_mask = 0xfff00000; /* 32 bits */
- size_and_mask = 0;
- }
- break;
- default:
- break;
- }
- }
- printk(KERN_INFO "mtrr: v%s\n",MTRR_VERSION);
-
- if (mtrr_if) {
- set_num_var_ranges();
- init_table();
- init_other_cpus();
-
- return sysdev_driver_register(&cpu_sysdev_class,
- &mtrr_sysdev_driver);
- }
- return -ENXIO;
-}
-
subsys_initcall(mtrr_init);
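
The whole MTRR driver collapses to a thin shim: reads, adds and deletes are forwarded to the hypervisor as dom0 operations, which is also why mtrr_init() now bails out unless the domain is privileged (SIF_PRIVILEGED), and why set_num_var_ranges() can simply probe DOM0_READ_MEMTYPE until it fails. The add path, condensed from the hunk above into a self-contained form (all field and call names come from the patch; xen_mtrr_add_page is an illustrative wrapper name):

    static int xen_mtrr_add_page(unsigned long base, unsigned long size,
                                 unsigned int type)
    {
            dom0_op_t op;
            int error;

            op.cmd                   = DOM0_ADD_MEMTYPE;
            op.u.add_memtype.pfn     = base;     /* base and size in 4 KB pages */
            op.u.add_memtype.nr_pfns = size;
            op.u.add_memtype.type    = type;
            if ((error = HYPERVISOR_dom0_op(&op)))
                    return error;
            return op.u.add_memtype.reg;         /* register index is the cookie */
    }

One oddity worth noting: the delete path fills in op.u.add_memtype.reg while issuing DOM0_DEL_MEMTYPE, presumably relying on the union layout lining up with del_memtype.
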
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/Makefile linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/Makefile
--- pristine-linux-2.6.12/arch/i386/kernel/cpu/mtrr/Makefile 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/cpu/mtrr/Makefile 2005-07-28 13:17:07.000000000 -0700
@@ -1,5 +1,16 @@
-obj-y := main.o if.o generic.o state.o
-obj-y += amd.o
-obj-y += cyrix.o
-obj-y += centaur.o
+obj-y := main.o
+c-obj-y := if.o
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)): $(obj)/mtrr.h
+ @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/$(notdir $@) $@
+
+$(patsubst %.o,$(obj)/%.c,$(obj-y)): $(obj)/mtrr.h
+
+$(obj)/mtrr.h:
+ @ln -fsn $(srctree)/arch/i386/kernel/cpu/mtrr/mtrr.h $@
+
+obj-y += $(c-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/entry.S linux-2.6-xen-sparse/arch/i386/kernel/entry.S
--- pristine-linux-2.6.12/arch/i386/kernel/entry.S 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/entry.S 2005-07-28 13:17:07.000000000 -0700
@@ -47,8 +47,8 @@
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/page.h>
-#include <asm/desc.h>
#include "irq_vectors.h"
+#include <asm-xen/xen-public/xen.h>
#define nr_syscalls ((syscall_table_size)/4)
@@ -64,6 +64,7 @@ ES = 0x20
ORIG_EAX = 0x24
EIP = 0x28
CS = 0x2C
+EVENT_MASK = 0x2E
EFLAGS = 0x30
OLDESP = 0x34
OLDSS = 0x38
@@ -75,11 +76,43 @@ DF_MASK = 0x00000400
NT_MASK = 0x00004000
VM_MASK = 0x00020000
+/* Offsets into shared_info_t. */
+#define evtchn_upcall_pending /* 0 */
+#define evtchn_upcall_mask 1
+
+#define sizeof_vcpu_shift 3
+
+#ifdef CONFIG_SMP
+#define preempt_disable(reg) incl TI_preempt_count(reg)
+#define preempt_enable(reg) decl TI_preempt_count(reg)
+#define XEN_GET_VCPU_INFO(reg) preempt_disable(%ebp) ; \
+ movl TI_cpu(%ebp),reg ; \
+ shl $sizeof_vcpu_shift,reg ; \
+ addl HYPERVISOR_shared_info,reg
+#define XEN_PUT_VCPU_INFO(reg) preempt_enable(%ebp)
+#define XEN_PUT_VCPU_INFO_fixup .byte 0xff,0xff,0xff
+#else
+#define XEN_GET_VCPU_INFO(reg) movl HYPERVISOR_shared_info,reg
+#define XEN_PUT_VCPU_INFO(reg)
+#define XEN_PUT_VCPU_INFO_fixup
+#endif
+
+#define XEN_LOCKED_BLOCK_EVENTS(reg) movb $1,evtchn_upcall_mask(reg)
+#define XEN_LOCKED_UNBLOCK_EVENTS(reg) movb $0,evtchn_upcall_mask(reg)
+#define XEN_BLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
+ XEN_LOCKED_BLOCK_EVENTS(reg) ; \
+ XEN_PUT_VCPU_INFO(reg)
+#define XEN_UNBLOCK_EVENTS(reg) XEN_GET_VCPU_INFO(reg) ; \
+ XEN_LOCKED_UNBLOCK_EVENTS(reg) ; \
+ XEN_PUT_VCPU_INFO(reg)
+#define XEN_TEST_PENDING(reg) testb $0xFF,evtchn_upcall_pending(reg)
+
#ifdef CONFIG_PREEMPT
-#define preempt_stop cli
+#define preempt_stop GET_THREAD_INFO(%ebp) ; \
+ XEN_BLOCK_EVENTS(%esi)
#else
#define preempt_stop
-#define resume_kernel restore_nocheck
+#define resume_kernel restore_all
#endif
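
These macros are the heart of the entry.S changes: every cli/sti pair becomes a write to the per-VCPU event mask byte in the shared-info page, and on SMP the XEN_GET_VCPU_INFO/XEN_PUT_VCPU_INFO bracket must disable preemption because the vcpu_info address it computes is per-CPU. In C the two core operations amount to nothing more than this (vcpu_info_t is the structure name assumed from the Xen public headers this patch includes):

    /* C equivalents of XEN_BLOCK_EVENTS / XEN_UNBLOCK_EVENTS */
    static inline void block_events(volatile vcpu_info_t *v)
    {
            v->evtchn_upcall_mask = 1;      /* like cli: hold event upcalls */
    }

    static inline void unblock_events(volatile vcpu_info_t *v)
    {
            v->evtchn_upcall_mask = 0;      /* like sti */
    }
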
#define SAVE_ALL \
@@ -123,6 +156,23 @@ VM_MASK = 0x00020000
.previous
+#define RESTORE_ALL \
+ RESTORE_REGS \
+ addl $4, %esp; \
+1: iret; \
+.section .fixup,"ax"; \
+2: movl $(__USER_DS), %edx; \
+ movl %edx, %ds; \
+ movl %edx, %es; \
+ movl $11,%eax; \
+ call do_exit; \
+.previous; \
+.section __ex_table,"a";\
+ .align 4; \
+ .long 1b,2b; \
+.previous
+
+
ENTRY(ret_from_fork)
pushl %eax
call schedule_tail
@@ -145,10 +195,10 @@ ret_from_intr:
GET_THREAD_INFO(%ebp)
movl EFLAGS(%esp), %eax # mix EFLAGS and CS
movb CS(%esp), %al
- testl $(VM_MASK | 3), %eax
- jz resume_kernel
+ testl $(VM_MASK | 2), %eax
+ jz resume_kernel # returning to kernel or vm86-space
ENTRY(resume_userspace)
- cli # make sure we don't miss an interrupt
+ XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -159,15 +209,15 @@ ENTRY(resume_userspace)
#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
- cli
+ XEN_BLOCK_EVENTS(%esi)
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
- jnz restore_nocheck
+ jnz restore_all
need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
jz restore_all
- testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ?
- jz restore_all
+ testb $0xFF,EVENT_MASK(%esp) # interrupts off (exception path) ?
+ jnz restore_all
call preempt_schedule_irq
jmp need_resched
#endif
@@ -202,8 +252,7 @@ sysenter_past_esp:
SAVE_ALL
GET_THREAD_INFO(%ebp)
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
+ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -227,8 +276,7 @@ ENTRY(system_call)
SAVE_ALL
GET_THREAD_INFO(%ebp)
# system call tracing in operation
- /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
- testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
+ testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
jnz syscall_trace_entry
cmpl $(nr_syscalls), %eax
jae syscall_badsys
@@ -236,63 +284,31 @@ syscall_call:
call *sys_call_table(,%eax,4)
movl %eax,EAX(%esp) # store the return value
syscall_exit:
- cli # make sure we don't miss an interrupt
+ XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx # current->work
jne syscall_exit_work
-
restore_all:
- movl EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
- # Warning: OLDSS(%esp) contains the wrong/random values if we
- # are returning to the kernel.
- # See comments in process.c:copy_thread() for details.
- movb OLDSS(%esp), %ah
- movb CS(%esp), %al
- andl $(VM_MASK | (4 << 8) | 3), %eax
- cmpl $((4 << 8) | 3), %eax
- je ldt_ss # returning to user-space with LDT SS
-restore_nocheck:
- RESTORE_REGS
- addl $4, %esp
-1: iret
-.section .fixup,"ax"
-iret_exc:
- sti
- pushl $0 # no error code
- pushl $do_iret_error
- jmp error_code
-.previous
-.section __ex_table,"a"
- .align 4
- .long 1b,iret_exc
-.previous
+ testl $VM_MASK, EFLAGS(%esp)
+ jnz resume_vm86
+ movb EVENT_MASK(%esp), %al
+ notb %al # %al == ~saved_mask
+ XEN_GET_VCPU_INFO(%esi)
+ andb evtchn_upcall_mask(%esi),%al
+ andb $1,%al # %al == mask & ~saved_mask
+ jnz restore_all_enable_events # != 0 => reenable event delivery
+ XEN_PUT_VCPU_INFO(%esi)
+ RESTORE_ALL
-ldt_ss:
- larl OLDSS(%esp), %eax
- jnz restore_nocheck
- testl $0x00400000, %eax # returning to 32bit stack?
- jnz restore_nocheck # allright, normal return
- /* If returning to userspace with 16bit stack,
- * try to fix the higher word of ESP, as the CPU
- * won't restore it.
- * This is an "official" bug of all the x86-compatible
- * CPUs, which we can try to work around to make
- * dosemu and wine happy. */
- subl $8, %esp # reserve space for switch16 pointer
- cli
- movl %esp, %eax
- /* Set up the 16bit stack frame with switch32 pointer on top,
- * and a switch16 pointer on top of the current frame. */
- call setup_x86_bogus_stack
+resume_vm86:
+ XEN_UNBLOCK_EVENTS(%esi)
RESTORE_REGS
- lss 20+4(%esp), %esp # switch to 16bit stack
-1: iret
-.section __ex_table,"a"
- .align 4
- .long 1b,iret_exc
-.previous
+ movl %eax,(%esp)
+ movl $__HYPERVISOR_switch_vm86,%eax
+ int $0x82
+ ud2
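
restore_all can no longer be a bare iret. The saved event mask is stashed at EVENT_MASK = 0x2E, i.e. in the otherwise-unused upper half of the saved CS slot, and the test above computes ~saved_mask & current_mask & 1: the slow path is taken only when the interrupted frame had events unmasked but they are masked now. Rendered in C (a sketch; the two callees are just labels for the two asm exit paths, and vcpu_info_t is the type assumed from the Xen headers):

    static void restore_all_in_c(unsigned char saved_event_mask,
                                 volatile vcpu_info_t *vcpu)
    {
            if (!saved_event_mask && vcpu->evtchn_upcall_mask)
                    restore_all_enable_events(); /* deliver anything pending */
            else
                    plain_restore_and_iret();    /* ordinary RESTORE_ALL */
    }

vm86 returns are different again: resume_vm86 bounces through the __HYPERVISOR_switch_vm86 hypercall, since the guest kernel cannot reload vm86 state itself.
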
# perform work that needs to be done immediately before resumption
ALIGN
@@ -301,7 +317,7 @@ work_pending:
jz work_notifysig
work_resched:
call schedule
- cli # make sure we don't miss an interrupt
+ XEN_BLOCK_EVENTS(%esi) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -348,7 +364,7 @@ syscall_trace_entry:
syscall_exit_work:
testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
jz work_pending
- sti # could let do_syscall_trace() call
+ XEN_UNBLOCK_EVENTS(%esi) # could let do_syscall_trace() call
# schedule() instead
movl %esp, %eax
movl $1, %edx
@@ -368,27 +384,7 @@ syscall_badsys:
movl $-ENOSYS,EAX(%esp)
jmp resume_userspace
-#define FIXUP_ESPFIX_STACK \
- movl %esp, %eax; \
- /* switch to 32bit stack using the pointer on top of 16bit stack */ \
- lss %ss:CPU_16BIT_STACK_SIZE-8, %esp; \
- /* copy data from 16bit stack to 32bit stack */ \
- call fixup_x86_bogus_stack; \
- /* put ESP to the proper location */ \
- movl %eax, %esp;
-#define UNWIND_ESPFIX_STACK \
- pushl %eax; \
- movl %ss, %eax; \
- /* see if on 16bit stack */ \
- cmpw $__ESPFIX_SS, %ax; \
- jne 28f; \
- movl $__KERNEL_DS, %edx; \
- movl %edx, %ds; \
- movl %edx, %es; \
- /* switch to 32bit stack */ \
- FIXUP_ESPFIX_STACK \
-28: popl %eax;
-
+#if 0 /* XEN */
/*
* Build the entry stubs and pointer table with
* some assembler magic.
@@ -426,6 +422,7 @@ ENTRY(name) \
/* The include is where all of the SMP etc. interrupts come from */
#include "entry_arch.h"
+#endif /* XEN */
ENTRY(divide_error)
pushl $0 # no error code
@@ -443,9 +440,7 @@ error_code:
pushl %ecx
pushl %ebx
cld
- pushl %es
- UNWIND_ESPFIX_STACK
- popl %ecx
+ movl %es, %ecx
movl ES(%esp), %edi # get the function address
movl ORIG_EAX(%esp), %edx # get the error code
movl %eax, ORIG_EAX(%esp)
@@ -457,6 +452,118 @@ error_code:
call *%edi
jmp ret_from_exception
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+ENTRY(hypervisor_callback)
+ pushl %eax
+ SAVE_ALL
+ movl EIP(%esp),%eax
+ cmpl $scrit,%eax
+ jb 11f
+ cmpl $ecrit,%eax
+ jb critical_region_fixup
+11: push %esp
+ call evtchn_do_upcall
+ add $4,%esp
+ jmp ret_from_intr
+
+ ALIGN
+restore_all_enable_events:
+ XEN_LOCKED_UNBLOCK_EVENTS(%esi)
+scrit: /**** START OF CRITICAL REGION ****/
+ XEN_TEST_PENDING(%esi)
+ jnz 14f # process more events if necessary...
+ XEN_PUT_VCPU_INFO(%esi)
+ RESTORE_ALL
+14: XEN_LOCKED_BLOCK_EVENTS(%esi)
+ XEN_PUT_VCPU_INFO(%esi)
+ jmp 11b
+ecrit: /**** END OF CRITICAL REGION ****/
+# [How we do the fixup]. We want to merge the current stack frame with the
+# just-interrupted frame. How we do this depends on where in the critical
+# region the interrupted handler was executing, and so how many saved
+# registers are in each frame. We do this quickly using the lookup table
+# 'critical_fixup_table'. For each byte offset in the critical region, it
+# provides the number of bytes which have already been popped from the
+# interrupted stack frame.
+critical_region_fixup:
+ addl $critical_fixup_table-scrit,%eax
+ movzbl (%eax),%eax # %eax contains num bytes popped
+ cmpb $0xff,%al # 0xff => vcpu_info critical region
+ jne 15f
+ GET_THREAD_INFO(%ebp)
+ XEN_PUT_VCPU_INFO(%esi) # abort vcpu_info critical region
+ xorl %eax,%eax
+15: mov %esp,%esi
+ add %eax,%esi # %esi points at end of src region
+ mov %esp,%edi
+ add $0x34,%edi # %edi points at end of dst region
+ mov %eax,%ecx
+ shr $2,%ecx # convert words to bytes
+ je 17f # skip loop if nothing to copy
+16: subl $4,%esi # pre-decrementing copy loop
+ subl $4,%edi
+ movl (%esi),%eax
+ movl %eax,(%edi)
+ loop 16b
+17: movl %edi,%esp # final %edi is top of merged stack
+ jmp 11b
+
+critical_fixup_table:
+ .byte 0xff,0xff,0xff # testb $0xff,(%esi) = XEN_TEST_PENDING
+ .byte 0xff,0xff # jnz 14f
+ XEN_PUT_VCPU_INFO_fixup
+ .byte 0x00 # pop %ebx
+ .byte 0x04 # pop %ecx
+ .byte 0x08 # pop %edx
+ .byte 0x0c # pop %esi
+ .byte 0x10 # pop %edi
+ .byte 0x14 # pop %ebp
+ .byte 0x18 # pop %eax
+ .byte 0x1c # pop %ds
+ .byte 0x20 # pop %es
+ .byte 0x24,0x24,0x24 # add $4,%esp
+ .byte 0x28 # iret
+ .byte 0xff,0xff,0xff,0xff # movb $1,1(%esi)
+ XEN_PUT_VCPU_INFO_fixup
+ .byte 0x00,0x00 # jmp 11b
+
+# Hypervisor uses this for application faults while it executes.
+ENTRY(failsafe_callback)
+1: popl %ds
+2: popl %es
+3: popl %fs
+4: popl %gs
+ subl $4,%esp
+ SAVE_ALL
+ jmp ret_from_exception
+.section .fixup,"ax"; \
+6: movl $0,(%esp); \
+ jmp 1b; \
+7: movl $0,(%esp); \
+ jmp 2b; \
+8: movl $0,(%esp); \
+ jmp 3b; \
+9: movl $0,(%esp); \
+ jmp 4b; \
+.previous; \
+.section __ex_table,"a";\
+ .align 4; \
+ .long 1b,6b; \
+ .long 2b,7b; \
+ .long 3b,8b; \
+ .long 4b,9b; \
+.previous
+
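The fixup machinery deserves a gloss: for every instruction byte between scrit and ecrit, critical_fixup_table records how many bytes the interrupted activation had already popped off its frame (0xff flags the vcpu_info bracket, which is simply aborted). critical_region_fixup then slides the current partial frame down over the interrupted one and restarts at 11:. The copy at labels 15:-17: is, in C (note, incidentally, that the "convert words to bytes" comment above has it backwards: the shr $2 converts bytes to words):

    /* merge the current frame into the interrupted one; 'popped' comes
     * from critical_fixup_table, esp is the handler's stack pointer */
    static void merge_frames(char *esp, unsigned int popped)
    {
            unsigned long *src = (unsigned long *)(esp + popped); /* src end */
            unsigned long *dst = (unsigned long *)(esp + 0x34);   /* dst end */
            unsigned int words = popped / 4;

            while (words--)
                    *--dst = *--src;   /* pre-decrementing copy, as at 16: */
            /* final dst is the top of the merged stack (label 17:) */
    }
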
ENTRY(coprocessor_error)
pushl $0
pushl $do_coprocessor_error
@@ -470,17 +577,9 @@ ENTRY(simd_coprocessor_error)
ENTRY(device_not_available)
pushl $-1 # mark this as an int
SAVE_ALL
- movl %cr0, %eax
- testl $0x4, %eax # EM (math emulation bit)
- jne device_not_available_emulate
preempt_stop
call math_state_restore
jmp ret_from_exception
-device_not_available_emulate:
- pushl $0 # temporary storage for ORIG_EIP
- call math_emulate
- addl $4, %esp
- jmp ret_from_exception
/*
* Debug traps and NMI can happen at the one SYSENTER instruction
@@ -516,6 +615,7 @@ debug_stack_correct:
call do_debug
jmp ret_from_exception
+#if 0 /* XEN */
/*
* NMI is doubly nasty. It can happen _while_ we're handling
* a debug fault, and the debug fault hasn't yet been able to
@@ -525,11 +625,6 @@ debug_stack_correct:
* fault happened on the sysenter path.
*/
ENTRY(nmi)
- pushl %eax
- movl %ss, %eax
- cmpw $__ESPFIX_SS, %ax
- popl %eax
- je nmi_16bit_stack
cmpl $sysenter_entry,(%esp)
je nmi_stack_fixup
pushl %eax
@@ -549,7 +644,7 @@ nmi_stack_correct:
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_nmi
- jmp restore_all
+ RESTORE_ALL
nmi_stack_fixup:
FIX_STACK(12,nmi_stack_correct, 1)
@@ -564,29 +659,7 @@ nmi_debug_stack_check:
nmi_debug_stack_fixup:
FIX_STACK(24,nmi_stack_correct, 1)
jmp nmi_stack_correct
-
-nmi_16bit_stack:
- /* create the pointer to lss back */
- pushl %ss
- pushl %esp
- movzwl %sp, %esp
- addw $4, (%esp)
- /* copy the iret frame of 12 bytes */
- .rept 3
- pushl 16(%esp)
- .endr
- pushl %eax
- SAVE_ALL
- FIXUP_ESPFIX_STACK # %eax == %esp
- xorl %edx,%edx # zero error code
- call do_nmi
- RESTORE_REGS
- lss 12+4(%esp), %esp # back to 16bit stack
-1: iret
-.section __ex_table,"a"
- .align 4
- .long 1b,iret_exc
-.previous
+#endif /* XEN */
ENTRY(int3)
pushl $-1 # mark this as an int
@@ -636,9 +709,33 @@ ENTRY(alignment_check)
pushl $do_alignment_check
jmp error_code
+# This handler is special, because it gets an extra value on its stack,
+# which is the linear faulting address.
+# fastcall register usage: %eax = pt_regs, %edx = error code,
+# %ecx = fault address
ENTRY(page_fault)
- pushl $do_page_fault
- jmp error_code
+ pushl %ds
+ pushl %eax
+ xorl %eax, %eax
+ pushl %ebp
+ pushl %edi
+ pushl %esi
+ pushl %edx
+ decl %eax /* eax = -1 */
+ pushl %ecx
+ pushl %ebx
+ cld
+ movl %es,%edi
+ movl ES(%esp), %ecx /* get the faulting address */
+ movl ORIG_EAX(%esp), %edx /* get the error code */
+ movl %eax, ORIG_EAX(%esp)
+ movl %edi, ES(%esp)
+ movl $(__KERNEL_DS),%eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %esp,%eax /* pt_regs pointer */
+ call do_page_fault
+ jmp ret_from_exception
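
Xen reports the faulting address on the exception frame rather than in %cr2 (which a non-privileged guest cannot read), so page_fault grows its own expansion of error_code that pulls the address out of the frame into %ecx. Per the register comment above, the matching C side would be declared fastcall (an assumed prototype, following the calling convention the patch states):

    /* fastcall: %eax = regs, %edx = error_code, %ecx = address */
    fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code,
                                unsigned long address);
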
#ifdef CONFIG_X86_MCE
ENTRY(machine_check)
@@ -647,9 +744,8 @@ ENTRY(machine_check)
jmp error_code
#endif
-ENTRY(spurious_interrupt_bug)
- pushl $0
- pushl $do_spurious_interrupt_bug
+ENTRY(fixup_4gb_segment)
+ pushl $do_fixup_4gb_segment
jmp error_code
#include "syscall_table.S"
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/head.S linux-2.6-xen-sparse/arch/i386/kernel/head.S
--- pristine-linux-2.6.12/arch/i386/kernel/head.S 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/head.S 2005-07-28 13:17:07.000000000 -0700
@@ -1,24 +1,25 @@
-/*
- * linux/arch/i386/kernel/head.S -- the 32-bit startup code.
- *
- * Copyright (C) 1991, 1992 Linus Torvalds
- *
- * Enhanced CPU detection and feature setting code by Mike Jagdis
- * and Martin Mares, November 1997.
- */
-.text
#include <linux/config.h>
+
+.section __xen_guest
+ .ascii "GUEST_OS=linux,GUEST_VER=2.6"
+ .ascii ",XEN_VER=3.0"
+ .ascii ",VIRT_BASE=0xC0000000"
+#ifdef CONFIG_X86_PAE
+ .ascii ",PAE=yes"
+#else
+ .ascii ",PAE=no"
+#endif
+ .ascii ",LOADER=generic"
+ .byte 0
+
+.text
#include <linux/threads.h>
#include <linux/linkage.h>
#include <asm/segment.h>
-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm_offsets.h>
-#include <asm/setup.h>
+#include <asm-xen/xen-public/arch-x86_32.h>
/*
* References to members of the new_cpu_data structure.
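
The new __xen_guest section replaces the real-mode boot protocol: a NUL-terminated, comma-separated key=value string that the domain builder reads from the ELF image to learn the guest ABI (guest OS and version, required Xen version, VIRT_BASE, PAE, loader type); objdump -s -j __xen_guest vmlinux will dump it. A hypothetical sketch of how a builder could pull one key out of the blob (guest_param is an invented name, purely illustrative):

    #include <string.h>

    /* e.g. guest_param(blob, "VIRT_BASE=") returns "0xC0000000,..." */
    static const char *guest_param(const char *blob, const char *key)
    {
            const char *p = strstr(blob, key);
            return p ? p + strlen(key) : NULL;
    }
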
@@ -33,239 +34,24 @@
#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
-/*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially. We need one bit for
- * each possible page, but only in low memory, which means
- * 2^32/4096/8 = 128K worst case (4G/4G split.)
- *
- * Modulo rounding, each megabyte assigned here requires a kilobyte of
- * memory, which is currently unreclaimed.
- *
- * This should be a multiple of a page.
- */
-#define INIT_MAP_BEYOND_END (128*1024)
-
-
-/*
- * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
- * %esi points to the real-mode code as a 32-bit pointer.
- * CS and DS must be 4 GB flat segments, but we don't depend on
- * any particular GDT layout, because we load our own as soon as we
- * can.
- */
ENTRY(startup_32)
-
-/*
- * Set segments to known values.
- */
cld
- lgdt boot_gdt_descr - __PAGE_OFFSET
- movl $(__BOOT_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- movl %eax,%fs
- movl %eax,%gs
-/*
- * Clear BSS first so that there are no surprises...
- * No need to cld as DF is already clear from cld above...
- */
- xorl %eax,%eax
- movl $__bss_start - __PAGE_OFFSET,%edi
- movl $__bss_stop - __PAGE_OFFSET,%ecx
- subl %edi,%ecx
- shrl $2,%ecx
- rep ; stosl
-
-/*
- * Initialize page tables. This creates a PDE and a set of page
- * tables, which are located immediately beyond _end. The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
- * Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
- *
- * Warning: don't use %esi or the stack in this code. However, %esp
- * can be used as a GPR if you really need it...
- */
-page_pde_offset = (__PAGE_OFFSET >> 20);
-
- movl $(pg0 - __PAGE_OFFSET), %edi
- movl $(swapper_pg_dir - __PAGE_OFFSET), %edx
- movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */
-10:
- leal 0x007(%edi),%ecx /* Create PDE entry */
- movl %ecx,(%edx) /* Store identity PDE entry */
- movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
- addl $4,%edx
- movl $1024, %ecx
-11:
- stosl
- addl $0x1000,%eax
- loop 11b
- /* End condition: we must map up to and including INIT_MAP_BEYOND_END */
- /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */
- leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp
- cmpl %ebp,%eax
- jb 10b
- movl %edi,(init_pg_tables_end - __PAGE_OFFSET)
+ /* Copy the necessary stuff from xen_start_info structure. */
+ mov $xen_start_info_union,%edi
+ mov $512,%ecx
+ rep movsl
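
All the deleted identity-mapping and boot-parameter shuffling is unnecessary because Xen enters the kernel with paging already enabled and %esi pointing at the start_info page; the four instructions above just copy 512 longs (2 KB) of it into xen_start_info_union. In C this is a single line (memcpy standing in for rep movsl):

    /* start_info = pointer handed over in %esi by the domain builder */
    memcpy(&xen_start_info_union, start_info, 512 * sizeof(long));  /* 2 KB */
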
#ifdef CONFIG_SMP
- xorl %ebx,%ebx /* This is the boot CPU (BSP) */
- jmp 3f
-
-/*
- * Non-boot CPU entry point; entered from trampoline.S
- * We can't lgdt here, because lgdt itself uses a data segment, but
- * we know the trampoline has already loaded the boot_gdt_table GDT
- * for us.
- */
ENTRY(startup_32_smp)
cld
- movl $(__BOOT_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- movl %eax,%fs
- movl %eax,%gs
-
-/*
- * New page tables may be in 4Mbyte page mode and may
- * be using the global pages.
- *
- * NOTE! If we are on a 486 we may have no cr4 at all!
- * So we do not try to touch it unless we really have
- * some bits in it to set. This won't work if the BSP
- * implements cr4 but this AP does not -- very unlikely
- * but be warned! The same applies to the pse feature
- * if not equally supported. --macro
- *
- * NOTE! We have to correct for the fact that we're
- * not yet offset PAGE_OFFSET..
- */
-#define cr4_bits mmu_cr4_features-__PAGE_OFFSET
- movl cr4_bits,%edx
- andl %edx,%edx
- jz 6f
- movl %cr4,%eax # Turn on paging options (PSE,PAE,..)
- orl %edx,%eax
- movl %eax,%cr4
-
- btl $5, %eax # check if PAE is enabled
- jnc 6f
-
- /* Check if extended functions are implemented */
- movl $0x80000000, %eax
- cpuid
- cmpl $0x80000000, %eax
- jbe 6f
- mov $0x80000001, %eax
- cpuid
- /* Execute Disable bit supported? */
- btl $20, %edx
- jnc 6f
-
- /* Setup EFER (Extended Feature Enable Register) */
- movl $0xc0000080, %ecx
- rdmsr
-
- btsl $11, %eax
- /* Make changes effective */
- wrmsr
-
-6:
- /* This is a secondary processor (AP) */
- xorl %ebx,%ebx
- incl %ebx
-
-3:
#endif /* CONFIG_SMP */
-/*
- * Enable paging
- */
- movl $swapper_pg_dir-__PAGE_OFFSET,%eax
- movl %eax,%cr3 /* set the page table pointer.. */
- movl %cr0,%eax
- orl $0x80000000,%eax
- movl %eax,%cr0 /* ..and set paging (PG) bit */
- ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
-1:
/* Set up the stack pointer */
lss stack_start,%esp
-/*
- * Initialize eflags. Some BIOS's leave bits like NT set. This would
- * confuse the debugger if this code is traced.
- * XXX - best to initialize before switching to protected mode.
- */
- pushl $0
- popfl
-
-#ifdef CONFIG_SMP
- andl %ebx,%ebx
- jz 1f /* Initial CPU cleans BSS */
- jmp checkCPUtype
-1:
-#endif /* CONFIG_SMP */
-
-/*
- * start system 32-bit setup. We need to re-do some of the things done
- * in 16-bit mode for the "real" operations.
- */
- call setup_idt
-
-/*
- * Copy bootup parameters out of the way.
- * Note: %esi still has the pointer to the real-mode data.
- */
- movl $boot_params,%edi
- movl $(PARAM_SIZE/4),%ecx
- cld
- rep
- movsl
- movl boot_params+NEW_CL_POINTER,%esi
- andl %esi,%esi
- jnz 2f # New command line protocol
- cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
- jne 1f
- movzwl OLD_CL_OFFSET,%esi
- addl $(OLD_CL_BASE_ADDR),%esi
-2:
- movl $saved_command_line,%edi
- movl $(COMMAND_LINE_SIZE/4),%ecx
- rep
- movsl
-1:
checkCPUtype:
- movl $-1,X86_CPUID # -1 for no CPUID initially
-
-/* check if it is 486 or 386. */
-/*
- * XXX - this does a lot of unnecessary setup. Alignment checks don't
- * apply at our cpl of 0 and the stack ought to be aligned already, and
- * we don't need to preserve eflags.
- */
-
- movb $3,X86 # at least 386
- pushfl # push EFLAGS
- popl %eax # get EFLAGS
- movl %eax,%ecx # save original EFLAGS
- xorl $0x240000,%eax # flip AC and ID bits in EFLAGS
- pushl %eax # copy to EFLAGS
- popfl # set EFLAGS
- pushfl # get new EFLAGS
- popl %eax # put it in eax
- xorl %ecx,%eax # change in flags
- pushl %ecx # restore original EFLAGS
- popfl
- testl $0x40000,%eax # check if AC bit changed
- je is386
-
- movb $4,X86 # at least 486
- testl $0x200000,%eax # check if ID bit changed
- je is486
-
/* get vendor info */
xorl %eax,%eax # call CPUID with 0 -> return vendor ID
cpuid
@@ -274,9 +60,6 @@ checkCPUtype:
movl %edx,X86_VENDOR_ID+4 # next 4 chars
movl %ecx,X86_VENDOR_ID+8 # last 4 chars
- orl %eax,%eax # do we have processor info as well?
- je is486
-
movl $1,%eax # Use the CPUID instruction to get CPU type
cpuid
movb %al,%cl # save reg for future use
@@ -289,32 +72,13 @@ checkCPUtype:
movb %cl,X86_MASK
movl %edx,X86_CAPABILITY
-is486: movl $0x50022,%ecx # set AM, WP, NE and MP
- jmp 2f
-
-is386: movl $2,%ecx # set MP
-2: movl %cr0,%eax
- andl $0x80000011,%eax # Save PG,PE,ET
- orl %ecx,%eax
- movl %eax,%cr0
-
- call check_x87
incb ready
- lgdt cpu_gdt_descr
- lidt idt_descr
- ljmp $(__KERNEL_CS),$1f
-1: movl $(__KERNEL_DS),%eax # reload all the segment registers
- movl %eax,%ss # after changing gdt.
-
- movl $(__USER_DS),%eax # DS/ES contains default USER segment
- movl %eax,%ds
- movl %eax,%es
xorl %eax,%eax # Clear FS/GS and LDT
movl %eax,%fs
movl %eax,%gs
- lldt %ax
cld # gcc2 wants the direction flag cleared at all times
+
#ifdef CONFIG_SMP
movb ready, %cl
cmpb $1,%cl
@@ -329,100 +93,18 @@ L6:
jmp L6 # main should never return here, but
# just in case, we know what happens.
-/*
- * We depend on ET to be correct. This checks for 287/387.
- */
-check_x87:
- movb $0,X86_HARD_MATH
- clts
- fninit
- fstsw %ax
- cmpb $0,%al
- je 1f
- movl %cr0,%eax /* no coprocessor: have to set bits */
- xorl $4,%eax /* set EM */
- movl %eax,%cr0
- ret
- ALIGN
-1: movb $1,X86_HARD_MATH
- .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */
- ret
+ENTRY(lgdt_finish)
+ movl $(__KERNEL_DS),%eax # reload all the segment registers
+ movw %ax,%ss # after changing gdt.
-/*
- * setup_idt
- *
- * sets up a idt with 256 entries pointing to
- * ignore_int, interrupt gates. It doesn't actually load
- * idt - that can be done only after paging has been enabled
- * and the kernel moved to PAGE_OFFSET. Interrupts
- * are enabled elsewhere, when we can be relatively
- * sure everything is ok.
- *
- * Warning: %esi is live across this function.
- */
-setup_idt:
- lea ignore_int,%edx
- movl $(__KERNEL_CS << 16),%eax
- movw %dx,%ax /* selector = 0x0010 = cs */
- movw $0x8E00,%dx /* interrupt gate - dpl=0, present */
-
- lea idt_table,%edi
- mov $256,%ecx
-rp_sidt:
- movl %eax,(%edi)
- movl %edx,4(%edi)
- addl $8,%edi
- dec %ecx
- jne rp_sidt
- ret
+ movl $(__USER_DS),%eax # DS/ES contains default USER segment
+ movw %ax,%ds
+ movw %ax,%es
-/* This is the default interrupt "handler" :-) */
- ALIGN
-ignore_int:
- cld
-#ifdef CONFIG_PRINTK
+ popl %eax # reload CS by intersegment return
+ pushl $(__KERNEL_CS)
pushl %eax
- pushl %ecx
- pushl %edx
- pushl %es
- pushl %ds
- movl $(__KERNEL_DS),%eax
- movl %eax,%ds
- movl %eax,%es
- pushl 16(%esp)
- pushl 24(%esp)
- pushl 32(%esp)
- pushl 40(%esp)
- pushl $int_msg
- call printk
- addl $(5*4),%esp
- popl %ds
- popl %es
- popl %edx
- popl %ecx
- popl %eax
-#endif
- iret
-
-/*
- * Real beginning of normal "text" segment
- */
-ENTRY(stext)
-ENTRY(_stext)
-
-/*
- * BSS section
- */
-.section ".bss.page_aligned","w"
-ENTRY(swapper_pg_dir)
- .fill 1024,4,0
-ENTRY(empty_zero_page)
- .fill 4096,1,0
-
-/*
- * This starts the data section.
- */
-.data
+ lret
ENTRY(stack_start)
.long init_thread_union+THREAD_SIZE
@@ -430,27 +112,10 @@ ENTRY(stack_start)
ready: .byte 0
-int_msg:
- .asciz "Unknown interrupt or fault at EIP %p %p %p\n"
-
-/*
- * The IDT and GDT 'descriptors' are a strange 48-bit object
- * only used by the lidt and lgdt instructions. They are not
- * like usual segment descriptors - they consist of a 16-bit
- * segment size, and 32-bit linear address value:
- */
-
-.globl boot_gdt_descr
.globl idt_descr
.globl cpu_gdt_descr
ALIGN
-# early boot GDT descriptor (must use 1:1 address mapping)
- .word 0 # 32 bit align gdt_desc.address
-boot_gdt_descr:
- .word __BOOT_DS+7
- .long boot_gdt_table - __PAGE_OFFSET
-
.word 0 # 32-bit align idt_desc.address
idt_descr:
.word IDT_ENTRIES*8-1 # idt contains 256 entries
@@ -459,25 +124,18 @@ idt_descr:
# boot GDT descriptor (later on used by CPU#0):
.word 0 # 32 bit align gdt_desc.address
cpu_gdt_descr:
- .word GDT_ENTRIES*8-1
+ .word GDT_SIZE
.long cpu_gdt_table
.fill NR_CPUS-1,8,0 # space for the other GDT descriptors
-/*
- * The boot_gdt_table must mirror the equivalent in setup.S and is
- * used only for booting.
- */
- .align L1_CACHE_BYTES
-ENTRY(boot_gdt_table)
- .fill GDT_ENTRY_BOOT_CS,8,0
- .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
- .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
+.org 0x1000
+ENTRY(empty_zero_page)
-/*
- * The Global Descriptor Table contains 28 quadwords, per-CPU.
- */
- .align PAGE_SIZE_asm
+.org 0x2000
+ENTRY(swapper_pg_dir)
+
+.org 0x3000
ENTRY(cpu_gdt_table)
.quad 0x0000000000000000 /* NULL descriptor */
.quad 0x0000000000000000 /* 0x0b reserved */
@@ -492,32 +150,49 @@ ENTRY(cpu_gdt_table)
.quad 0x0000000000000000 /* 0x53 reserved */
.quad 0x0000000000000000 /* 0x5b reserved */
- .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */
- .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */
- .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */
- .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */
+#ifdef CONFIG_X86_PAE
+ .quad 0x00cfbb00000067ff /* 0x60 kernel 4GB code at 0x00000000 */
+ .quad 0x00cfb300000067ff /* 0x68 kernel 4GB data at 0x00000000 */
+ .quad 0x00cffb00000067ff /* 0x73 user 4GB code at 0x00000000 */
+ .quad 0x00cff300000067ff /* 0x7b user 4GB data at 0x00000000 */
+#else
+ .quad 0x00cfbb000000c3ff /* 0x60 kernel 4GB code at 0x00000000 */
+ .quad 0x00cfb3000000c3ff /* 0x68 kernel 4GB data at 0x00000000 */
+ .quad 0x00cffb000000c3ff /* 0x73 user 4GB code at 0x00000000 */
+ .quad 0x00cff3000000c3ff /* 0x7b user 4GB data at 0x00000000 */
+#endif
.quad 0x0000000000000000 /* 0x80 TSS descriptor */
.quad 0x0000000000000000 /* 0x88 LDT descriptor */
/* Segments used for calling PnP BIOS */
- .quad 0x00c09a0000000000 /* 0x90 32-bit code */
- .quad 0x00809a0000000000 /* 0x98 16-bit code */
- .quad 0x0080920000000000 /* 0xa0 16-bit data */
- .quad 0x0080920000000000 /* 0xa8 16-bit data */
- .quad 0x0080920000000000 /* 0xb0 16-bit data */
+ .quad 0x0000000000000000 /* 0x90 32-bit code */
+ .quad 0x0000000000000000 /* 0x98 16-bit code */
+ .quad 0x0000000000000000 /* 0xa0 16-bit data */
+ .quad 0x0000000000000000 /* 0xa8 16-bit data */
+ .quad 0x0000000000000000 /* 0xb0 16-bit data */
/*
* The APM segments have byte granularity and their bases
* and limits are set at run time.
*/
- .quad 0x00409a0000000000 /* 0xb8 APM CS code */
- .quad 0x00009a0000000000 /* 0xc0 APM CS 16 code (16 bit) */
- .quad 0x0040920000000000 /* 0xc8 APM DS data */
+ .quad 0x0000000000000000 /* 0xb8 APM CS code */
+ .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */
+ .quad 0x0000000000000000 /* 0xc8 APM DS data */
- .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */
+ .quad 0x0000000000000000 /* 0xd0 - unused */
.quad 0x0000000000000000 /* 0xd8 - unused */
.quad 0x0000000000000000 /* 0xe0 - unused */
.quad 0x0000000000000000 /* 0xe8 - unused */
.quad 0x0000000000000000 /* 0xf0 - unused */
.quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */
+ .fill GDT_ENTRIES-32,8,0
+.org 0x4000
+ENTRY(default_ldt)
+
+.org 0x5000
+/*
+ * Real beginning of normal "text" segment
+ */
+ENTRY(stext)
+ENTRY(_stext)
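
A note for anyone eyeballing the magic numbers in the cpu_gdt_table hunk
above: the access byte of the four flat segments changes from 9a/92/fa/f2
to bb/b3/fb/f3 (DPL 0 -> DPL 1, since the guest kernel now runs in ring 1
under Xen), and the limit field shrinks from 0xfffff pages so that flat
segments no longer reach the region Xen reserves at the top of the linear
address space. If you want to check the encoding yourself, here is a small
standalone decoder; it is a sanity-check utility, not part of the patch:

    /* desc.c - decode x86 segment descriptor quads (sanity check only).
     * Build: gcc -o desc desc.c
     */
    #include <stdio.h>
    #include <stdint.h>

    static void decode(uint64_t d)
    {
        uint32_t limit = (d & 0xffff) | ((d >> 32) & 0xf0000);
        uint32_t base  = ((d >> 16) & 0xffffff) | ((d >> 32) & 0xff000000);
        unsigned dpl   = (unsigned)((d >> 45) & 3);
        int      gran  = (int)((d >> 55) & 1);  /* G: limit is in 4K pages */
        uint64_t bytes = gran ? ((uint64_t)limit + 1) << 12
                              : (uint64_t)limit + 1;

        printf("%016llx: base=%08x dpl=%u size=%lluMB\n",
               (unsigned long long)d, base, dpl,
               (unsigned long long)(bytes >> 20));
    }

    int main(void)
    {
        decode(0x00cf9a000000ffffULL); /* native kernel code: dpl=0, 4096MB */
        decode(0x00cfbb000000c3ffULL); /* Xen kernel code:    dpl=1, 4036MB */
        return 0;
    }

The fixed .org offsets (empty_zero_page at 0x1000, swapper_pg_dir at
0x2000, cpu_gdt_table at 0x3000, default_ldt at 0x4000) make the early
boot layout explicit rather than linker-chosen, which is presumably what
the domain builder relies on.
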
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/i386_ksyms.c
linux-2.6-xen-sparse/arch/i386/kernel/i386_ksyms.c
--- pristine-linux-2.6.12/arch/i386/kernel/i386_ksyms.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/i386_ksyms.c 2005-07-28
13:17:07.000000000 -0700
@@ -76,7 +76,9 @@ EXPORT_SYMBOL(ioremap_nocache);
EXPORT_SYMBOL(iounmap);
EXPORT_SYMBOL(kernel_thread);
EXPORT_SYMBOL(pm_idle);
+#ifdef CONFIG_ACPI_BOOT
EXPORT_SYMBOL(pm_power_off);
+#endif
EXPORT_SYMBOL(get_cmos_time);
EXPORT_SYMBOL(cpu_khz);
EXPORT_SYMBOL(apm_info);
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/io_apic.c
linux-2.6-xen-sparse/arch/i386/kernel/io_apic.c
--- pristine-linux-2.6.12/arch/i386/kernel/io_apic.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/io_apic.c 2005-07-28
13:17:07.000000000 -0700
@@ -42,6 +42,48 @@
#include "io_ports.h"
+#ifdef CONFIG_XEN
+
+#include <asm-xen/xen-public/xen.h>
+#include <asm-xen/xen-public/physdev.h>
+
+/* Fake i8259 */
+#define make_8259A_irq(_irq) (io_apic_irqs &= ~(1UL<<(_irq)))
+#define disable_8259A_irq(_irq) ((void)0)
+#define i8259A_irq_pending(_irq) (0)
+
+unsigned long io_apic_irqs;
+
+static inline unsigned int xen_io_apic_read(unsigned int apic, unsigned int reg)
+{
+ physdev_op_t op;
+ int ret;
+
+ op.cmd = PHYSDEVOP_APIC_READ;
+ op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid;
+ op.u.apic_op.offset = reg;
+ ret = HYPERVISOR_physdev_op(&op);
+ if (ret)
+ return ret;
+ return op.u.apic_op.value;
+}
+
+static inline void xen_io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
+{
+ physdev_op_t op;
+
+ op.cmd = PHYSDEVOP_APIC_WRITE;
+ op.u.apic_op.apic = mp_ioapics[apic].mpc_apicid;
+ op.u.apic_op.offset = reg;
+ op.u.apic_op.value = value;
+ HYPERVISOR_physdev_op(&op);
+}
+
+#define io_apic_read(a,r) xen_io_apic_read(a,r)
+#define io_apic_write(a,r,v) xen_io_apic_write(a,r,v)
+
+#endif /* CONFIG_XEN */
+
int (*ioapic_renumber_irq)(int ioapic, int irq);
atomic_t irq_mis_count;
@@ -107,6 +149,7 @@ static void add_pin_to_irq(unsigned int
entry->pin = pin;
}
+#ifndef CONFIG_XEN
/*
* Reroute an IRQ to a different pin.
*/
@@ -243,6 +286,9 @@ static void set_ioapic_affinity_irq(unsi
}
spin_unlock_irqrestore(&ioapic_lock, flags);
}
+#else
+#define clear_IO_APIC() ((void)0)
+#endif
#if defined(CONFIG_IRQBALANCE)
# include <asm/processor.h> /* kernel_thread() */
@@ -664,6 +710,7 @@ static inline void move_irq(int irq) { }
#ifndef CONFIG_SMP
void fastcall send_IPI_self(int vector)
{
+#ifndef CONFIG_XEN
unsigned int cfg;
/*
@@ -675,6 +722,7 @@ void fastcall send_IPI_self(int vector)
* Send the IPI. The write to APIC_ICR fires this off.
*/
apic_write_around(APIC_ICR, cfg);
+#endif
}
#endif /* !CONFIG_SMP */
@@ -744,6 +792,7 @@ static int find_irq_entry(int apic, int
return -1;
}
+#ifndef CONFIG_XEN
/*
* Find the pin to which IRQ[irq] (ISA) is connected
*/
@@ -766,6 +815,7 @@ static int find_isa_irq_pin(int irq, int
}
return -1;
}
+#endif
/*
* Find a specific PCI IRQ entry.
@@ -813,6 +863,7 @@ int IO_APIC_get_PCI_irq_vector(int bus,
return best_guess;
}
+#ifndef CONFIG_XEN
/*
* This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
@@ -836,6 +887,7 @@ void __init setup_ioapic_dest(void)
}
}
+#endif /* !CONFIG_XEN */
/*
* EISA Edge/Level control register, ELCR
@@ -1125,26 +1177,22 @@ static inline int IO_APIC_irq_trigger(in
}
/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
+u8 irq_vector[NR_IRQ_VECTORS]; /* = { FIRST_DEVICE_VECTOR , 0 }; */
int assign_irq_vector(int irq)
{
- static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
+ static int current_vector = FIRST_DEVICE_VECTOR;
+ physdev_op_t op;
BUG_ON(irq >= NR_IRQ_VECTORS);
if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
return IO_APIC_VECTOR(irq);
-next:
- current_vector += 8;
- if (current_vector == SYSCALL_VECTOR)
- goto next;
-
- if (current_vector >= FIRST_SYSTEM_VECTOR) {
- offset++;
- if (!(offset%8))
- return -ENOSPC;
- current_vector = FIRST_DEVICE_VECTOR + offset;
- }
+
+ op.cmd = PHYSDEVOP_ASSIGN_VECTOR;
+ op.u.irq_op.irq = irq;
+ if (HYPERVISOR_physdev_op(&op))
+ return -ENOSPC;
+ current_vector = op.u.irq_op.vector;
vector_irq[current_vector] = irq;
if (irq != AUTO_ASSIGN)
@@ -1153,6 +1201,7 @@ next:
return current_vector;
}
+#ifndef CONFIG_XEN
static struct hw_interrupt_type ioapic_level_type;
static struct hw_interrupt_type ioapic_edge_type;
@@ -1178,6 +1227,9 @@ static inline void ioapic_register_intr(
set_intr_gate(vector, interrupt[irq]);
}
}
+#else
+#define ioapic_register_intr(_irq,_vector,_trigger) ((void)0)
+#endif
static void __init setup_IO_APIC_irqs(void)
{
@@ -1233,7 +1285,7 @@ static void __init setup_IO_APIC_irqs(vo
else
add_pin_to_irq(irq, apic, pin);
- if (!apic && !IO_APIC_IRQ(irq))
+ if (/*!apic &&*/ !IO_APIC_IRQ(irq))
continue;
if (IO_APIC_IRQ(irq)) {
@@ -1258,6 +1310,7 @@ static void __init setup_IO_APIC_irqs(vo
/*
* Set up the 8259A-master output pin:
*/
+#ifndef CONFIG_XEN
static void __init setup_ExtINT_IRQ0_pin(unsigned int pin, int vector)
{
struct IO_APIC_route_entry entry;
@@ -1452,8 +1505,6 @@ void __init print_IO_APIC(void)
return;
}
-#if 0
-
static void print_APIC_bitfield (int base)
{
unsigned int v;
@@ -1595,8 +1646,9 @@ void /*__init*/ print_PIC(void)
v = inb(0x4d1) << 8 | inb(0x4d0);
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
-
-#endif /* 0 */
+#else
+void __init print_IO_APIC(void) { }
+#endif /* !CONFIG_XEN */
static void __init enable_IO_APIC(void)
{
@@ -1638,7 +1690,9 @@ void disable_IO_APIC(void)
*/
clear_IO_APIC();
+#ifndef CONFIG_XEN
disconnect_bsp_APIC();
+#endif
}
/*
@@ -1648,7 +1702,7 @@ void disable_IO_APIC(void)
* by Matt Domsch <Matt_Domsch@xxxxxxxx> Tue Dec 21 12:25:05 CST 1999
*/
-#ifndef CONFIG_X86_NUMAQ
+#if !defined(CONFIG_XEN) && !defined(CONFIG_X86_NUMAQ)
static void __init setup_ioapic_ids_from_mpc(void)
{
union IO_APIC_reg_00 reg_00;
@@ -1755,6 +1809,7 @@ static void __init setup_ioapic_ids_from
static void __init setup_ioapic_ids_from_mpc(void) { }
#endif
+#ifndef CONFIG_XEN
/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
@@ -1979,6 +2034,7 @@ static struct hw_interrupt_type ioapic_l
.end = end_level_ioapic,
.set_affinity = set_ioapic_affinity,
};
+#endif /* !CONFIG_XEN */
static inline void init_IO_APIC_traps(void)
{
@@ -2010,13 +2066,16 @@ static inline void init_IO_APIC_traps(vo
*/
if (irq < 16)
make_8259A_irq(irq);
+#ifndef CONFIG_XEN
else
/* Strange. Oh, well.. */
irq_desc[irq].handler = &no_irq_type;
+#endif
}
}
}
+#ifndef CONFIG_XEN
static void enable_lapic_irq (unsigned int irq)
{
unsigned long v;
@@ -2243,6 +2302,9 @@ static inline void check_timer(void)
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option");
}
+#else
+#define check_timer() ((void)0)
+#endif
/*
*
@@ -2269,7 +2331,9 @@ void __init setup_IO_APIC(void)
*/
if (!acpi_ioapic)
setup_ioapic_ids_from_mpc();
+#ifndef CONFIG_XEN
sync_Arb_IDs();
+#endif
setup_IO_APIC_irqs();
init_IO_APIC_traps();
check_timer();
@@ -2391,6 +2455,7 @@ device_initcall(ioapic_init_sysfs);
int __init io_apic_get_unique_id (int ioapic, int apic_id)
{
+#ifndef CONFIG_XEN
union IO_APIC_reg_00 reg_00;
static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
physid_mask_t tmp;
@@ -2457,6 +2522,7 @@ int __init io_apic_get_unique_id (int io
apic_printk(APIC_VERBOSE, KERN_INFO
"IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+#endif /* !CONFIG_XEN */
return apic_id;
}
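
The recurring pattern in this file is worth calling out: every direct APIC
register access becomes "fill in a physdev_op_t, call
HYPERVISOR_physdev_op(), read the result back out of the union", with the
hypervisor doing the actual MMIO. (One wart in passing: on failure
xen_io_apic_read() hands the negative error code back through its unsigned
return value, so callers can't tell it from register contents.) As a
sketch of how a read-modify-write looks in this scheme, using only the
interfaces visible above; illustrative, not code from the tree:

    #include <asm-xen/xen-public/physdev.h>

    /* Read-modify-write one IO-APIC register via the hypervisor proxy.
     * apic_id is the mpc_apicid, exactly as in xen_io_apic_read() above. */
    static int ioapic_rmw(unsigned int apic_id, unsigned int reg,
                          unsigned int clear, unsigned int set)
    {
        physdev_op_t op;
        int ret;

        op.cmd = PHYSDEVOP_APIC_READ;
        op.u.apic_op.apic = apic_id;
        op.u.apic_op.offset = reg;
        ret = HYPERVISOR_physdev_op(&op);
        if (ret)
            return ret;                   /* hypervisor refused the access */

        /* apic/offset are still set up from the read */
        op.cmd = PHYSDEVOP_APIC_WRITE;
        op.u.apic_op.value = (op.u.apic_op.value & ~clear) | set;
        return HYPERVISOR_physdev_op(&op);
    }
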
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/ioport.c
linux-2.6-xen-sparse/arch/i386/kernel/ioport.c
--- pristine-linux-2.6.12/arch/i386/kernel/ioport.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/ioport.c 2005-07-28
13:17:07.000000000 -0700
@@ -15,6 +15,7 @@
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/thread_info.h>
+#include <asm-xen/xen-public/physdev.h>
/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value)
@@ -56,10 +57,9 @@ static void set_bitmap(unsigned long *bi
*/
asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
{
- unsigned long i, max_long, bytes, bytes_updated;
struct thread_struct * t = &current->thread;
- struct tss_struct * tss;
unsigned long *bitmap;
+ physdev_op_t op;
if ((from + num <= from) || (from + num > IO_BITMAP_BITS))
return -EINVAL;
@@ -78,41 +78,15 @@ asmlinkage long sys_ioperm(unsigned long
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
- }
- /*
- * do it in the per-thread copy and in the TSS ...
- *
- * Disable preemption via get_cpu() - we must not switch away
- * because the ->io_bitmap_max value must match the bitmap
- * contents:
- */
- tss = &per_cpu(init_tss, get_cpu());
+ op.cmd = PHYSDEVOP_SET_IOBITMAP;
+ op.u.set_iobitmap.bitmap = (unsigned long)bitmap;
+ op.u.set_iobitmap.nr_ports = IO_BITMAP_BITS;
+ HYPERVISOR_physdev_op(&op);
+ }
set_bitmap(t->io_bitmap_ptr, from, num, !turn_on);
- /*
- * Search for a (possibly new) maximum. This is simple and stupid,
- * to keep it obviously correct:
- */
- max_long = 0;
- for (i = 0; i < IO_BITMAP_LONGS; i++)
- if (t->io_bitmap_ptr[i] != ~0UL)
- max_long = i;
-
- bytes = (max_long + 1) * sizeof(long);
- bytes_updated = max(bytes, t->io_bitmap_max);
-
- t->io_bitmap_max = bytes;
-
- /*
- * Sets the lazy trigger so that the next I/O operation will
- * reload the correct bitmap.
- */
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
-
- put_cpu();
-
return 0;
}
@@ -127,21 +101,29 @@ asmlinkage long sys_ioperm(unsigned long
* code.
*/
-asmlinkage long sys_iopl(unsigned long unused)
+asmlinkage long sys_iopl(unsigned int new_io_pl)
{
- volatile struct pt_regs * regs = (struct pt_regs *) &unused;
- unsigned int level = regs->ebx;
- unsigned int old = (regs->eflags >> 12) & 3;
+ unsigned int old_io_pl = current->thread.io_pl;
+ physdev_op_t op;
- if (level > 3)
+ if (new_io_pl > 3)
return -EINVAL;
- /* Trying to gain more privileges? */
- if (level > old) {
- if (!capable(CAP_SYS_RAWIO))
- return -EPERM;
- }
- regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
- /* Make sure we return the long way (not sysenter) */
- set_thread_flag(TIF_IRET);
+
+ /* Need "raw I/O" privileges for direct port access. */
+ if ((new_io_pl > old_io_pl) && !capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ /* Maintain OS privileges even if user attempts to relinquish them. */
+ if (new_io_pl == 0)
+ new_io_pl = 1;
+
+ /* Change our version of the privilege levels. */
+ current->thread.io_pl = new_io_pl;
+
+ /* Force the change at ring 0. */
+ op.cmd = PHYSDEVOP_SET_IOPL;
+ op.u.set_iopl.iopl = new_io_pl;
+ HYPERVISOR_physdev_op(&op);
+
return 0;
}
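
The net effect of the ioport.c changes: the TSS bitmap juggling is gone,
and both ioperm() and iopl() become per-thread state that the hypervisor
enforces (with iopl(0) quietly promoted to 1 so the kernel side keeps its
port access). The user-visible API is unchanged, so the usual userspace
exercise still works; minimal example, assuming an x86 glibc, run as root:

    /* port.c - drive the Xen-mediated ioperm/iopl paths.
     * Build: gcc -O2 -o port port.c
     */
    #include <stdio.h>
    #include <sys/io.h>

    int main(void)
    {
        /* sys_ioperm -> PHYSDEVOP_SET_IOBITMAP: RTC index/data ports */
        if (ioperm(0x70, 2, 1)) {
            perror("ioperm");
            return 1;
        }
        outb(0x00, 0x70);                /* select RTC seconds register */
        printf("RTC seconds: %02x\n", inb(0x71));

        /* sys_iopl -> PHYSDEVOP_SET_IOPL: all ports at once */
        if (iopl(3))
            perror("iopl");
        return 0;
    }
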
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/irq.c
linux-2.6-xen-sparse/arch/i386/kernel/irq.c
--- pristine-linux-2.6.12/arch/i386/kernel/irq.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/irq.c 2005-07-28 13:17:07.000000000
-0700
@@ -15,6 +15,9 @@
#include <linux/seq_file.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/delay.h>
DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp;
EXPORT_PER_CPU_SYMBOL(irq_stat);
@@ -51,7 +54,7 @@ static union irq_ctx *softirq_ctx[NR_CPU
fastcall unsigned int do_IRQ(struct pt_regs *regs)
{
/* high bits used in ret_from_ code */
- int irq = regs->orig_eax & 0xff;
+ int irq = regs->orig_eax & __IRQ_MASK(HARDIRQ_BITS);
#ifdef CONFIG_4KSTACKS
union irq_ctx *curctx, *irqctx;
u32 *isp;
@@ -210,9 +213,8 @@ int show_interrupts(struct seq_file *p,
if (i == 0) {
seq_printf(p, " ");
- for (j=0; j<NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "CPU%d ",j);
+ for_each_cpu(j)
+ seq_printf(p, "CPU%d ",j);
seq_putc(p, '\n');
}
@@ -225,9 +227,8 @@ int show_interrupts(struct seq_file *p,
#ifndef CONFIG_SMP
seq_printf(p, "%10u ", kstat_irqs(i));
#else
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
+ for_each_cpu(j)
+ seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
#endif
seq_printf(p, " %14s", irq_desc[i].handler->typename);
seq_printf(p, " %s", action->name);
@@ -240,16 +241,13 @@ skip:
spin_unlock_irqrestore(&irq_desc[i].lock, flags);
} else if (i == NR_IRQS) {
seq_printf(p, "NMI: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ", nmi_count(j));
+ for_each_cpu(j)
+ seq_printf(p, "%10u ", nmi_count(j));
seq_putc(p, '\n');
#ifdef CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
- for (j = 0; j < NR_CPUS; j++)
- if (cpu_online(j))
- seq_printf(p, "%10u ",
- per_cpu(irq_stat,j).apic_timer_irqs);
+ for_each_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(irq_stat,
j).apic_timer_irqs);
seq_putc(p, '\n');
#endif
seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
@@ -259,3 +257,43 @@ skip:
}
return 0;
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+void fixup_irqs(cpumask_t map)
+{
+ unsigned int irq;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ cpumask_t mask;
+ if (irq == 2)
+ continue;
+
+ cpus_and(mask, irq_affinity[irq], map);
+ if (any_online_cpu(mask) == NR_CPUS) {
+ printk("Breaking affinity for irq %i\n", irq);
+ mask = map;
+ }
+ if (irq_desc[irq].handler->set_affinity)
+ irq_desc[irq].handler->set_affinity(irq, mask);
+ else if (irq_desc[irq].action)
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+#if 0
+ barrier();
+ /* Ingo Molnar says: "after the IO-APIC masks have been redirected
+ [note the nop - the interrupt-enable boundary on x86 is two
+ instructions from sti] - to flush out pending hardirqs and
+ IPIs. After this point nothing is supposed to reach this CPU." */
+ __asm__ __volatile__("sti; nop; cli");
+ barrier();
+#else
+ /* That doesn't seem sufficient. Give it 1ms. */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+#endif
+}
+#endif
+
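
Two small things in the irq.c hunk deserve a comment. The do_IRQ() change
widens the IRQ number extracted from orig_eax: 0xff caps it at 255, which
is too small once dynamically-bound event channels come into play, while
__IRQ_MASK(HARDIRQ_BITS) tracks whatever the kernel was configured for.
A quick demonstration (the macro body and the HARDIRQ_BITS value of 12 are
copied from my reading of the 2.6.12-era i386 headers):

    /* mask.c - why masking with 0xff loses high IRQ numbers. */
    #include <stdio.h>

    #define HARDIRQ_BITS 12                  /* i386 default in this era */
    #define __IRQ_MASK(x) ((1UL << (x)) - 1)

    int main(void)
    {
        unsigned long orig_eax = 0x123;      /* event-channel irq 291 */

        printf("old: irq=%lu\n", orig_eax & 0xff);                     /* 35 */
        printf("new: irq=%lu\n", orig_eax & __IRQ_MASK(HARDIRQ_BITS)); /* 291 */
        return 0;
    }

The for_each_cpu() conversions in show_interrupts() are a straight cleanup
of the open-coded NR_CPUS/cpu_online() loops, and fixup_irqs() is the
CPU-hotplug helper that migrates interrupt affinity off a departing CPU.
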
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/ldt.c
linux-2.6-xen-sparse/arch/i386/kernel/ldt.c
--- pristine-linux-2.6.12/arch/i386/kernel/ldt.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/ldt.c 2005-07-28 13:17:07.000000000
-0700
@@ -18,6 +18,7 @@
#include <asm/system.h>
#include <asm/ldt.h>
#include <asm/desc.h>
+#include <asm/mmu_context.h>
#ifdef CONFIG_SMP /* avoids "defined but not used" warning */
static void flush_ldt(void *null)
@@ -58,16 +59,20 @@ static int alloc_ldt(mm_context_t *pc, i
#ifdef CONFIG_SMP
cpumask_t mask;
preempt_disable();
+#endif
+ make_pages_readonly(pc->ldt, (pc->size * LDT_ENTRY_SIZE) /
+ PAGE_SIZE);
load_LDT(pc);
+#ifdef CONFIG_SMP
mask = cpumask_of_cpu(smp_processor_id());
if (!cpus_equal(current->mm->cpu_vm_mask, mask))
smp_call_function(flush_ldt, NULL, 1, 1);
preempt_enable();
-#else
- load_LDT(pc);
#endif
}
if (oldsize) {
+ make_pages_writable(oldldt, (oldsize * LDT_ENTRY_SIZE) /
+ PAGE_SIZE);
if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(oldldt);
else
@@ -82,6 +87,8 @@ static inline int copy_ldt(mm_context_t
if (err < 0)
return err;
memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE);
+ make_pages_readonly(new->ldt, (new->size * LDT_ENTRY_SIZE) /
+ PAGE_SIZE);
return 0;
}
@@ -94,14 +101,19 @@ int init_new_context(struct task_struct
struct mm_struct * old_mm;
int retval = 0;
+ memset(&mm->context, 0, sizeof(mm->context));
init_MUTEX(&mm->context.sem);
- mm->context.size = 0;
old_mm = current->mm;
if (old_mm && old_mm->context.size > 0) {
down(&old_mm->context.sem);
retval = copy_ldt(&mm->context, &old_mm->context);
up(&old_mm->context.sem);
}
+ if (retval == 0) {
+ spin_lock(&mm_unpinned_lock);
+ list_add(&mm->context.unpinned, &mm_unpinned);
+ spin_unlock(&mm_unpinned_lock);
+ }
return retval;
}
@@ -113,12 +125,20 @@ void destroy_context(struct mm_struct *m
if (mm->context.size) {
if (mm == current->active_mm)
clear_LDT();
+ make_pages_writable(mm->context.ldt,
+ (mm->context.size * LDT_ENTRY_SIZE) /
+ PAGE_SIZE);
if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE)
vfree(mm->context.ldt);
else
kfree(mm->context.ldt);
mm->context.size = 0;
}
+ if (!mm->context.pinned) {
+ spin_lock(&mm_unpinned_lock);
+ list_del(&mm->context.unpinned);
+ spin_unlock(&mm_unpinned_lock);
+ }
}
static int read_ldt(void __user * ptr, unsigned long bytecount)
@@ -178,6 +198,7 @@ static int write_ldt(void __user * ptr,
{
struct mm_struct * mm = current->mm;
__u32 entry_1, entry_2, *lp;
+ unsigned long mach_lp;
int error;
struct user_desc ldt_info;
@@ -206,6 +227,7 @@ static int write_ldt(void __user * ptr,
}
lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
+ mach_lp = arbitrary_virt_to_machine(lp);
/* Allow LDTs to be cleared by the user. */
if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
@@ -223,9 +245,7 @@ static int write_ldt(void __user * ptr,
/* Install the new entry ... */
install:
- *lp = entry_1;
- *(lp+1) = entry_2;
- error = 0;
+ error = HYPERVISOR_update_descriptor(mach_lp, entry_1, entry_2);
out_unlock:
up(&mm->context.sem);
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/Makefile
linux-2.6-xen-sparse/arch/i386/kernel/Makefile
--- pristine-linux-2.6.12/arch/i386/kernel/Makefile 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/Makefile 2005-07-28
13:17:07.000000000 -0700
@@ -2,41 +2,52 @@
# Makefile for the linux kernel.
#
-extra-y := head.o init_task.o vmlinux.lds
+XENARCH := $(subst ",,$(CONFIG_XENARCH))
-obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
- ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
- pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
- doublefault.o quirks.o
+CFLAGS += -Iarch/$(XENARCH)/kernel
+
+extra-y := head.o init_task.o
+
+obj-y := process.o signal.o entry.o traps.o \
+ time.o ioport.o ldt.o setup.o \
+ pci-dma.o i386_ksyms.o irq.o quirks.o
+
+c-obj-y := semaphore.o vm86.o \
+ ptrace.o sys_i386.o \
+ i387.o dmi_scan.o bootflag.o \
+ doublefault.o
+s-obj-y :=
obj-y += cpu/
-obj-y += timers/
+#obj-y += timers/
obj-$(CONFIG_ACPI_BOOT) += acpi/
-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
-obj-$(CONFIG_MCA) += mca.o
-obj-$(CONFIG_X86_MSR) += msr.o
-obj-$(CONFIG_X86_CPUID) += cpuid.o
+#c-obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
+c-obj-$(CONFIG_MCA) += mca.o
+c-obj-$(CONFIG_X86_MSR) += msr.o
+c-obj-$(CONFIG_X86_CPUID) += cpuid.o
obj-$(CONFIG_MICROCODE) += microcode.o
-obj-$(CONFIG_APM) += apm.o
+c-obj-$(CONFIG_APM) += apm.o
obj-$(CONFIG_X86_SMP) += smp.o smpboot.o
-obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
+#obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o
+c-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
-obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o
-obj-$(CONFIG_X86_NUMAQ) += numaq.o
-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o
-obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_MODULES) += module.o
-obj-y += sysenter.o vsyscall.o
-obj-$(CONFIG_ACPI_SRAT) += srat.o
-obj-$(CONFIG_HPET_TIMER) += time_hpet.o
-obj-$(CONFIG_EFI) += efi.o efi_stub.o
-obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+c-obj-$(CONFIG_X86_REBOOTFIXUPS)+= reboot_fixups.o
+c-obj-$(CONFIG_X86_NUMAQ) += numaq.o
+c-obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o
+c-obj-$(CONFIG_MODULES) += module.o
+c-obj-y += sysenter.o
+obj-y += vsyscall.o
+c-obj-$(CONFIG_ACPI_SRAT) += srat.o
+c-obj-$(CONFIG_HPET_TIMER) += time_hpet.o
+c-obj-$(CONFIG_EFI) += efi.o efi_stub.o
+c-obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+c-obj-$(CONFIG_SMP_ALTERNATIVES)+= smpalts.o
EXTRA_AFLAGS := -traditional
-obj-$(CONFIG_SCx200) += scx200.o
+c-obj-$(CONFIG_SCx200) += scx200.o
# vsyscall.o contains the vsyscall DSO images as __initdata.
# We must build both images before we can assemble it.
@@ -58,7 +69,7 @@ SYSCFLAGS_vsyscall-int80.so = $(vsyscall
$(obj)/vsyscall-int80.so $(obj)/vsyscall-sysenter.so: \
$(obj)/vsyscall-%.so: $(src)/vsyscall.lds \
- $(obj)/vsyscall-%.o $(obj)/vsyscall-note.o FORCE
+ $(obj)/vsyscall-%.o FORCE
$(call if_changed,syscall)
# We also create a special relocatable object that should mirror the symbol
@@ -70,5 +81,21 @@ $(obj)/built-in.o: ld_flags += -R $(obj)
SYSCFLAGS_vsyscall-syms.o = -r
$(obj)/vsyscall-syms.o: $(src)/vsyscall.lds \
- $(obj)/vsyscall-sysenter.o $(obj)/vsyscall-note.o FORCE
+ $(obj)/vsyscall-sysenter.o FORCE
$(call if_changed,syscall)
+
+c-link := init_task.o
+s-link := vsyscall-int80.o vsyscall-sysenter.o vsyscall-sigreturn.o vsyscall.lds.o syscall_table.o
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-obj-m) $(c-link)) $(patsubst %.o,$(obj)/%.S,$(s-obj-y) $(s-link)):
+ @ln -fsn $(srctree)/arch/i386/kernel/$(notdir $@) $@
+
+$(obj)/vsyscall-int80.S: $(obj)/vsyscall-sigreturn.S
+
+$(obj)/entry.o: $(src)/entry.S $(src)/syscall_table.S
+
+obj-y += $(c-obj-y) $(s-obj-y)
+obj-m += $(c-obj-m)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-m) $(c-obj-) $(c-link))
+clean-files += $(patsubst %.o,%.S,$(s-obj-y) $(s-obj-) $(s-link))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/microcode.c
linux-2.6-xen-sparse/arch/i386/kernel/microcode.c
--- pristine-linux-2.6.12/arch/i386/kernel/microcode.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/microcode.c 2005-07-28
13:17:07.000000000 -0700
@@ -18,55 +18,6 @@
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
- *
- * 1.0 16 Feb 2000, Tigran Aivazian <tigran@xxxxxxx>
- * Initial release.
- * 1.01 18 Feb 2000, Tigran Aivazian <tigran@xxxxxxx>
- * Added read() support + cleanups.
- * 1.02 21 Feb 2000, Tigran Aivazian <tigran@xxxxxxx>
- * Added 'device trimming' support. open(O_WRONLY) zeroes
- * and frees the saved copy of applied microcode.
- * 1.03 29 Feb 2000, Tigran Aivazian <tigran@xxxxxxx>
- * Made to use devfs (/dev/cpu/microcode) + cleanups.
- * 1.04 06 Jun 2000, Simon Trimmer <simon@xxxxxxxxxxx>
- * Added misc device support (now uses both devfs and misc).
- * Added MICROCODE_IOCFREE ioctl to clear memory.
- * 1.05 09 Jun 2000, Simon Trimmer <simon@xxxxxxxxxxx>
- * Messages for error cases (non Intel & no suitable microcode).
- * 1.06 03 Aug 2000, Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Removed ->release(). Removed exclusive open and status bitmap.
- * Added microcode_rwsem to serialize read()/write()/ioctl().
- * Removed global kernel lock usage.
- * 1.07 07 Sep 2000, Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Write 0 to 0x8B msr and then cpuid before reading revision,
- * so that it works even if there were no update done by the
- * BIOS. Otherwise, reading from 0x8B gives junk (which happened
- * to be 0 on my machine which is why it worked even when I
- * disabled update by the BIOS)
- * Thanks to Eric W. Biederman <ebiederman@xxxxxxxx> for the fix.
- * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@xxxxxxxxx> and
- * Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Intel Pentium 4 processor support and bugfixes.
- * 1.09 30 Oct 2001, Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Bugfix for HT (Hyper-Threading) enabled processors
- * whereby processor resources are shared by all logical processors
- * in a single CPU package.
- * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@xxxxxxxxx> and
- * Tigran Aivazian <tigran@xxxxxxxxxxx>,
- * Serialize updates as required on HT processors due to
speculative
- * nature of implementation.
- * 1.11 22 Mar 2002 Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Fix the panic when writing zero-length microcode chunk.
- * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@xxxxxxxxx>,
- * Jun Nakajima <jun.nakajima@xxxxxxxxx>
- * Support for the microcode updates in the new format.
- * 1.13 10 Oct 2003 Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- * because we no longer hold a copy of applied microcode
- * in kernel memory.
- * 1.14 25 Jun 2004 Tigran Aivazian <tigran@xxxxxxxxxxx>
- * Fix sigmatch() macro to handle old CPUs with pf == 0.
- * Thanks to Stuart Swales for pointing out this bug.
*/
//#define DEBUG /* pr_debug */
@@ -79,6 +30,7 @@
#include <linux/miscdevice.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
+#include <linux/syscalls.h>
#include <asm/msr.h>
#include <asm/uaccess.h>
@@ -88,342 +40,41 @@ MODULE_DESCRIPTION("Intel CPU (IA-32) Mi
MODULE_AUTHOR("Tigran Aivazian <tigran@xxxxxxxxxxx>");
MODULE_LICENSE("GPL");
-#define MICROCODE_VERSION "1.14"
+#define MICROCODE_VERSION "1.14-xen"
#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
-#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
-#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
-#define DWSIZE (sizeof (u32))
-#define get_totalsize(mc) \
- (((microcode_t *)mc)->hdr.totalsize ? \
- ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
-#define get_datasize(mc) \
- (((microcode_t *)mc)->hdr.datasize ? \
- ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
- (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
static DECLARE_MUTEX(microcode_sem);
static void __user *user_buffer; /* user area microcode data buffer */
static unsigned int user_buffer_size; /* its size */
-
-typedef enum mc_error_code {
- MC_SUCCESS = 0,
- MC_NOTFOUND = 1,
- MC_MARKED = 2,
- MC_ALLOCATED = 3,
-} mc_error_code_t;
-
-static struct ucode_cpu_info {
- unsigned int sig;
- unsigned int pf;
- unsigned int rev;
- unsigned int cksum;
- mc_error_code_t err;
- microcode_t *mc;
-} ucode_cpu_info[NR_CPUS];
static int microcode_open (struct inode *unused1, struct file *unused2)
{
return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
}
-static void collect_cpu_info (void *unused)
-{
- int cpu_num = smp_processor_id();
- struct cpuinfo_x86 *c = cpu_data + cpu_num;
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- unsigned int val[2];
-
- uci->sig = uci->pf = uci->rev = uci->cksum = 0;
- uci->err = MC_NOTFOUND;
- uci->mc = NULL;
-
- if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
- cpu_has(c, X86_FEATURE_IA64)) {
- printk(KERN_ERR "microcode: CPU%d not a capable Intel
processor\n", cpu_num);
- return;
- } else {
- uci->sig = cpuid_eax(0x00000001);
-
- if ((c->x86_model >= 5) || (c->x86 > 6)) {
- /* get processor flags from MSR 0x17 */
- rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
- uci->pf = 1 << ((val[1] >> 18) & 7);
- }
- }
-
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
- __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
- pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
- uci->sig, uci->pf, uci->rev);
-}
-
-static inline void mark_microcode_update (int cpu_num, microcode_header_t
*mc_header, int sig, int pf, int cksum)
-{
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- pr_debug("Microcode Found.\n");
- pr_debug(" Header Revision 0x%x\n", mc_header->hdrver);
- pr_debug(" Loader Revision 0x%x\n", mc_header->ldrver);
- pr_debug(" Revision 0x%x \n", mc_header->rev);
- pr_debug(" Date %x/%x/%x\n",
- ((mc_header->date >> 24 ) & 0xff),
- ((mc_header->date >> 16 ) & 0xff),
- (mc_header->date & 0xFFFF));
- pr_debug(" Signature 0x%x\n", sig);
- pr_debug(" Type 0x%x Family 0x%x Model 0x%x Stepping 0x%x\n",
- ((sig >> 12) & 0x3),
- ((sig >> 8) & 0xf),
- ((sig >> 4) & 0xf),
- ((sig & 0xf)));
- pr_debug(" Processor Flags 0x%x\n", pf);
- pr_debug(" Checksum 0x%x\n", cksum);
-
- if (mc_header->rev < uci->rev) {
- printk(KERN_ERR "microcode: CPU%d not 'upgrading' to earlier
revision"
- " 0x%x (current=0x%x)\n", cpu_num, mc_header->rev,
uci->rev);
- goto out;
- } else if (mc_header->rev == uci->rev) {
- /* notify the caller of success on this cpu */
- uci->err = MC_SUCCESS;
- printk(KERN_ERR "microcode: CPU%d already at revision"
- " 0x%x (current=0x%x)\n", cpu_num, mc_header->rev,
uci->rev);
- goto out;
- }
-
- pr_debug("microcode: CPU%d found a matching microcode update with "
- " revision 0x%x (current=0x%x)\n", cpu_num, mc_header->rev,
uci->rev);
- uci->cksum = cksum;
- uci->pf = pf; /* keep the original mc pf for cksum calculation */
- uci->err = MC_MARKED; /* found the match */
-out:
- return;
-}
-
-static int find_matching_ucodes (void)
-{
- int cursor = 0;
- int error = 0;
-
- while (cursor + MC_HEADER_SIZE < user_buffer_size) {
- microcode_header_t mc_header;
- void *newmc = NULL;
- int i, sum, cpu_num, allocated_flag, total_size, data_size,
ext_table_size;
-
- if (copy_from_user(&mc_header, user_buffer + cursor,
MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read user
data\n");
- error = -EFAULT;
- goto out;
- }
-
- total_size = get_totalsize(&mc_header);
- if ((cursor + total_size > user_buffer_size) || (total_size <
DEFAULT_UCODE_TOTALSIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in
microcode data file\n");
- error = -EINVAL;
- goto out;
- }
-
- data_size = get_datasize(&mc_header);
- if ((data_size + MC_HEADER_SIZE > total_size) || (data_size <
DEFAULT_UCODE_DATASIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in
microcode data file\n");
- error = -EINVAL;
- goto out;
- }
-
- if (mc_header.ldrver != 1 || mc_header.hdrver != 1) {
- printk(KERN_ERR "microcode: error! Unknown microcode
update format\n");
- error = -EINVAL;
- goto out;
- }
-
- for (cpu_num = 0; cpu_num < num_online_cpus(); cpu_num++) {
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
- if (uci->err != MC_NOTFOUND) /* already found a match
or not an online cpu*/
- continue;
-
- if (sigmatch(mc_header.sig, uci->sig, mc_header.pf,
uci->pf))
- mark_microcode_update(cpu_num, &mc_header,
mc_header.sig, mc_header.pf, mc_header.cksum);
- }
-
- ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
- if (ext_table_size) {
- struct extended_sigtable ext_header;
- struct extended_signature ext_sig;
- int ext_sigcount;
-
- if ((ext_table_size < EXT_HEADER_SIZE)
- || ((ext_table_size - EXT_HEADER_SIZE)
% EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! Bad data in
microcode data file\n");
- error = -EINVAL;
- goto out;
- }
- if (copy_from_user(&ext_header, user_buffer + cursor
- + MC_HEADER_SIZE + data_size,
EXT_HEADER_SIZE)) {
- printk(KERN_ERR "microcode: error! Can not read
user data\n");
- error = -EFAULT;
- goto out;
- }
- if (ext_table_size != exttable_size(&ext_header)) {
- printk(KERN_ERR "microcode: error! Bad data in
microcode data file\n");
- error = -EFAULT;
- goto out;
- }
-
- ext_sigcount = ext_header.count;
-
- for (i = 0; i < ext_sigcount; i++) {
- if (copy_from_user(&ext_sig, user_buffer +
cursor + MC_HEADER_SIZE + data_size + EXT_HEADER_SIZE
- + EXT_SIGNATURE_SIZE * i,
EXT_SIGNATURE_SIZE)) {
- printk(KERN_ERR "microcode: error! Can
not read user data\n");
- error = -EFAULT;
- goto out;
- }
- for (cpu_num = 0; cpu_num < num_online_cpus();
cpu_num++) {
- struct ucode_cpu_info *uci =
ucode_cpu_info + cpu_num;
- if (uci->err != MC_NOTFOUND) /* already
found a match or not an online cpu*/
- continue;
- if (sigmatch(ext_sig.sig, uci->sig,
ext_sig.pf, uci->pf)) {
- mark_microcode_update(cpu_num,
&mc_header, ext_sig.sig, ext_sig.pf, ext_sig.cksum);
- }
- }
- }
- }
- /* now check if any cpu has matched */
- for (cpu_num = 0, allocated_flag = 0, sum = 0; cpu_num <
num_online_cpus(); cpu_num++) {
- if (ucode_cpu_info[cpu_num].err == MC_MARKED) {
- struct ucode_cpu_info *uci = ucode_cpu_info +
cpu_num;
- if (!allocated_flag) {
- allocated_flag = 1;
- newmc = vmalloc(total_size);
- if (!newmc) {
- printk(KERN_ERR "microcode:
error! Can not allocate memory\n");
- error = -ENOMEM;
- goto out;
- }
- if (copy_from_user(newmc +
MC_HEADER_SIZE,
- user_buffer +
cursor + MC_HEADER_SIZE,
- total_size -
MC_HEADER_SIZE)) {
- printk(KERN_ERR "microcode:
error! Can not read user data\n");
- vfree(newmc);
- error = -EFAULT;
- goto out;
- }
- memcpy(newmc, &mc_header,
MC_HEADER_SIZE);
- /* check extended table checksum */
- if (ext_table_size) {
- int ext_table_sum = 0;
- int * ext_tablep = (((void *)
newmc) + MC_HEADER_SIZE + data_size);
- i = ext_table_size / DWSIZE;
- while (i--) ext_table_sum +=
ext_tablep[i];
- if (ext_table_sum) {
- printk(KERN_WARNING
"microcode: aborting, bad extended signature table checksum\n");
- vfree(newmc);
- error = -EINVAL;
- goto out;
- }
- }
-
- /* calculate the checksum */
- i = (MC_HEADER_SIZE + data_size) /
DWSIZE;
- while (i--) sum += ((int *)newmc)[i];
- sum -= (mc_header.sig + mc_header.pf +
mc_header.cksum);
- }
- ucode_cpu_info[cpu_num].mc = newmc;
- ucode_cpu_info[cpu_num].err = MC_ALLOCATED; /*
mc updated */
- if (sum + uci->sig + uci->pf + uci->cksum != 0)
{
- printk(KERN_ERR "microcode: CPU%d
aborting, bad checksum\n", cpu_num);
- error = -EINVAL;
- goto out;
- }
- }
- }
- cursor += total_size; /* goto the next update patch */
- } /* end of while */
-out:
- return error;
-}
-
-static void do_update_one (void * unused)
-{
- unsigned long flags;
- unsigned int val[2];
- int cpu_num = smp_processor_id();
- struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
- if (uci->mc == NULL) {
- printk(KERN_INFO "microcode: No new microcode data for
CPU%d\n", cpu_num);
- return;
- }
-
- /* serialize access to the physical write to MSR 0x79 */
- spin_lock_irqsave(&microcode_update_lock, flags);
-
- /* write microcode via MSR 0x79 */
- wrmsr(MSR_IA32_UCODE_WRITE,
- (unsigned long) uci->mc->bits,
- (unsigned long) uci->mc->bits >> 16 >> 16);
- wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
- __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
- /* get the current revision from MSR 0x8B */
- rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
- /* notify the caller of success on this cpu */
- uci->err = MC_SUCCESS;
- spin_unlock_irqrestore(&microcode_update_lock, flags);
- printk(KERN_INFO "microcode: CPU%d updated from revision "
- "0x%x to 0x%x, date = %08x \n",
- cpu_num, uci->rev, val[1], uci->mc->hdr.date);
- return;
-}
static int do_microcode_update (void)
{
- int i, error;
+ int err;
+ dom0_op_t op;
- if (on_each_cpu(collect_cpu_info, NULL, 1, 1) != 0) {
- printk(KERN_ERR "microcode: Error! Could not run on all
processors\n");
- error = -EIO;
- goto out;
- }
+ err = sys_mlock((unsigned long)user_buffer, user_buffer_size);
+ if (err != 0)
+ return err;
- if ((error = find_matching_ucodes())) {
- printk(KERN_ERR "microcode: Error in the microcode data\n");
- goto out_free;
- }
+ op.cmd = DOM0_MICROCODE;
+ op.u.microcode.data = user_buffer;
+ op.u.microcode.length = user_buffer_size;
+ err = HYPERVISOR_dom0_op(&op);
- if (on_each_cpu(do_update_one, NULL, 1, 1) != 0) {
- printk(KERN_ERR "microcode: Error! Could not run on all
processors\n");
- error = -EIO;
- }
+ (void)sys_munlock((unsigned long)user_buffer, user_buffer_size);
-out_free:
- for (i = 0; i < num_online_cpus(); i++) {
- if (ucode_cpu_info[i].mc) {
- int j;
- void *tmp = ucode_cpu_info[i].mc;
- vfree(tmp);
- for (j = i; j < num_online_cpus(); j++) {
- if (ucode_cpu_info[j].mc == tmp)
- ucode_cpu_info[j].mc = NULL;
- }
- }
- }
-out:
- return error;
+ return err;
}
static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
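
The microcode driver shrinks dramatically because all of the matching and
MSR work moves into the hypervisor: the driver now just pins the user
buffer with sys_mlock(), hands it to the DOM0_MICROCODE dom0_op, and
unpins it. The device interface is unchanged, so the old-style loader
still applies; a bare-bones version for illustration (the 64KB buffer size
is an arbitrary choice here, not something the driver requires):

    /* ucode.c - write a microcode image to /dev/cpu/microcode.
     * Build: gcc -o ucode ucode.c   Run: ./ucode microcode.dat  (as root)
     */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        static char buf[65536];
        ssize_t len;
        int in, dev;

        if (argc != 2 || (in = open(argv[1], O_RDONLY)) < 0) {
            fprintf(stderr, "usage: %s <microcode image>\n", argv[0]);
            return 1;
        }
        len = read(in, buf, sizeof(buf));
        if (len <= 0)
            return 1;

        dev = open("/dev/cpu/microcode", O_WRONLY);
        if (dev < 0 || write(dev, buf, len) != len) {
            perror("/dev/cpu/microcode");
            return 1;
        }
        return 0;
    }
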
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/mpparse.c
linux-2.6-xen-sparse/arch/i386/kernel/mpparse.c
--- pristine-linux-2.6.12/arch/i386/kernel/mpparse.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/mpparse.c 2005-07-28
13:17:07.000000000 -0700
@@ -109,7 +109,7 @@ static int MP_valid_apicid(int apicid, i
{
return hweight_long(apicid & 0xf) == 1 && (apicid >> 4) != 0xf;
}
-#else
+#elif !defined(CONFIG_XEN)
static int MP_valid_apicid(int apicid, int version)
{
if (version >= 0x14)
@@ -119,6 +119,7 @@ static int MP_valid_apicid(int apicid, i
}
#endif
+#ifndef CONFIG_XEN
static void __init MP_processor_info (struct mpc_config_processor *m)
{
int ver, apicid;
@@ -217,6 +218,12 @@ static void __init MP_processor_info (st
apic_version[m->mpc_apicid] = ver;
bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
}
+#else
+void __init MP_processor_info (struct mpc_config_processor *m)
+{
+ num_processors++;
+}
+#endif /* CONFIG_XEN */
static void __init MP_bus_info (struct mpc_config_bus *m)
{
@@ -690,7 +697,7 @@ void __init get_smp_config (void)
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc((void *)mpf->mpf_physptr)) {
+ if (!smp_read_mpc(isa_bus_to_virt(mpf->mpf_physptr))) {
smp_found_config = 0;
printk(KERN_ERR "BIOS bug, MP table errors
detected!...\n");
printk(KERN_ERR "... disabling SMP support. (tell your
hw vendor)\n");
@@ -725,7 +732,7 @@ void __init get_smp_config (void)
static int __init smp_scan_config (unsigned long base, unsigned long length)
{
- unsigned long *bp = phys_to_virt(base);
+ unsigned long *bp = isa_bus_to_virt(base);
struct intel_mp_floating *mpf;
Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
@@ -741,6 +748,7 @@ static int __init smp_scan_config (unsig
|| (mpf->mpf_specification == 4)) ) {
smp_found_config = 1;
+#ifndef CONFIG_XEN
printk(KERN_INFO "found SMP MP-table at %08lx\n",
virt_to_phys(mpf));
reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE);
@@ -760,6 +768,10 @@ static int __init smp_scan_config (unsig
size = end - mpf->mpf_physptr;
reserve_bootmem(mpf->mpf_physptr, size);
}
+#else
+ printk(KERN_INFO "found SMP MP-table at %08lx\n",
+ ((unsigned long)bp - (unsigned long)isa_bus_to_virt(base)) + base);
+#endif
mpf_found = mpf;
return 1;
@@ -803,9 +815,11 @@ void __init find_smp_config (void)
* MP1.4 SPEC states to only scan first 1K of 4K EBDA.
*/
+#ifndef CONFIG_XEN
address = get_bios_ebda();
if (address)
smp_scan_config(address, 0x400);
+#endif
}
/* --------------------------------------------------------------------------
@@ -817,14 +831,14 @@ void __init find_smp_config (void)
void __init mp_register_lapic_address (
u64 address)
{
+#ifndef CONFIG_XEN
mp_lapic_addr = (unsigned long) address;
- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-
if (boot_cpu_physical_apicid == -1U)
boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
+#endif
}
@@ -844,6 +858,7 @@ void __init mp_register_lapic (
if (id == boot_cpu_physical_apicid)
boot_cpu = 1;
+#ifndef CONFIG_XEN
processor.mpc_type = MP_PROCESSOR;
processor.mpc_apicid = id;
processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
@@ -854,6 +869,7 @@ void __init mp_register_lapic (
processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
processor.mpc_reserved[0] = 0;
processor.mpc_reserved[1] = 0;
+#endif
MP_processor_info(&processor);
}
@@ -913,7 +929,6 @@ void __init mp_register_ioapic (
mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
mp_ioapics[idx].mpc_apicaddr = address;
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
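
The mpparse.c changes are all about addressing: under Xen a guest
"physical" address is pseudo-physical, so phys_to_virt() on the BIOS EBDA
or the 0xF0000 ROM region would point at the wrong memory; the MP table
has to be reached through isa_bus_to_virt() instead, and the EBDA scan is
skipped entirely. The scan itself is unchanged: walk the area in 16-byte
steps looking for the "_MP_" signature. Spelled out on a plain buffer
(sketch only; the real code compares against SMP_MAGIC_IDENT and then
checksums the candidate):

    /* mpscan.c - the 16-byte-aligned "_MP_" signature scan, standalone. */
    #include <stdio.h>
    #include <string.h>

    static long scan_mpf(const unsigned char *bp, long length)
    {
        long off;

        for (off = 0; off + 16 <= length; off += 16)
            if (memcmp(bp + off, "_MP_", 4) == 0)
                return off;
        return -1;
    }

    int main(void)
    {
        static unsigned char area[4096];

        memcpy(area + 0x400, "_MP_", 4);  /* plant a fake floating pointer */
        printf("found at offset %ld\n", scan_mpf(area, sizeof(area)));
        return 0;
    }
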
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/pci-dma.c
linux-2.6-xen-sparse/arch/i386/kernel/pci-dma.c
--- pristine-linux-2.6.12/arch/i386/kernel/pci-dma.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/pci-dma.c 2005-07-28
13:17:07.000000000 -0700
@@ -11,7 +11,10 @@
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/pci.h>
+#include <linux/version.h>
#include <asm/io.h>
+#include <asm-xen/balloon.h>
+#include <asm/tlbflush.h>
struct dma_coherent_mem {
void *virt_base;
@@ -26,7 +29,8 @@ void *dma_alloc_coherent(struct device *
{
void *ret;
struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
- int order = get_order(size);
+ unsigned int order = get_order(size);
+ unsigned long vstart;
/* ignore region specifiers */
gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
@@ -46,11 +50,14 @@ void *dma_alloc_coherent(struct device *
if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
gfp |= GFP_DMA;
- ret = (void *)__get_free_pages(gfp, order);
+ vstart = __get_free_pages(gfp, order);
+ ret = (void *)vstart;
if (ret != NULL) {
+ xen_contig_memory(vstart, order);
+
memset(ret, 0, size);
- *dma_handle = virt_to_phys(ret);
+ *dma_handle = virt_to_bus(ret);
}
return ret;
}
@@ -145,3 +152,131 @@ void *dma_mark_declared_memory_occupied(
return mem->virt_base + (pos << PAGE_SHIFT);
}
EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+
+static LIST_HEAD(dma_map_head);
+static DEFINE_SPINLOCK(dma_map_lock);
+struct dma_map_entry {
+ struct list_head list;
+ dma_addr_t dma;
+ char *bounce, *host;
+ size_t size;
+};
+#define DMA_MAP_MATCHES(e,d) (((e)->dma<=(d)) && (((e)->dma+(e)->size)>(d)))
+
+dma_addr_t
+dma_map_single(struct device *dev, void *ptr, size_t size,
+ enum dma_data_direction direction)
+{
+ struct dma_map_entry *ent;
+ void *bnc;
+ dma_addr_t dma;
+ unsigned long flags;
+
+ BUG_ON(direction == DMA_NONE);
+
+ /*
+ * Even if size is sub-page, the buffer may still straddle a page
+ * boundary. Take into account buffer start offset. All other calls are
+ * conservative and always search the dma_map list if it's non-empty.
+ */
+ if ((((unsigned int)ptr & ~PAGE_MASK) + size) <= PAGE_SIZE) {
+ dma = virt_to_bus(ptr);
+ } else {
+ BUG_ON((bnc = dma_alloc_coherent(dev, size, &dma, 0)) == NULL);
+ BUG_ON((ent = kmalloc(sizeof(*ent), GFP_KERNEL)) == NULL);
+ if (direction != DMA_FROM_DEVICE)
+ memcpy(bnc, ptr, size);
+ ent->dma = dma;
+ ent->bounce = bnc;
+ ent->host = ptr;
+ ent->size = size;
+ spin_lock_irqsave(&dma_map_lock, flags);
+ list_add(&ent->list, &dma_map_head);
+ spin_unlock_irqrestore(&dma_map_lock, flags);
+ }
+
+ flush_write_buffers();
+ return dma;
+}
+EXPORT_SYMBOL(dma_map_single);
+
+void
+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ struct dma_map_entry *ent;
+ unsigned long flags;
+
+ BUG_ON(direction == DMA_NONE);
+
+ /* Fast-path check: are there any multi-page DMA mappings? */
+ if (!list_empty(&dma_map_head)) {
+ spin_lock_irqsave(&dma_map_lock, flags);
+ list_for_each_entry ( ent, &dma_map_head, list ) {
+ if (DMA_MAP_MATCHES(ent, dma_addr)) {
+ list_del(&ent->list);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&dma_map_lock, flags);
+ if (&ent->list != &dma_map_head) {
+ BUG_ON(dma_addr != ent->dma);
+ BUG_ON(size != ent->size);
+ if (direction != DMA_TO_DEVICE)
+ memcpy(ent->host, ent->bounce, size);
+ dma_free_coherent(dev, size, ent->bounce, ent->dma);
+ kfree(ent);
+ }
+ }
+}
+EXPORT_SYMBOL(dma_unmap_single);
+
+void
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+ enum dma_data_direction direction)
+{
+ struct dma_map_entry *ent;
+ unsigned long flags, off;
+
+ /* Fast-path check: are there any multi-page DMA mappings? */
+ if (!list_empty(&dma_map_head)) {
+ spin_lock_irqsave(&dma_map_lock, flags);
+ list_for_each_entry ( ent, &dma_map_head, list )
+ if (DMA_MAP_MATCHES(ent, dma_handle))
+ break;
+ spin_unlock_irqrestore(&dma_map_lock, flags);
+ if (&ent->list != &dma_map_head) {
+ off = dma_handle - ent->dma;
+ BUG_ON((off + size) > ent->size);
+ /*if (direction != DMA_TO_DEVICE)*/
+ memcpy(ent->host+off, ent->bounce+off, size);
+ }
+ }
+}
+EXPORT_SYMBOL(dma_sync_single_for_cpu);
+
+void
+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
+ enum dma_data_direction direction)
+{
+ struct dma_map_entry *ent;
+ unsigned long flags, off;
+
+ /* Fast-path check: are there any multi-page DMA mappings? */
+ if (!list_empty(&dma_map_head)) {
+ spin_lock_irqsave(&dma_map_lock, flags);
+ list_for_each_entry ( ent, &dma_map_head, list )
+ if (DMA_MAP_MATCHES(ent, dma_handle))
+ break;
+ spin_unlock_irqrestore(&dma_map_lock, flags);
+ if (&ent->list != &dma_map_head) {
+ off = dma_handle - ent->dma;
+ BUG_ON((off + size) > ent->size);
+ /*if (direction != DMA_FROM_DEVICE)*/
+ memcpy(ent->bounce+off, ent->host+off, size);
+ }
+ }
+
+ flush_write_buffers();
+}
+EXPORT_SYMBOL(dma_sync_single_for_device);
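
The new tail of pci-dma.c implements bounce buffering for streaming DMA:
a guest buffer that spans multiple pages is not machine-contiguous, so
dma_map_single() allocates a coherent (machine-contiguous) bounce buffer,
copies through it, and records the mapping on dma_map_head; the
unmap/sync paths find the entry again purely by bus address. The lookup
predicate is a half-open interval test, easy to convince yourself of in
isolation:

    /* match.c - the DMA_MAP_MATCHES interval test, copied from above. */
    #include <assert.h>
    #include <stdio.h>

    struct dma_map_entry { unsigned long dma, size; };
    #define DMA_MAP_MATCHES(e,d) (((e)->dma<=(d)) && (((e)->dma+(e)->size)>(d)))

    int main(void)
    {
        struct dma_map_entry e = { .dma = 0x10000, .size = 0x3000 };

        assert( DMA_MAP_MATCHES(&e, 0x10000));   /* first byte       */
        assert( DMA_MAP_MATCHES(&e, 0x12fff));   /* last byte        */
        assert(!DMA_MAP_MATCHES(&e, 0x13000));   /* one past the end */
        printf("interval test ok\n");
        return 0;
    }

Note also that the two sync routines copy unconditionally (the direction
checks are commented out), which is safe but does more memcpy work than
strictly necessary.
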
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/process.c
linux-2.6-xen-sparse/arch/i386/kernel/process.c
--- pristine-linux-2.6.12/arch/i386/kernel/process.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/process.c 2005-07-28
13:17:07.000000000 -0700
@@ -13,6 +13,7 @@
#include <stdarg.h>
+#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
@@ -47,6 +48,7 @@
#include <asm/i387.h>
#include <asm/irq.h>
#include <asm/desc.h>
+#include <asm-xen/xen-public/physdev.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif
@@ -54,6 +56,9 @@
#include <linux/irq.h>
#include <linux/err.h>
+#include <asm/tlbflush.h>
+#include <asm/cpu.h>
+
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
static int hlt_counter;
@@ -89,54 +94,48 @@ void enable_hlt(void)
EXPORT_SYMBOL(enable_hlt);
-/*
- * We use this if we don't have any better
- * idle routine..
- */
-void default_idle(void)
+/* XXX XEN doesn't use default_idle(), poll_idle(). Use xen_idle() instead. */
+extern void stop_hz_timer(void);
+extern void start_hz_timer(void);
+void xen_idle(void)
{
- if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
- local_irq_disable();
- if (!need_resched())
- safe_halt();
- else
- local_irq_enable();
+ local_irq_disable();
+
+ if (need_resched()) {
+ local_irq_enable();
} else {
- cpu_relax();
+ stop_hz_timer();
+ HYPERVISOR_block(); /* implicit local_irq_enable() */
+ start_hz_timer();
}
}
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->work.need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle (void)
-{
- int oldval;
-
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ /* We shouldn't have to disable interrupts while dead, but
+ * some interrupts just don't seem to go away, and this makes
+ * it "work" for testing purposes. */
+ /* Death loop */
+ while (__get_cpu_var(cpu_state) != CPU_UP_PREPARE)
+ HYPERVISOR_yield();
+
+ local_irq_disable();
+ __flush_tlb_all();
+ cpu_set(smp_processor_id(), cpu_online_map);
local_irq_enable();
-
- /*
- * Deal with another CPU just having chosen a thread to
- * run here:
- */
- oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
- if (!oldval) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- asm volatile(
- "2:"
- "testl %0, %1;"
- "rep; nop;"
- "je 2b;"
- : : "i"(_TIF_NEED_RESCHED), "m"
(current_thread_info()->flags));
-
- clear_thread_flag(TIF_POLLING_NRFLAG);
- } else {
- set_need_resched();
- }
}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
/*
* The idle thread. There's no useful work to be
@@ -146,22 +145,26 @@ static void poll_idle (void)
*/
void cpu_idle (void)
{
+ int cpu = _smp_processor_id();
+
/* endless idle loop with no priority at all */
while (1) {
while (!need_resched()) {
- void (*idle)(void);
if (__get_cpu_var(cpu_idle_state))
__get_cpu_var(cpu_idle_state) = 0;
-
rmb();
- idle = pm_idle;
- if (!idle)
- idle = default_idle;
+ if (cpu_is_offline(cpu)) {
+#if defined(CONFIG_XEN) && defined(CONFIG_HOTPLUG_CPU)
+ /* Tell hypervisor to take vcpu down. */
+ HYPERVISOR_vcpu_down(cpu);
+#endif
+ play_dead();
+ }
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
- idle();
+ xen_idle();
}
schedule();
}
@@ -195,74 +198,18 @@ void cpu_idle_wait(void)
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- */
-static void mwait_idle(void)
-{
- local_irq_enable();
-
- if (!need_resched()) {
- set_thread_flag(TIF_POLLING_NRFLAG);
- do {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- if (need_resched())
- break;
- __mwait(0, 0);
- } while (!need_resched());
- clear_thread_flag(TIF_POLLING_NRFLAG);
- }
-}
-
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
-{
- if (cpu_has(c, X86_FEATURE_MWAIT)) {
- printk("monitor/mwait feature present.\n");
- /*
- * Skip, if setup has overridden idle.
- * One CPU supports mwait => All CPUs supports mwait
- */
- if (!pm_idle) {
- printk("using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- }
- }
-}
-
-static int __init idle_setup (char *str)
-{
- if (!strncmp(str, "poll", 4)) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
-#ifdef CONFIG_X86_SMP
- if (smp_num_siblings > 1)
- printk("WARNING: polling idle and HT enabled,
performance may degrade.\n");
-#endif
- } else if (!strncmp(str, "halt", 4)) {
- printk("using halt in idle threads.\n");
- pm_idle = default_idle;
- }
-
- boot_option_idle_override = 1;
- return 1;
-}
-
-__setup("idle=", idle_setup);
+/* XXX XEN doesn't use mwait_idle(), select_idle_routine(), idle_setup(). */
+/* Always use xen_idle() instead. */
+void __init select_idle_routine(const struct cpuinfo_x86 *c) {}
void show_regs(struct pt_regs * regs)
{
- unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
-
printk("\n");
printk("Pid: %d, comm: %20s\n", current->pid, current->comm);
printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip,
smp_processor_id());
print_symbol("EIP is at %s\n", regs->eip);
- if (regs->xcs & 3)
+ if (regs->xcs & 2)
printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp);
printk(" EFLAGS: %08lx %s (%s)\n",
regs->eflags, print_tainted(), system_utsname.release);
@@ -273,17 +220,6 @@ void show_regs(struct pt_regs * regs)
printk(" DS: %04x ES: %04x\n",
0xffff & regs->xds,0xffff & regs->xes);
- __asm__("movl %%cr0, %0": "=r" (cr0));
- __asm__("movl %%cr2, %0": "=r" (cr2));
- __asm__("movl %%cr3, %0": "=r" (cr3));
- /* This could fault if %cr4 does not exist */
- __asm__("1: movl %%cr4, %0 \n"
- "2: \n"
- ".section __ex_table,\"a\" \n"
- ".long 1b,2b \n"
- ".previous \n"
- : "=r" (cr4): "0" (0));
- printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3,
cr4);
show_trace(NULL, ®s->esp);
}
@@ -336,20 +272,11 @@ void exit_thread(void)
/* The process may have allocated an io port bitmap... nuke it. */
if (unlikely(NULL != t->io_bitmap_ptr)) {
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
-
+ physdev_op_t op = { 0 };
+ op.cmd = PHYSDEVOP_SET_IOBITMAP;
+ HYPERVISOR_physdev_op(&op);
kfree(t->io_bitmap_ptr);
t->io_bitmap_ptr = NULL;
- /*
- * Careful, clear this in the TSS too:
- */
- memset(tss->io_bitmap, 0xff, tss->io_bitmap_max);
- t->io_bitmap_max = 0;
- tss->io_bitmap_owner = NULL;
- tss->io_bitmap_max = 0;
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
- put_cpu();
}
}
@@ -458,6 +385,8 @@ int copy_thread(int nr, unsigned long cl
desc->b = LDT_entry_b(&info);
}
+ p->thread.io_pl = current->thread.io_pl;
+
err = 0;
out:
if (err && p->thread.io_bitmap_ptr) {
@@ -525,40 +454,10 @@ int dump_task_regs(struct task_struct *t
elf_core_copy_regs(regs, &ptregs);
+ boot_option_idle_override = 1;
return 1;
}
-static inline void
-handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss)
-{
- if (!next->io_bitmap_ptr) {
- /*
- * Disable the bitmap via an invalid offset. We still cache
- * the previous bitmap owner and the IO bitmap contents:
- */
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
- return;
- }
- if (likely(next == tss->io_bitmap_owner)) {
- /*
- * Previous owner of the bitmap (hence the bitmap content)
- * matches the next task, we dont have to do anything but
- * to set a valid offset in the TSS:
- */
- tss->io_bitmap_base = IO_BITMAP_OFFSET;
- return;
- }
- /*
- * Lazy TSS's I/O bitmap copy. We set an invalid offset here
- * and we let the task to get a GPF in case an I/O instruction
- * is performed. The handler of the GPF will verify that the
- * faulting task has a valid I/O bitmap and, it true, does the
- * real copy and restart the instruction. This will save us
- * redundant copies when the currently switched task does not
- * perform any I/O during its timeslice.
- */
- tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY;
-}
/*
* switch_to(x,yn) should switch tasks from x to y.
@@ -593,32 +492,77 @@ struct task_struct fastcall * __switch_t
*next = &next_p->thread;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ physdev_op_t iopl_op, iobmp_op;
+ multicall_entry_t _mcl[8], *mcl = _mcl;
- /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
+ /* XEN NOTE: FS/GS saved in switch_mm(), not here. */
- __unlazy_fpu(prev_p);
+ /*
+ * This is basically '__unlazy_fpu', except that we queue a
+ * multicall to indicate FPU task switch, rather than
+ * synchronously trapping to Xen.
+ */
+ if (prev_p->thread_info->status & TS_USEDFPU) {
+ __save_init_fpu(prev_p); /* _not_ save_init_fpu() */
+ mcl->op = __HYPERVISOR_fpu_taskswitch;
+ mcl->args[0] = 1;
+ mcl++;
+ }
/*
* Reload esp0, LDT and the page table pointer:
+ * This is load_esp0(tss, next) with a multicall.
*/
- load_esp0(tss, next);
+ tss->esp0 = next->esp0;
+ mcl->op = __HYPERVISOR_stack_switch;
+ mcl->args[0] = tss->ss0;
+ mcl->args[1] = tss->esp0;
+ mcl++;
/*
* Load the per-thread Thread-Local Storage descriptor.
+ * This is load_TLS(next, cpu) with multicalls.
*/
- load_TLS(next, cpu);
+#define C(i) do { \
+ if (unlikely(next->tls_array[i].a != prev->tls_array[i].a || \
+ next->tls_array[i].b != prev->tls_array[i].b)) { \
+ mcl->op = __HYPERVISOR_update_descriptor; \
+ mcl->args[0] = virt_to_machine(&get_cpu_gdt_table(cpu) \
+ [GDT_ENTRY_TLS_MIN + i]); \
+ mcl->args[1] = ((u32 *)&next->tls_array[i])[0]; \
+ mcl->args[2] = ((u32 *)&next->tls_array[i])[1]; \
+ mcl++; \
+ } \
+} while (0)
+ C(0); C(1); C(2);
+#undef C
+
+ if (unlikely(prev->io_pl != next->io_pl)) {
+ iopl_op.cmd = PHYSDEVOP_SET_IOPL;
+ iopl_op.u.set_iopl.iopl = next->io_pl;
+ mcl->op = __HYPERVISOR_physdev_op;
+ mcl->args[0] = (unsigned long)&iopl_op;
+ mcl++;
+ }
+
+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
+ iobmp_op.cmd =
+ PHYSDEVOP_SET_IOBITMAP;
+ iobmp_op.u.set_iobitmap.bitmap =
+ (unsigned long)next->io_bitmap_ptr;
+ iobmp_op.u.set_iobitmap.nr_ports =
+ next->io_bitmap_ptr ? IO_BITMAP_BITS : 0;
+ mcl->op = __HYPERVISOR_physdev_op;
+ mcl->args[0] = (unsigned long)&iobmp_op;
+ mcl++;
+ }
- /*
- * Save away %fs and %gs. No need to save %es and %ds, as
- * those are always kernel segments while inside the kernel.
- */
- asm volatile("mov %%fs,%0":"=m" (prev->fs));
- asm volatile("mov %%gs,%0":"=m" (prev->gs));
+ (void)HYPERVISOR_multicall(_mcl, mcl - _mcl);
/*
* Restore %fs and %gs if needed.
*/
- if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) {
+ if (unlikely(next->fs | next->gs)) {
loadsegment(fs, next->fs);
loadsegment(gs, next->gs);
}
@@ -636,9 +580,6 @@ struct task_struct fastcall * __switch_t
loaddebug(next, 7);
}
- if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr))
- handle_io_bitmap(next, tss);
-
return prev_p;
}
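
The interesting bit in the __switch_to() rewrite above is that nothing
traps to Xen per operation: the FPU switch, stack switch, up to three
TLS descriptor updates and the two physdev ops are queued into _mcl[8]
and issued with a single HYPERVISOR_multicall. A toy version of that
queue-then-flush pattern (printf stands in for the hypervisor trap; all
*_ex names and op numbers are invented):

	#include <stdio.h>

	struct mc_entry_ex { unsigned long op; unsigned long args[3]; };

	/* Append one call to a caller-provided batch... */
	static struct mc_entry_ex *mc_queue_ex(struct mc_entry_ex *mcl,
					       unsigned long op,
					       unsigned long a0,
					       unsigned long a1)
	{
		mcl->op = op;
		mcl->args[0] = a0;
		mcl->args[1] = a1;
		return mcl + 1;
	}

	/* ...then pay for one trap, however many calls were queued. */
	static void multicall_ex(struct mc_entry_ex *calls, long n)
	{
		for (long i = 0; i < n; i++)
			printf("hypercall %lu(%lu, %lu)\n", calls[i].op,
			       calls[i].args[0], calls[i].args[1]);
	}

	int main(void)
	{
		struct mc_entry_ex _mcl[8], *mcl = _mcl;

		mcl = mc_queue_ex(mcl, 5, 1, 0);     /* "fpu_taskswitch" */
		mcl = mc_queue_ex(mcl, 3, 0x68, 0);  /* "stack_switch"   */
		multicall_ex(_mcl, mcl - _mcl);      /* one trap, two ops */
		return 0;
	}
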
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/quirks.c linux-2.6-xen-sparse/arch/i386/kernel/quirks.c
--- pristine-linux-2.6.12/arch/i386/kernel/quirks.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/quirks.c	2005-07-28 13:17:07.000000000 -0700
@@ -32,14 +32,11 @@ static void __devinit quirk_intel_irqbal
raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word);
if (!(word & (1 << 13))) {
+ dom0_op_t op;
printk(KERN_INFO "Disabling irq balancing and affinity\n");
-#ifdef CONFIG_IRQBALANCE
- irqbalance_disable("");
-#endif
- noirqdebug_setup("");
-#ifdef CONFIG_PROC_FS
- no_irq_affinity = 1;
-#endif
+ op.cmd = DOM0_PLATFORM_QUIRK;
+ op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING;
+ (void)HYPERVISOR_dom0_op(&op);
}
config &= ~0x2;
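
(The quirk above turns into a message to Xen: fill in a tagged-union
dom0_op_t, set cmd plus the matching union member, and make one
hypercall; the hypervisor then applies the quirk machine-wide. The
shape of that ABI, sketched with invented names and numbers:)

	#include <stdio.h>

	typedef struct {
		int cmd;
		union {
			struct { int quirk_id; } platform_quirk;
		} u;
	} dom0_op_ex_t;

	/* Models HYPERVISOR_dom0_op(); really a trap into Xen. */
	static int dom0_op_ex(dom0_op_ex_t *op)
	{
		printf("dom0_op cmd=%d quirk=%d\n", op->cmd,
		       op->u.platform_quirk.quirk_id);
		return 0;
	}

	int main(void)
	{
		dom0_op_ex_t op;

		op.cmd = 39;                      /* "DOM0_PLATFORM_QUIRK" */
		op.u.platform_quirk.quirk_id = 1; /* "QUIRK_NOIRQBALANCING" */
		(void)dom0_op_ex(&op);
		return 0;
	}
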
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/setup.c linux-2.6-xen-sparse/arch/i386/kernel/setup.c
--- pristine-linux-2.6.12/arch/i386/kernel/setup.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/setup.c	2005-07-28 13:17:07.000000000 -0700
@@ -41,6 +41,9 @@
#include <linux/init.h>
#include <linux/edd.h>
#include <linux/nodemask.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
#include <video/edid.h>
#include <asm/e820.h>
#include <asm/mpspec.h>
@@ -50,13 +53,18 @@
#include <asm/io_apic.h>
#include <asm/ist.h>
#include <asm/io.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/xen-public/physdev.h>
#include "setup_arch_pre.h"
#include <bios_ebda.h>
-/* This value is set up by the early boot code to point to the value
- immediately after the boot time page tables. It contains a *physical*
- address, and must not be in the .bss segment! */
-unsigned long init_pg_tables_end __initdata = ~0UL;
+/* Allows setting of maximum possible memory size */
+static unsigned long xen_override_max_pfn;
+
+static int xen_panic_event(struct notifier_block *, unsigned long, void *);
+static struct notifier_block xen_panic_block = {
+ xen_panic_event, NULL, 0 /* try to go last */
+};
int disable_pse __initdata = 0;
@@ -70,9 +78,9 @@ EXPORT_SYMBOL(efi_enabled);
#endif
/* cpu data as detected by the assembly code in head.S */
-struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 0, 1, 0, -1 };
/* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 0, 1, 0, -1 };
unsigned long mmu_cr4_features;
@@ -146,6 +154,7 @@ static struct resource code_resource = {
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
static struct resource system_rom_resource = {
.name = "System ROM",
.start = 0xf0000,
@@ -201,6 +210,7 @@ static struct resource video_rom_resourc
.end = 0xc7fff,
.flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
};
+#endif
static struct resource video_ram_resource = {
.name = "Video RAM area",
@@ -259,6 +269,7 @@ static struct resource standard_io_resou
#define STANDARD_IO_RESOURCES \
(sizeof standard_io_resources / sizeof standard_io_resources[0])
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
static int __init romchecksum(unsigned char *rom, unsigned long length)
@@ -276,6 +287,10 @@ static void __init probe_roms(void)
unsigned char *rom;
int i;
+ /* Nothing to do if not running in dom0. */
+ if (!(xen_start_info.flags & SIF_INITDOMAIN))
+ return;
+
/* video rom */
upper = adapter_rom_resources[0].start;
for (start = video_rom_resource.start; start < upper; start += 2048) {
@@ -334,6 +349,20 @@ static void __init probe_roms(void)
start = adapter_rom_resources[i++].end & ~2047UL;
}
}
+#endif
+
+/*
+ * Point at the empty zero page to start with. We map the real shared_info
+ * page as soon as fixmap is up and running.
+ */
+shared_info_t *HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page;
+EXPORT_SYMBOL(HYPERVISOR_shared_info);
+
+unsigned int *phys_to_machine_mapping, *pfn_to_mfn_frame_list;
+EXPORT_SYMBOL(phys_to_machine_mapping);
+
+/* Raw start-of-day parameters from the hypervisor. */
+union xen_start_info_union xen_start_info_union;
static void __init limit_regions(unsigned long long size)
{
@@ -414,6 +443,7 @@ static void __init print_memory_map(char
}
}
+#if 0
/*
* Sanitize the BIOS e820 map.
*
@@ -633,6 +663,7 @@ static int __init copy_e820_map(struct e
} while (biosmap++,--nr_map);
return 0;
}
+#endif
#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
struct edd edd;
@@ -666,11 +697,14 @@ static inline void copy_edd(void)
static void __init parse_cmdline_early (char ** cmdline_p)
{
char c = ' ', *to = command_line, *from = saved_command_line;
- int len = 0;
+ int len = 0, max_cmdline;
int userdef = 0;
+ if ((max_cmdline = MAX_GUEST_CMDLINE) > COMMAND_LINE_SIZE)
+ max_cmdline = COMMAND_LINE_SIZE;
+ memcpy(saved_command_line, xen_start_info.cmd_line, max_cmdline);
/* Save unparsed command line copy for /proc/cmdline */
- saved_command_line[COMMAND_LINE_SIZE-1] = '\0';
+ saved_command_line[max_cmdline-1] = '\0';
for (;;) {
if (c != ' ')
@@ -702,8 +736,13 @@ static void __init parse_cmdline_early (
unsigned long long mem_size;
mem_size = memparse(from+4, &from);
+#if 0
limit_regions(mem_size);
userdef=1;
+#else
+ xen_override_max_pfn =
+ (unsigned long)(mem_size>>PAGE_SHIFT);
+#endif
}
}
@@ -744,7 +783,7 @@ static void __init parse_cmdline_early (
noexec_setup(from + 7);
-#ifdef CONFIG_X86_SMP
+#ifdef CONFIG_X86_MPPARSE
/*
* If the BIOS enumerates physical processors before logical,
* maxcpus=N at enumeration-time can be used to disable HT.
@@ -846,6 +885,7 @@ static void __init parse_cmdline_early (
}
}
+#if 0 /* !XEN */
/*
* Callback for efi_memory_walk.
*/
@@ -889,6 +929,15 @@ void __init find_max_pfn(void)
max_pfn = end;
}
}
+#else
+/* We don't use the fake e820 because we need to respond to user override. */
+void __init find_max_pfn(void)
+{
+ if ( xen_override_max_pfn < xen_start_info.nr_pages )
+ xen_override_max_pfn = xen_start_info.nr_pages;
+ max_pfn = xen_override_max_pfn;
+}
+#endif /* XEN */
/*
* Determine low and high memory ranges:
@@ -1011,6 +1060,7 @@ static void __init register_bootmem_low_
}
}
+#ifndef CONFIG_XEN
/*
* workaround for Dell systems that neglect to reserve EBDA
*/
@@ -1021,16 +1071,18 @@ static void __init reserve_ebda_region(v
if (addr)
reserve_bootmem(addr, PAGE_SIZE);
}
+#endif
#ifndef CONFIG_DISCONTIGMEM
void __init setup_bootmem_allocator(void);
static unsigned long __init setup_memory(void)
{
+
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
- min_low_pfn = PFN_UP(init_pg_tables_end);
+ min_low_pfn = PFN_UP(__pa(xen_start_info.pt_base)) + xen_start_info.nr_pt_frames;
find_max_pfn();
@@ -1057,7 +1109,14 @@ void __init zone_sizes_init(void)
unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
unsigned int max_dma, low;
- max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ /*
+ * XEN: Our notion of "DMA memory" is fake when running over Xen.
+ * We simply put all RAM in the DMA zone so that those drivers which
+ * needlessly specify GFP_DMA do not get starved of RAM unnecessarily.
+ * Those drivers that *do* require lowmem are screwed anyway when
+ * running over Xen!
+ */
+ max_dma = max_low_pfn;
low = max_low_pfn;
if (low < max_dma)
@@ -1095,6 +1154,7 @@ void __init setup_bootmem_allocator(void
reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) +
bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY));
+#ifndef CONFIG_XEN
/*
* reserve physical page 0 - it's a special BIOS page on many boxes,
* enabling clean reboots, SMP operation, laptop functions.
@@ -1125,20 +1185,15 @@ void __init setup_bootmem_allocator(void
*/
acpi_reserve_bootmem();
#endif
-#ifdef CONFIG_X86_FIND_SMP_CONFIG
- /*
- * Find and reserve possible boot-time SMP configuration:
- */
- find_smp_config();
-#endif
+#endif /* !CONFIG_XEN */
#ifdef CONFIG_BLK_DEV_INITRD
- if (LOADER_TYPE && INITRD_START) {
+ if (xen_start_info.mod_start) {
if (INITRD_START + INITRD_SIZE <= (max_low_pfn << PAGE_SHIFT)) {
- reserve_bootmem(INITRD_START, INITRD_SIZE);
- initrd_start =
- INITRD_START ? INITRD_START + PAGE_OFFSET : 0;
+ /*reserve_bootmem(INITRD_START, INITRD_SIZE);*/
+ initrd_start = INITRD_START + PAGE_OFFSET;
initrd_end = initrd_start+INITRD_SIZE;
+ initrd_below_start_ok = 1;
}
else {
printk(KERN_ERR "initrd extends beyond end of memory "
@@ -1149,6 +1204,8 @@ void __init setup_bootmem_allocator(void
}
}
#endif
+
+ phys_to_machine_mapping = (unsigned int *)xen_start_info.mfn_list;
}
/*
@@ -1178,7 +1235,9 @@ legacy_init_iomem_resources(struct resou
{
int i;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
probe_roms();
+#endif
for (i = 0; i < e820.nr_map; i++) {
struct resource *res;
if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
@@ -1220,8 +1279,9 @@ static void __init register_memory(void)
else
legacy_init_iomem_resources(&code_resource, &data_resource);
- /* EFI systems may still have VGA */
- request_resource(&iomem_resource, &video_ram_resource);
+ if (xen_start_info.flags & SIF_INITDOMAIN)
+ /* EFI systems may still have VGA */
+ request_resource(&iomem_resource, &video_ram_resource);
/* request I/O space for devices used on all i[345]86 PCs */
for (i = 0; i < STANDARD_IO_RESOURCES; i++)
@@ -1396,10 +1456,23 @@ static void set_mca_bus(int x) { }
*/
void __init setup_arch(char **cmdline_p)
{
+ int i, j;
+ physdev_op_t op;
unsigned long max_low_pfn;
+ /* Force a quick death if the kernel panics. */
+ extern int panic_timeout;
+ if (panic_timeout == 0)
+ panic_timeout = 1;
+
+ /* Register a call for panic conditions. */
+ notifier_chain_register(&panic_notifier_list, &xen_panic_block);
+
+ HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
+ HYPERVISOR_vm_assist(VMASST_CMD_enable,
+ VMASST_TYPE_writable_pagetables);
+
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
- pre_setup_arch_hook();
early_cpu_init();
/*
@@ -1414,7 +1487,10 @@ void __init setup_arch(char **cmdline_p)
efi_enabled = 1;
#endif
- ROOT_DEV = old_decode_dev(ORIG_ROOT_DEV);
+ /* This must be initialized to UNNAMED_MAJOR for ipconfig to work
+ properly. Setting ROOT_DEV to default to /dev/ram0 breaks initrd.
+ */
+ ROOT_DEV = MKDEV(UNNAMED_MAJOR,0);
drive_info = DRIVE_INFO;
screen_info = SCREEN_INFO;
edid_info = EDID_INFO;
@@ -1429,6 +1505,16 @@ void __init setup_arch(char **cmdline_p)
}
bootloader_type = LOADER_TYPE;
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+ /* This is drawn from a dump from vgacon:startup in standard Linux. */
+ screen_info.orig_video_mode = 3;
+ screen_info.orig_video_isVGA = 1;
+ screen_info.orig_video_lines = 25;
+ screen_info.orig_video_cols = 80;
+ screen_info.orig_video_ega_bx = 3;
+ screen_info.orig_video_points = 16;
+#endif
+
#ifdef CONFIG_BLK_DEV_RAM
rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
@@ -1449,12 +1535,14 @@ void __init setup_arch(char **cmdline_p)
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
- init_mm.brk = init_pg_tables_end + PAGE_OFFSET;
+ init_mm.brk = (PFN_UP(__pa(xen_start_info.pt_base)) +
+ xen_start_info.nr_pt_frames) << PAGE_SHIFT;
- code_resource.start = virt_to_phys(_text);
- code_resource.end = virt_to_phys(_etext)-1;
- data_resource.start = virt_to_phys(_etext);
- data_resource.end = virt_to_phys(_edata)-1;
+ /* XEN: This is nonsense: kernel may not even be contiguous in RAM. */
+ /*code_resource.start = virt_to_phys(_text);*/
+ /*code_resource.end = virt_to_phys(_etext)-1;*/
+ /*data_resource.start = virt_to_phys(_etext);*/
+ /*data_resource.end = virt_to_phys(_edata)-1;*/
parse_cmdline_early(cmdline_p);
@@ -1477,6 +1565,51 @@ void __init setup_arch(char **cmdline_p)
remapped_pgdat_init();
zone_sizes_init();
+#ifdef CONFIG_X86_FIND_SMP_CONFIG
+ /*
+ * Find and reserve possible boot-time SMP configuration:
+ */
+ find_smp_config();
+#endif
+
+ /* Make sure we have a correctly sized P->M table. */
+ if (max_pfn != xen_start_info.nr_pages) {
+ phys_to_machine_mapping = alloc_bootmem_low_pages(
+ max_pfn * sizeof(unsigned long));
+
+ if (max_pfn > xen_start_info.nr_pages) {
+ /* set to INVALID_P2M_ENTRY */
+ memset(phys_to_machine_mapping, ~0,
+ max_pfn * sizeof(unsigned long));
+ memcpy(phys_to_machine_mapping,
+ (unsigned long *)xen_start_info.mfn_list,
+ xen_start_info.nr_pages * sizeof(unsigned long));
+ } else {
+ memcpy(phys_to_machine_mapping,
+ (unsigned long *)xen_start_info.mfn_list,
+ max_pfn * sizeof(unsigned long));
+ if (HYPERVISOR_dom_mem_op(
+ MEMOP_decrease_reservation,
+ (unsigned long *)xen_start_info.mfn_list + max_pfn,
+ xen_start_info.nr_pages - max_pfn, 0) !=
+ (xen_start_info.nr_pages - max_pfn)) BUG();
+ }
+ free_bootmem(
+ __pa(xen_start_info.mfn_list),
+ PFN_PHYS(PFN_UP(xen_start_info.nr_pages *
+ sizeof(unsigned long))));
+ }
+
+ pfn_to_mfn_frame_list = alloc_bootmem_low_pages(PAGE_SIZE);
+ for ( i=0, j=0; i < max_pfn; i+=(PAGE_SIZE/sizeof(unsigned long)), j++ )
+ {
+ pfn_to_mfn_frame_list[j] =
+ virt_to_machine(&phys_to_machine_mapping[i]) >> PAGE_SHIFT;
+ }
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list =
+ virt_to_machine(pfn_to_mfn_frame_list) >> PAGE_SHIFT;
+
+
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
@@ -1502,6 +1635,18 @@ void __init setup_arch(char **cmdline_p)
if (efi_enabled)
efi_map_memmap();
+ op.cmd = PHYSDEVOP_SET_IOPL;
+ op.u.set_iopl.iopl = current->thread.io_pl = 1;
+ HYPERVISOR_physdev_op(&op);
+
+#ifdef CONFIG_ACPI_BOOT
+ if (!(xen_start_info.flags & SIF_INITDOMAIN)) {
+ printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
+ acpi_disabled = 1;
+ acpi_ht = 0;
+ }
+#endif
+
#ifdef CONFIG_ACPI_BOOT
/*
* Parse the ACPI tables for possible boot-time SMP configuration.
@@ -1515,16 +1660,46 @@ void __init setup_arch(char **cmdline_p)
get_smp_config();
#endif
+ /* XXX Disable irqdebug until we have a way to avoid interrupt
+ * conflicts. */
+ noirqdebug_setup("");
+
register_memory();
+ if (xen_start_info.flags & SIF_INITDOMAIN) {
+ if (!(xen_start_info.flags & SIF_PRIVILEGED))
+ panic("Xen granted us console access "
+ "but not privileged status");
+
#ifdef CONFIG_VT
#if defined(CONFIG_VGA_CONSOLE)
- if (!efi_enabled || (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
- conswitchp = &vga_con;
+ if (!efi_enabled ||
+ (efi_mem_type(0xa0000) != EFI_CONVENTIONAL_MEMORY))
+ conswitchp = &vga_con;
#elif defined(CONFIG_DUMMY_CONSOLE)
- conswitchp = &dummy_con;
+ conswitchp = &dummy_con;
+#endif
#endif
+ } else {
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ extern const struct consw xennull_con;
+ extern int console_use_vt;
+#if defined(CONFIG_VGA_CONSOLE)
+ /* disable VGA driver */
+ ORIG_VIDEO_ISVGA = VIDEO_TYPE_VLFB;
#endif
+ conswitchp = &xennull_con;
+ console_use_vt = 0;
+#endif
+ }
+}
+
+static int
+xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+ HYPERVISOR_crash();
+ /* we're never actually going to get here... */
+ return NOTIFY_DONE;
}
#include "setup_arch_post.h"
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/signal.c linux-2.6-xen-sparse/arch/i386/kernel/signal.c
--- pristine-linux-2.6.12/arch/i386/kernel/signal.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/signal.c	2005-08-02 00:59:44.000000000 -0700
@@ -599,7 +599,7 @@ int fastcall do_signal(struct pt_regs *r
* kernel mode. Just return without doing anything
* if so.
*/
- if ((regs->xcs & 3) != 3)
+ if ((regs->xcs & 2) != 2)
return 1;
if (current->flags & PF_FREEZE) {
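
The "& 2" tests here and in show_regs() earlier are worth a note: the
low two bits of %cs are the RPL, and native Linux checks RPL==3 because
only rings 0 and 3 are used. Under Xen the kernel runs in ring 1, so
bit 1 alone splits rings 0/1 (kernel) from rings 2/3 (user). A sketch;
the selector values below are illustrative only:

	#include <assert.h>

	static int user_mode_ex(unsigned int cs)
	{
		return (cs & 2) != 0;   /* rings 2/3 => user */
	}

	int main(void)
	{
		assert(!user_mode_ex(0x60));   /* RPL 0: ring-0 kernel */
		assert(!user_mode_ex(0x61));   /* RPL 1: ring-1 kernel */
		assert( user_mode_ex(0x73));   /* RPL 3: ring-3 user   */
		return 0;
	}
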
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/smpboot.c linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c
--- pristine-linux-2.6.12/arch/i386/kernel/smpboot.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/smpboot.c	2005-07-28 13:17:07.000000000 -0700
@@ -44,6 +44,9 @@
#include <linux/smp_lock.h>
#include <linux/irq.h>
#include <linux/bootmem.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/percpu.h>
#include <linux/delay.h>
#include <linux/mc146818rtc.h>
@@ -51,7 +54,11 @@
#include <asm/desc.h>
#include <asm/arch_hooks.h>
-#include <mach_apic.h>
+#include <asm/smp_alt.h>
+
+#ifndef CONFIG_X86_IO_APIC
+#define Dprintk(args...)
+#endif
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>
@@ -79,6 +86,7 @@ u8 x86_cpu_to_apicid[NR_CPUS] =
{ [0 ... NR_CPUS-1] = 0xff };
EXPORT_SYMBOL(x86_cpu_to_apicid);
+#if 0
/*
* Trampoline 80x86 program as an array.
*/
@@ -87,9 +95,19 @@ extern unsigned char trampoline_data [];
extern unsigned char trampoline_end [];
static unsigned char *trampoline_base;
static int trampoline_exec;
+#endif
-static void map_cpu_to_logical_apicid(void);
+#ifdef CONFIG_HOTPLUG_CPU
+/* State of each CPU. */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+#endif
+
+static DEFINE_PER_CPU(int, resched_irq);
+static DEFINE_PER_CPU(int, callfunc_irq);
+static char resched_name[NR_CPUS][15];
+static char callfunc_name[NR_CPUS][15];
+#if 0
/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
@@ -101,6 +119,9 @@ static unsigned long __init setup_trampo
 memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
return virt_to_phys(trampoline_base);
}
+#endif
+
+static void map_cpu_to_logical_apicid(void);
/*
* We are called very early to get the low memory for the
@@ -108,6 +129,15 @@ static unsigned long __init setup_trampo
*/
void __init smp_alloc_memory(void)
{
+#if 1
+ int cpu;
+
+ for (cpu = 1; cpu < NR_CPUS; cpu++) {
+ cpu_gdt_descr[cpu].address = (unsigned long)
+ alloc_bootmem_low_pages(PAGE_SIZE);
+ /* XXX free unused pages later */
+ }
+#else
trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE);
/*
* Has to be in very low memory so we can execute
@@ -119,6 +149,7 @@ void __init smp_alloc_memory(void)
* Make the SMP trampoline executable:
*/
trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1);
+#endif
}
/*
@@ -179,6 +210,7 @@ valid_k7:
;
}
+#if 0
/*
* TSC synchronization.
*
@@ -315,6 +347,7 @@ static void __init synchronize_tsc_ap (v
}
}
#undef NR_LOOPS
+#endif
extern void calibrate_delay(void);
@@ -325,6 +358,7 @@ static void __init smp_callin(void)
int cpuid, phys_id;
unsigned long timeout;
+#if 0
/*
* If waken up by an INIT in an 82489DX configuration
* we may get here before an INIT-deassert IPI reaches
@@ -332,11 +366,12 @@ static void __init smp_callin(void)
* lock up on an APIC access.
*/
wait_for_init_deassert(&init_deasserted);
+#endif
/*
* (This works even if the APIC is not enabled.)
*/
- phys_id = GET_APIC_ID(apic_read(APIC_ID));
+ phys_id = smp_processor_id();
cpuid = smp_processor_id();
if (cpu_isset(cpuid, cpu_callin_map)) {
printk("huh, phys CPU#%d, CPU#%d already present??\n",
@@ -372,6 +407,7 @@ static void __init smp_callin(void)
BUG();
}
+#if 0
/*
* the boot CPU has finished the init stage and is spinning
* on callin_map until we finish. We are free to set up this
@@ -382,6 +418,7 @@ static void __init smp_callin(void)
Dprintk("CALLIN, before setup_local_APIC().\n");
smp_callin_clear_local_apic();
setup_local_APIC();
+#endif
map_cpu_to_logical_apicid();
/*
@@ -395,22 +432,49 @@ static void __init smp_callin(void)
*/
smp_store_cpu_info(cpuid);
+#if 0
disable_APIC_timer();
+#endif
/*
* Allow the master to continue.
*/
cpu_set(cpuid, cpu_callin_map);
+#if 0
/*
* Synchronize the TSC with the BP
*/
if (cpu_has_tsc && cpu_khz)
synchronize_tsc_ap();
+#endif
}
static int cpucount;
+
+static irqreturn_t ldebug_interrupt(
+ int irq, void *dev_id, struct pt_regs *regs)
+{
+ return IRQ_HANDLED;
+}
+
+static DEFINE_PER_CPU(int, ldebug_irq);
+static char ldebug_name[NR_CPUS][15];
+
+void ldebug_setup(void)
+{
+ int cpu = smp_processor_id();
+
+ per_cpu(ldebug_irq, cpu) = bind_virq_to_irq(VIRQ_DEBUG);
+ sprintf(ldebug_name[cpu], "ldebug%d", cpu);
+ BUG_ON(request_irq(per_cpu(ldebug_irq, cpu), ldebug_interrupt,
+ SA_INTERRUPT, ldebug_name[cpu], NULL));
+}
+
+
+extern void local_setup_timer(void);
+
/*
* Activate a secondary processor.
*/
@@ -425,13 +489,10 @@ static void __init start_secondary(void
smp_callin();
while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
rep_nop();
- setup_secondary_APIC_clock();
- if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
- enable_NMI_through_LVT0(NULL);
- enable_8259A_irq(0);
- }
- enable_APIC_timer();
+ local_setup_timer();
+ ldebug_setup();
+ smp_intr_init();
+ local_irq_enable();
/*
* low-memory mappings have been cleared, flush them from
* the local TLBs too.
@@ -510,7 +571,7 @@ u8 cpu_2_logical_apicid[NR_CPUS] = { [0
static void map_cpu_to_logical_apicid(void)
{
int cpu = smp_processor_id();
- int apicid = logical_smp_processor_id();
+ int apicid = smp_processor_id();
cpu_2_logical_apicid[cpu] = apicid;
map_cpu_to_node(cpu, apicid_to_node(apicid));
@@ -560,6 +621,7 @@ static inline void __inquire_remote_apic
}
#endif
+#if 0
#ifdef WAKE_SECONDARY_VIA_NMI
/*
 * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
@@ -745,6 +807,7 @@ wakeup_secondary_cpu(int phys_apicid, un
return (send_status | accept_status);
}
#endif /* WAKE_SECONDARY_VIA_INIT */
+#endif
extern cpumask_t cpu_initialized;
@@ -759,7 +822,15 @@ static int __init do_boot_cpu(int apicid
unsigned long boot_error;
int timeout, cpu;
unsigned long start_eip;
+#if 0
unsigned short nmi_high = 0, nmi_low = 0;
+#endif
+ vcpu_guest_context_t ctxt;
+ extern void startup_32_smp(void);
+ extern void hypervisor_callback(void);
+ extern void failsafe_callback(void);
+ extern void smp_trap_init(trap_info_t *);
+ int i;
cpu = ++cpucount;
/*
@@ -771,7 +842,7 @@ static int __init do_boot_cpu(int apicid
panic("failed fork for CPU %d", cpu);
idle->thread.eip = (unsigned long) start_secondary;
/* start_eip had better be page-aligned! */
- start_eip = setup_trampoline();
+ start_eip = (unsigned long)startup_32_smp;
/* So we see what's up */
printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
@@ -787,6 +858,107 @@ static int __init do_boot_cpu(int apicid
atomic_set(&init_deasserted, 0);
+#if 1
+ if (cpu_gdt_descr[0].size > PAGE_SIZE)
+ BUG();
+ cpu_gdt_descr[cpu].size = cpu_gdt_descr[0].size;
+ printk("GDT: copying %d bytes from %lx to %lx\n",
+ cpu_gdt_descr[0].size, cpu_gdt_descr[0].address,
+ cpu_gdt_descr[cpu].address);
+ memcpy((void *)cpu_gdt_descr[cpu].address,
+ (void *)cpu_gdt_descr[0].address, cpu_gdt_descr[0].size);
+
+ memset(&ctxt, 0, sizeof(ctxt));
+
+ ctxt.user_regs.ds = __USER_DS;
+ ctxt.user_regs.es = __USER_DS;
+ ctxt.user_regs.fs = 0;
+ ctxt.user_regs.gs = 0;
+ ctxt.user_regs.ss = __KERNEL_DS;
+ ctxt.user_regs.cs = __KERNEL_CS;
+ ctxt.user_regs.eip = start_eip;
+ ctxt.user_regs.esp = idle->thread.esp;
+ ctxt.user_regs.eflags = (1<<9) | (1<<2) | (idle->thread.io_pl<<12);
+
+ /* FPU is set up to default initial state. */
+ memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt));
+
+ /* Virtual IDT is empty at start-of-day. */
+ for ( i = 0; i < 256; i++ )
+ {
+ ctxt.trap_ctxt[i].vector = i;
+ ctxt.trap_ctxt[i].cs = FLAT_KERNEL_CS;
+ }
+ smp_trap_init(ctxt.trap_ctxt);
+
+ /* No LDT. */
+ ctxt.ldt_ents = 0;
+
+ {
+ unsigned long va;
+ int f;
+
+ for (va = cpu_gdt_descr[cpu].address, f = 0;
+ va < cpu_gdt_descr[cpu].address + cpu_gdt_descr[cpu].size;
+ va += PAGE_SIZE, f++) {
+ ctxt.gdt_frames[f] = virt_to_machine(va) >> PAGE_SHIFT;
+ make_page_readonly((void *)va);
+ }
+ ctxt.gdt_ents = cpu_gdt_descr[cpu].size / 8;
+ }
+
+ /* Ring 1 stack is the initial stack. */
+ ctxt.kernel_ss = __KERNEL_DS;
+ ctxt.kernel_sp = idle->thread.esp;
+
+ /* Callback handlers. */
+ ctxt.event_callback_cs = __KERNEL_CS;
+ ctxt.event_callback_eip = (unsigned long)hypervisor_callback;
+ ctxt.failsafe_callback_cs = __KERNEL_CS;
+ ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback;
+
+ ctxt.ctrlreg[3] = (unsigned long)virt_to_machine(swapper_pg_dir);
+
+ boot_error = HYPERVISOR_boot_vcpu(cpu, &ctxt);
+ printk("boot error: %ld\n", boot_error);
+
+ if (!boot_error) {
+ /*
+ * allow APs to start initializing.
+ */
+ Dprintk("Before Callout %d.\n", cpu);
+ cpu_set(cpu, cpu_callout_map);
+ Dprintk("After Callout %d.\n", cpu);
+
+ /*
+ * Wait 5s total for a response
+ */
+ for (timeout = 0; timeout < 50000; timeout++) {
+ if (cpu_isset(cpu, cpu_callin_map))
+ break; /* It has booted */
+ udelay(100);
+ }
+
+ if (cpu_isset(cpu, cpu_callin_map)) {
+ /* number CPUs logically, starting from 1 (BSP is 0) */
+ Dprintk("OK.\n");
+ printk("CPU%d: ", cpu);
+ print_cpu_info(&cpu_data[cpu]);
+ Dprintk("CPU has booted.\n");
+ } else {
+ boot_error= 1;
+ }
+ }
+ x86_cpu_to_apicid[cpu] = apicid;
+ if (boot_error) {
+ /* Try to put things back the way they were before ... */
+ unmap_cpu_to_logical_apicid(cpu);
+ cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
+ cpucount--;
+ }
+
+#else
Dprintk("Setting warm reset code and vector.\n");
store_NMI_vector(&nmi_high, &nmi_low);
@@ -844,6 +1016,7 @@ static int __init do_boot_cpu(int apicid
/* mark "stuck" area as not stuck */
*((volatile unsigned long *)trampoline_base) = 0;
+#endif
return boot_error;
}
@@ -882,7 +1055,9 @@ static void smp_tune_scheduling (void)
* Cycle through the processors sending APIC IPIs to boot each.
*/
+#if 0
static int boot_cpu_logical_apicid;
+#endif
/* Where the IO area was mapped on multiquad, always 0 otherwise */
void *xquad_portio;
@@ -892,8 +1067,11 @@ EXPORT_SYMBOL(cpu_core_map);
static void __init smp_boot_cpus(unsigned int max_cpus)
{
- int apicid, cpu, bit, kicked;
+ int cpu, kicked;
unsigned long bogosum = 0;
+#if 0
+ int apicid, bit;
+#endif
/*
* Setup boot CPU information
@@ -902,9 +1080,15 @@ static void __init smp_boot_cpus(unsigne
printk("CPU%d: ", 0);
print_cpu_info(&cpu_data[0]);
+#if 0
boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
boot_cpu_logical_apicid = logical_smp_processor_id();
x86_cpu_to_apicid[0] = boot_cpu_physical_apicid;
+#else
+ // boot_cpu_physical_apicid = 0;
+ // boot_cpu_logical_apicid = 0;
+ x86_cpu_to_apicid[0] = 0;
+#endif
current_thread_info()->cpu = 0;
smp_tune_scheduling();
@@ -914,6 +1098,7 @@ static void __init smp_boot_cpus(unsigne
cpus_clear(cpu_core_map[0]);
cpu_set(0, cpu_core_map[0]);
+#ifdef CONFIG_X86_IO_APIC
/*
* If we couldn't find an SMP configuration at boot time,
* get out of here now!
@@ -921,16 +1106,22 @@ static void __init smp_boot_cpus(unsigne
if (!smp_found_config && !acpi_lapic) {
printk(KERN_NOTICE "SMP motherboard not detected.\n");
smpboot_clear_io_apic_irqs();
+#if 0
phys_cpu_present_map = physid_mask_of_physid(0);
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
if (APIC_init_uniprocessor())
printk(KERN_NOTICE "Local APIC not detected."
" Using dummy APIC emulation.\n");
+#endif
map_cpu_to_logical_apicid();
cpu_set(0, cpu_sibling_map[0]);
cpu_set(0, cpu_core_map[0]);
return;
}
+#endif
+#if 0
/*
* Should not be necessary because the MP table should list the boot
* CPU too, but we do it for the sake of robustness anyway.
@@ -953,27 +1144,35 @@ static void __init smp_boot_cpus(unsigne
phys_cpu_present_map = physid_mask_of_physid(0);
cpu_set(0, cpu_sibling_map[0]);
cpu_set(0, cpu_core_map[0]);
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
return;
}
verify_local_APIC();
+#endif
/*
* If SMP should be disabled, then really disable it!
*/
if (!max_cpus) {
- smp_found_config = 0;
+ HYPERVISOR_shared_info->n_vcpu = 1;
printk(KERN_INFO "SMP mode deactivated, forcing use of dummy
APIC emulation.\n");
smpboot_clear_io_apic_irqs();
+#if 0
phys_cpu_present_map = physid_mask_of_physid(0);
- cpu_set(0, cpu_sibling_map[0]);
- cpu_set(0, cpu_core_map[0]);
+#endif
return;
}
+ smp_intr_init();
+
+#if 0
connect_bsp_APIC();
setup_local_APIC();
+#endif
map_cpu_to_logical_apicid();
+#if 0
setup_portio_remap();
@@ -986,32 +1185,33 @@ static void __init smp_boot_cpus(unsigne
* clustered apic ID.
*/
Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
+#endif
+ Dprintk("CPU present map: %lx\n",
+ (1UL << HYPERVISOR_shared_info->n_vcpu) - 1);
kicked = 1;
- for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
- apicid = cpu_present_to_apicid(bit);
- /*
- * Don't even attempt to start the boot CPU!
- */
- if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
- continue;
-
- if (!check_apicid_present(bit))
- continue;
+ for (cpu = 1; kicked < NR_CPUS &&
+ cpu < HYPERVISOR_shared_info->n_vcpu; cpu++) {
if (max_cpus <= cpucount+1)
continue;
- if (do_boot_cpu(apicid))
+#ifdef CONFIG_SMP_ALTERNATIVES
+ if (kicked == 1)
+ prepare_for_smp();
+#endif
+ if (do_boot_cpu(cpu))
printk("CPU #%d not responding - cannot use it.\n",
- apicid);
+ cpu);
else
++kicked;
}
+#if 0
/*
* Cleanup possible dangling ends...
*/
smpboot_restore_warm_reset_vector();
+#endif
/*
* Allow the user to impress friends.
@@ -1078,7 +1278,6 @@ static void __init smp_boot_cpus(unsigne
 printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
smp_num_siblings = siblings;
}
-
if (c->x86_num_cores > 1) {
for (i = 0; i < NR_CPUS; i++) {
if (!cpu_isset(i, cpu_callout_map))
@@ -1094,6 +1293,7 @@ static void __init smp_boot_cpus(unsigne
smpboot_setup_io_apic();
+#if 0
setup_boot_APIC_clock();
/*
@@ -1101,12 +1301,16 @@ static void __init smp_boot_cpus(unsigne
*/
if (cpu_has_tsc && cpucount && cpu_khz)
synchronize_tsc_bp();
+#endif
}
/* These are wrappers to interface to the new boot process. Someone
who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
void __init smp_prepare_cpus(unsigned int max_cpus)
{
+ smp_commenced_mask = cpumask_of_cpu(0);
+ cpu_callin_map = cpumask_of_cpu(0);
+ mb();
smp_boot_cpus(max_cpus);
}
@@ -1116,20 +1320,189 @@ void __devinit smp_prepare_boot_cpu(void
cpu_set(smp_processor_id(), cpu_callout_map);
}
-int __devinit __cpu_up(unsigned int cpu)
+#ifdef CONFIG_HOTPLUG_CPU
+#include <asm-xen/ctrl_if.h>
+
+/* hotplug down/up function pointer and target vcpu */
+struct vcpu_hotplug_handler_t {
+ void (*fn)(int vcpu);
+ u32 vcpu;
+};
+static struct vcpu_hotplug_handler_t vcpu_hotplug_handler;
+
+/* must be called with the cpucontrol mutex held */
+static int __devinit cpu_enable(unsigned int cpu)
+{
+#ifdef CONFIG_SMP_ALTERNATIVES
+ if (num_online_cpus() == 1)
+ prepare_for_smp();
+#endif
+
+ /* get the target out of its holding state */
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ wmb();
+
+ /* wait for the processor to ack it. timeout? */
+ while (!cpu_online(cpu))
+ cpu_relax();
+
+ fixup_irqs(cpu_online_map);
+
+ /* counter the disable in fixup_irqs() */
+ local_irq_enable();
+ return 0;
+}
+
+int __cpu_disable(void)
{
- /* This only works at boot for x86. See "rewrite" above. */
- if (cpu_isset(cpu, smp_commenced_mask)) {
- local_irq_enable();
- return -ENOSYS;
+ cpumask_t map = cpu_online_map;
+ int cpu = smp_processor_id();
+
+ /*
+ * Perhaps use cpufreq to drop frequency, but that could go
+ * into generic code.
+ *
+ * We won't take down the boot processor on i386 due to some
+ * interrupts only being able to be serviced by the BSP.
+ * Especially so if we're not using an IOAPIC -zwane
+ */
+ if (cpu == 0)
+ return -EBUSY;
+
+ cpu_clear(cpu, map);
+ fixup_irqs(map);
+
+ /* It's now safe to remove this processor from the online map */
+ cpu_clear(cpu, cpu_online_map);
+
+#ifdef CONFIG_SMP_ALTERNATIVES
+ if (num_online_cpus() == 1)
+ unprepare_for_smp();
+#endif
+
+ return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We don't do anything here: idle task is faking death itself. */
+ unsigned int i;
+
+ for (i = 0; i < 10; i++) {
+ /* They ack this in play_dead by setting CPU_DEAD */
+ if (per_cpu(cpu_state, cpu) == CPU_DEAD)
+ return;
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(HZ/10);
+ }
+ printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+static int vcpu_hotplug_cpu_process(void *unused)
+{
+ struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler;
+
+ if (handler->fn) {
+ (*(handler->fn))(handler->vcpu);
+ handler->fn = NULL;
}
+ return 0;
+}
+
+static void __vcpu_hotplug_handler(void *unused)
+{
+ int err;
+
+ err = kernel_thread(vcpu_hotplug_cpu_process,
+ NULL, CLONE_FS | CLONE_FILES);
+ if (err < 0)
+ printk(KERN_ALERT "Error creating hotplug_cpu process!\n");
+
+}
+
+static void vcpu_hotplug_event_handler(ctrl_msg_t *msg, unsigned long id)
+{
+ static DECLARE_WORK(vcpu_hotplug_work, __vcpu_hotplug_handler, NULL);
+ vcpu_hotplug_t *req = (vcpu_hotplug_t *)&msg->msg[0];
+ struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler;
+ ssize_t ret;
+
+ if (msg->length != sizeof(vcpu_hotplug_t))
+ goto parse_error;
+
+ /* grab target vcpu from msg */
+ handler->vcpu = req->vcpu;
+
+ /* determine which function to call based on msg subtype */
+ switch (msg->subtype) {
+ case CMSG_VCPU_HOTPLUG_OFF:
+ handler->fn = (void *)&cpu_down;
+ ret = schedule_work(&vcpu_hotplug_work);
+ req->status = (u32) ret;
+ break;
+ case CMSG_VCPU_HOTPLUG_ON:
+ handler->fn = (void *)&cpu_up;
+ ret = schedule_work(&vcpu_hotplug_work);
+ req->status = (u32) ret;
+ break;
+ default:
+ goto parse_error;
+ }
+
+ ctrl_if_send_response(msg);
+ return;
+ parse_error:
+ msg->length = 0;
+ ctrl_if_send_response(msg);
+}
+
+static int __init setup_vcpu_hotplug_event(void)
+{
+ struct vcpu_hotplug_handler_t *handler = &vcpu_hotplug_handler;
+
+ handler->fn = NULL;
+ ctrl_if_register_receiver(CMSG_VCPU_HOTPLUG,
+ vcpu_hotplug_event_handler, 0);
+
+ return 0;
+}
+
+__initcall(setup_vcpu_hotplug_event);
+#else /* ... !CONFIG_HOTPLUG_CPU */
+int __cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We said "no" in __cpu_disable */
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+int __devinit __cpu_up(unsigned int cpu)
+{
/* In case one didn't come up */
if (!cpu_isset(cpu, cpu_callin_map)) {
+ printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
local_irq_enable();
return -EIO;
}
+#ifdef CONFIG_HOTPLUG_CPU
+#ifdef CONFIG_XEN
+ /* Tell hypervisor to bring vcpu up. */
+ HYPERVISOR_vcpu_up(cpu);
+#endif
+ /* Already up, and in cpu_quiescent now? */
+ if (cpu_isset(cpu, smp_commenced_mask)) {
+ cpu_enable(cpu);
+ return 0;
+ }
+#endif
+
local_irq_enable();
/* Unleash the CPU! */
cpu_set(cpu, smp_commenced_mask);
@@ -1140,6 +1513,8 @@ int __devinit __cpu_up(unsigned int cpu)
void __init smp_cpus_done(unsigned int max_cpus)
{
+#if 1
+#else
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
@@ -1148,25 +1523,26 @@ void __init smp_cpus_done(unsigned int m
* Disable executability of the SMP trampoline:
*/
set_kernel_exec((unsigned long)trampoline_base, trampoline_exec);
+#endif
}
+extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *);
+extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *);
+
void __init smp_intr_init(void)
{
- /*
- * IRQ0 must be given a fixed assignment and initialized,
- * because it's used before the IO-APIC is set up.
- */
- set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
-
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPI for invalidation */
- set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+ int cpu = smp_processor_id();
- /* IPI for generic function call */
- set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+ per_cpu(resched_irq, cpu) =
+ bind_ipi_on_cpu_to_irq(RESCHEDULE_VECTOR);
+ sprintf(resched_name[cpu], "resched%d", cpu);
+ BUG_ON(request_irq(per_cpu(resched_irq, cpu), smp_reschedule_interrupt,
+ SA_INTERRUPT, resched_name[cpu], NULL));
+
+ per_cpu(callfunc_irq, cpu) =
+ bind_ipi_on_cpu_to_irq(CALL_FUNCTION_VECTOR);
+ sprintf(callfunc_name[cpu], "callfunc%d", cpu);
+ BUG_ON(request_irq(per_cpu(callfunc_irq, cpu),
+ smp_call_function_interrupt,
+ SA_INTERRUPT, callfunc_name[cpu], NULL));
}
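
The hotplug path above boils down to a handshake on the per-cpu
cpu_state variable: the dying vcpu stores CPU_DEAD in play_dead() and
spins in HYPERVISOR_yield() until someone stores CPU_UP_PREPARE;
cpu_enable() performs that store, and __cpu_die() just polls for
CPU_DEAD. A toy model with a thread standing in for the vcpu (names
invented; compile with -pthread):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	enum { UP_PREPARE_EX = 1, DEAD_EX = 2 };
	static _Atomic int cpu_state_ex;

	static void *vcpu_ex(void *arg)        /* play_dead() side */
	{
		(void)arg;
		atomic_store(&cpu_state_ex, DEAD_EX);    /* ack the down */
		while (atomic_load(&cpu_state_ex) != UP_PREPARE_EX)
			;                      /* HYPERVISOR_yield() loop */
		printf("vcpu: back up\n");
		return NULL;
	}

	int main(void)                 /* __cpu_die()/cpu_enable() side */
	{
		pthread_t t;

		pthread_create(&t, NULL, vcpu_ex, NULL);
		while (atomic_load(&cpu_state_ex) != DEAD_EX)
			;                              /* wait for ack */
		atomic_store(&cpu_state_ex, UP_PREPARE_EX); /* revive it */
		pthread_join(t, NULL);
		return 0;
	}
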
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/smp.c linux-2.6-xen-sparse/arch/i386/kernel/smp.c
--- pristine-linux-2.6.12/arch/i386/kernel/smp.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/smp.c	2005-07-28 13:17:07.000000000 -0700
@@ -19,10 +19,16 @@
#include <linux/mc146818rtc.h>
#include <linux/cache.h>
#include <linux/interrupt.h>
+#include <linux/cpu.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
+#if 0
#include <mach_apic.h>
+#endif
+#include <asm-xen/evtchn.h>
+
+#define xxprint(msg) HYPERVISOR_console_io(CONSOLEIO_write, strlen(msg), msg)
/*
* Some notes on x86 processor bugs affecting SMP operation:
@@ -121,31 +127,49 @@ static inline int __prepare_ICR2 (unsign
return SET_APIC_DEST_FIELD(mask);
}
-void __send_IPI_shortcut(unsigned int shortcut, int vector)
+DECLARE_PER_CPU(int, ipi_to_evtchn[NR_IPIS]);
+
+static inline void __send_IPI_one(unsigned int cpu, int vector)
{
- /*
- * Subtle. In the case of the 'never do double writes' workaround
- * we have to lock out interrupts to be safe. As we don't care
- * of the value read we use an atomic rmw access to avoid costly
- * cli/sti. Otherwise we use an even cheaper single atomic write
- * to the APIC.
- */
- unsigned int cfg;
+ unsigned int evtchn;
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
+ evtchn = per_cpu(ipi_to_evtchn, cpu)[vector];
+ // printk("send_IPI_mask_bitmask cpu %d vector %d evtchn %d\n", cpu,
vector, evtchn);
+ if (evtchn) {
+#if 0
+ shared_info_t *s = HYPERVISOR_shared_info;
+ while (synch_test_bit(evtchn, &s->evtchn_pending[0]) ||
+ synch_test_bit(evtchn, &s->evtchn_mask[0]))
+ ;
+#endif
+ notify_via_evtchn(evtchn);
+ } else
+ printk("send_IPI to unbound port %d/%d",
+ cpu, vector);
+}
- /*
- * No need to touch the target chip field
- */
- cfg = __prepare_ICR(shortcut, vector);
+void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+ int cpu;
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write_around(APIC_ICR, cfg);
+ switch (shortcut) {
+ case APIC_DEST_SELF:
+ __send_IPI_one(smp_processor_id(), vector);
+ break;
+ case APIC_DEST_ALLBUT:
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ if (cpu == smp_processor_id())
+ continue;
+ if (cpu_isset(cpu, cpu_online_map)) {
+ __send_IPI_one(cpu, vector);
+ }
+ }
+ break;
+ default:
+ printk("XXXXXX __send_IPI_shortcut %08x vector %d\n", shortcut,
+ vector);
+ break;
+ }
}
void fastcall send_IPI_self(int vector)
@@ -156,81 +180,32 @@ void fastcall send_IPI_self(int vector)
/*
* This is only used on smaller machines.
*/
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+void send_IPI_mask_bitmask(cpumask_t mask, int vector)
{
- unsigned long mask = cpus_addr(cpumask)[0];
- unsigned long cfg;
unsigned long flags;
+ unsigned int cpu;
local_irq_save(flags);
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(mask);
- apic_write_around(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write_around(APIC_ICR, cfg);
+ WARN_ON(cpus_addr(mask)[0] & ~cpus_addr(cpu_online_map)[0]);
+
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) {
+ if (cpu_isset(cpu, mask)) {
+ __send_IPI_one(cpu, vector);
+ }
+ }
local_irq_restore(flags);
}
void send_IPI_mask_sequence(cpumask_t mask, int vector)
{
- unsigned long cfg, flags;
- unsigned int query_cpu;
-
- /*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicasts to each CPU instead. This
- * should be modified to do 1 message per cluster ID - mbligh
- */
- local_irq_save(flags);
-
- for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
- if (cpu_isset(query_cpu, mask)) {
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(cpu_to_logical_apicid(query_cpu));
- apic_write_around(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write_around(APIC_ICR, cfg);
- }
- }
- local_irq_restore(flags);
+ send_IPI_mask_bitmask(mask, vector);
}
#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
+#if 0 /* XEN */
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
@@ -308,7 +283,8 @@ static inline void leave_mm (unsigned lo
* 2) Leave the mm if we are in the lazy tlb mode.
*/
-fastcall void smp_invalidate_interrupt(struct pt_regs *regs)
+irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
{
unsigned long cpu;
@@ -334,32 +310,33 @@ fastcall void smp_invalidate_interrupt(s
} else
leave_mm(cpu);
}
- ack_APIC_irq();
smp_mb__before_clear_bit();
cpu_clear(cpu, flush_cpumask);
smp_mb__after_clear_bit();
out:
put_cpu_no_resched();
+
+ return IRQ_HANDLED;
}
static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
unsigned long va)
{
- cpumask_t tmp;
/*
* A couple of (to be removed) sanity checks:
*
- * - we do not send IPIs to not-yet booted CPUs.
* - current CPU must not be in mask
* - mask must exist :)
*/
BUG_ON(cpus_empty(cpumask));
-
- cpus_and(tmp, cpumask, cpu_online_map);
- BUG_ON(!cpus_equal(cpumask, tmp));
BUG_ON(cpu_isset(smp_processor_id(), cpumask));
BUG_ON(!mm);
+ /* If a CPU which we ran on has gone down, OK. */
+ cpus_and(cpumask, cpumask, cpu_online_map);
+ if (cpus_empty(cpumask))
+ return;
+
/*
* i'm not happy about this global shared spinlock in the
* MM hot path, but we'll see how contended it is.
@@ -443,7 +420,7 @@ void flush_tlb_page(struct vm_area_struc
if (current->active_mm == mm) {
if(current->mm)
__flush_tlb_one(va);
- else
+ else
leave_mm(smp_processor_id());
}
@@ -467,6 +444,22 @@ void flush_tlb_all(void)
on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
}
+#else
+
+irqreturn_t smp_invalidate_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
+{ return 0; }
+void flush_tlb_current_task(void)
{ xen_tlb_flush_mask(&current->mm->cpu_vm_mask); }
+void flush_tlb_mm(struct mm_struct * mm)
+{ xen_tlb_flush_mask(&mm->cpu_vm_mask); }
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+{ xen_invlpg_mask(&vma->vm_mm->cpu_vm_mask, va); }
+void flush_tlb_all(void)
+{ xen_tlb_flush_all(); }
+
+#endif /* XEN */
+
/*
* this function sends a 'reschedule' IPI to another CPU.
* it goes straight through and wastes no time serializing
@@ -474,6 +467,7 @@ void flush_tlb_all(void)
*/
void smp_send_reschedule(int cpu)
{
+ WARN_ON(cpu_is_offline(cpu));
send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}
@@ -514,10 +508,16 @@ int smp_call_function (void (*func) (voi
*/
{
struct call_data_struct data;
- int cpus = num_online_cpus()-1;
+ int cpus;
- if (!cpus)
+ /* Holding any lock stops cpus from going down. */
+ spin_lock(&call_lock);
+ cpus = num_online_cpus()-1;
+
+ if (!cpus) {
+ spin_unlock(&call_lock);
return 0;
+ }
/* Can deadlock when called with interrupts disabled */
WARN_ON(irqs_disabled());
@@ -529,7 +529,6 @@ int smp_call_function (void (*func) (voi
if (wait)
atomic_set(&data.finished, 0);
- spin_lock(&call_lock);
call_data = &data;
mb();
@@ -538,11 +537,11 @@ int smp_call_function (void (*func) (voi
/* Wait for response */
while (atomic_read(&data.started) != cpus)
- cpu_relax();
+ barrier();
if (wait)
while (atomic_read(&data.finished) != cpus)
- cpu_relax();
+ barrier();
spin_unlock(&call_lock);
return 0;
@@ -555,7 +554,11 @@ static void stop_this_cpu (void * dummy)
*/
cpu_clear(smp_processor_id(), cpu_online_map);
local_irq_disable();
+#if 1
+ xxprint("stop_this_cpu disable_local_APIC\n");
+#else
disable_local_APIC();
+#endif
if (cpu_data[smp_processor_id()].hlt_works_ok)
for(;;) __asm__("hlt");
for (;;);
@@ -570,7 +573,11 @@ void smp_send_stop(void)
smp_call_function(stop_this_cpu, NULL, 1, 0);
local_irq_disable();
+#if 1
+ xxprint("smp_send_stop disable_local_APIC\n");
+#else
disable_local_APIC();
+#endif
local_irq_enable();
}
@@ -579,18 +586,21 @@ void smp_send_stop(void)
* all the work is done automatically when
* we return from the interrupt.
*/
-fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
+irqreturn_t smp_reschedule_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
{
- ack_APIC_irq();
+
+ return IRQ_HANDLED;
}
-fastcall void smp_call_function_interrupt(struct pt_regs *regs)
+#include <linux/kallsyms.h>
+irqreturn_t smp_call_function_interrupt(int irq, void *dev_id,
+ struct pt_regs *regs)
{
void (*func) (void *info) = call_data->func;
void *info = call_data->info;
int wait = call_data->wait;
- ack_APIC_irq();
/*
* Notify initiating CPU that I've grabbed the data and am
* about to execute the function
@@ -608,5 +618,7 @@ fastcall void smp_call_function_interrup
mb();
atomic_inc(&call_data->finished);
}
+
+ return IRQ_HANDLED;
}
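
With no local APIC in the picture, an IPI in the smp.c rewrite above is
just a notification on a pre-bound per-cpu event channel:
__send_IPI_one() is a table lookup plus notify_via_evtchn(), with the
binding done by smp_intr_init() in the smpboot.c hunk. The skeleton of
that, with invented names:

	#include <stdio.h>

	#define NR_CPUS_EX 4
	#define NR_IPIS_EX 8

	/* Per-cpu vector -> event channel table. */
	static int ipi_to_evtchn_ex[NR_CPUS_EX][NR_IPIS_EX];

	static void notify_ex(int evtchn)   /* models notify_via_evtchn() */
	{
		printf("kick event channel %d\n", evtchn);
	}

	static void send_ipi_one_ex(int cpu, int vector)
	{
		int evtchn = ipi_to_evtchn_ex[cpu][vector];

		if (evtchn)
			notify_ex(evtchn);
		else
			printf("send_IPI to unbound port %d/%d\n",
			       cpu, vector);
	}

	int main(void)
	{
		ipi_to_evtchn_ex[1][0] = 17;  /* pretend RESCHEDULE bound */
		send_ipi_one_ex(1, 0);        /* kicks channel 17 */
		send_ipi_one_ex(2, 0);        /* unbound: logged instead */
		return 0;
	}
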
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/kernel/time.c linux-2.6-xen-sparse/arch/i386/kernel/time.c
--- pristine-linux-2.6.12/arch/i386/kernel/time.c	2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/time.c	2005-07-28 13:17:07.000000000 -0700
@@ -46,6 +46,8 @@
#include <linux/bcd.h>
#include <linux/efi.h>
#include <linux/mca.h>
+#include <linux/sysctl.h>
+#include <linux/percpu.h>
#include <asm/io.h>
#include <asm/smp.h>
@@ -71,13 +73,24 @@
extern spinlock_t i8259A_lock;
int pit_latch_buggy; /* extern */
-#include "do_timer.h"
-
u64 jiffies_64 = INITIAL_JIFFIES;
EXPORT_SYMBOL(jiffies_64);
+#if defined(__x86_64__)
+unsigned long vxtime_hz = PIT_TICK_RATE;
+struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */
+volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
+unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES;
+struct timespec __xtime __section_xtime;
+struct timezone __sys_tz __section_sys_tz;
+#endif
+
+#if defined(__x86_64__)
+unsigned int cpu_khz; /* Detected as we calibrate the TSC */
+#else
unsigned long cpu_khz; /* Detected as we calibrate the TSC */
+#endif
extern unsigned long wall_jiffies;
@@ -86,7 +99,210 @@ DEFINE_SPINLOCK(rtc_lock);
DEFINE_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);
-struct timer_opts *cur_timer = &timer_none;
+extern struct init_timer_opts timer_tsc_init;
+extern struct timer_opts timer_tsc;
+struct timer_opts *cur_timer = &timer_tsc;
+
+/* These are periodically updated in shared_info, and then copied here. */
+struct shadow_time_info {
+ u64 tsc_timestamp; /* TSC at last update of time vals. */
+ u64 system_timestamp; /* Time, in nanosecs, since boot. */
+ u32 tsc_to_nsec_mul;
+ u32 tsc_to_usec_mul;
+ int tsc_shift;
+ u32 version;
+};
+static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
+static struct timeval shadow_tv;
+
+/* Keep track of last time we did processing/updating of jiffies and xtime. */
+static u64 processed_system_time; /* System time (ns) at last processing. */
+static DEFINE_PER_CPU(u64, processed_system_time);
+
+#define NS_PER_TICK (1000000000ULL/HZ)
+
+#define HANDLE_USEC_UNDERFLOW(_tv) do { \
+ while ((_tv).tv_usec < 0) { \
+ (_tv).tv_usec += USEC_PER_SEC; \
+ (_tv).tv_sec--; \
+ } \
+} while (0)
+#define HANDLE_USEC_OVERFLOW(_tv) do { \
+ while ((_tv).tv_usec >= USEC_PER_SEC) { \
+ (_tv).tv_usec -= USEC_PER_SEC; \
+ (_tv).tv_sec++; \
+ } \
+} while (0)
+static inline void __normalize_time(time_t *sec, s64 *nsec)
+{
+ while (*nsec >= NSEC_PER_SEC) {
+ (*nsec) -= NSEC_PER_SEC;
+ (*sec)++;
+ }
+ while (*nsec < 0) {
+ (*nsec) += NSEC_PER_SEC;
+ (*sec)--;
+ }
+}
+
+/* Does this guest OS track Xen time, or set its wall clock independently? */
+static int independent_wallclock = 0;
+static int __init __independent_wallclock(char *str)
+{
+ independent_wallclock = 1;
+ return 1;
+}
+__setup("independent_wallclock", __independent_wallclock);
+#define INDEPENDENT_WALLCLOCK() \
+ (independent_wallclock || (xen_start_info.flags & SIF_INITDOMAIN))
+
+int tsc_disable __initdata = 0;
+
+static void delay_tsc(unsigned long loops)
+{
+ unsigned long bclock, now;
+
+ rdtscl(bclock);
+ do
+ {
+ rep_nop();
+ rdtscl(now);
+ } while ((now-bclock) < loops);
+}
+
+struct timer_opts timer_tsc = {
+ .name = "tsc",
+ .delay = delay_tsc,
+};
+
+static inline u32 down_shift(u64 time, int shift)
+{
+ if ( shift < 0 )
+ return (u32)(time >> -shift);
+ return (u32)((u32)time << shift);
+}
+
+/*
+ * 32-bit multiplication of integer multiplicand and fractional multiplier
+ * yielding 32-bit integer product.
+ */
+static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
+{
+ u32 product_int, product_frac;
+ __asm__ (
+ "mul %3"
+ : "=a" (product_frac), "=d" (product_int)
+ : "0" (multiplicand), "r" (multiplier) );
+ return product_int;
+}
+
+void init_cpu_khz(void)
+{
+ u64 __cpu_khz = 1000000ULL << 32;
+ struct vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_time[0];
+ do_div(__cpu_khz, info->tsc_to_system_mul);
+ cpu_khz = down_shift(__cpu_khz, -info->tsc_shift);
+ printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n",
+ cpu_khz / 1000, cpu_khz % 1000);
+}
+
+static u64 get_nsec_offset(struct shadow_time_info *shadow)
+{
+ u64 now;
+ u32 delta;
+ rdtscll(now);
+ delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift);
+ return mul_frac(delta, shadow->tsc_to_nsec_mul);
+}
+
+static unsigned long get_usec_offset(struct shadow_time_info *shadow)
+{
+ u64 now;
+ u32 delta;
+ rdtscll(now);
+ delta = down_shift(now - shadow->tsc_timestamp, shadow->tsc_shift);
+ return mul_frac(delta, shadow->tsc_to_usec_mul);
+}
+
+static void update_wallclock(void)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ long wtm_nsec, xtime_nsec;
+ time_t wtm_sec, xtime_sec;
+ u64 tmp, usec;
+
+ shadow_tv.tv_sec = s->wc_sec;
+ shadow_tv.tv_usec = s->wc_usec;
+
+ if (INDEPENDENT_WALLCLOCK())
+ return;
+
+ if ((time_status & STA_UNSYNC) != 0)
+ return;
+
+ /* Adjust wall-clock time base based on wall_jiffies ticks. */
+ usec = processed_system_time;
+ do_div(usec, 1000);
+ usec += (u64)shadow_tv.tv_sec * 1000000ULL;
+ usec += (u64)shadow_tv.tv_usec;
+ usec -= (jiffies - wall_jiffies) * (USEC_PER_SEC / HZ);
+
+ /* Split wallclock base into seconds and nanoseconds. */
+ tmp = usec;
+ xtime_nsec = do_div(tmp, 1000000) * 1000ULL;
+ xtime_sec = (time_t)tmp;
+
+ wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - xtime_sec);
+ wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - xtime_nsec);
+
+ set_normalized_timespec(&xtime, xtime_sec, xtime_nsec);
+ set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+}
+
+/*
+ * Reads a consistent set of time-base values from Xen, into a shadow data
+ * area. Must be called with the xtime_lock held for writing.
+ */
+static void __get_time_values_from_xen(void)
+{
+ shared_info_t *s = HYPERVISOR_shared_info;
+ struct vcpu_time_info *src;
+ struct shadow_time_info *dst;
+
+ src = &s->vcpu_time[smp_processor_id()];
+ dst = &per_cpu(shadow_time, smp_processor_id());
+
+ do {
+ dst->version = src->time_version2;
+ rmb();
+ dst->tsc_timestamp = src->tsc_timestamp;
+ dst->system_timestamp = src->system_time;
+ dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
+ dst->tsc_shift = src->tsc_shift;
+ rmb();
+ }
+ while (dst->version != src->time_version1);
+
+ dst->tsc_to_usec_mul = dst->tsc_to_nsec_mul / 1000;
+
+ if ((shadow_tv.tv_sec != s->wc_sec) ||
+ (shadow_tv.tv_usec != s->wc_usec))
+ update_wallclock();
+}
+
+static inline int time_values_up_to_date(int cpu)
+{
+ struct vcpu_time_info *src;
+ struct shadow_time_info *dst;
+
+ src = &HYPERVISOR_shared_info->vcpu_time[cpu];
+ dst = &per_cpu(shadow_time, cpu);
+
+ return (dst->version == src->time_version2);
+}
+
+#define TIME_VALUES_UP_TO_DATE time_values_up_to_date(smp_processor_id())
/*
* This is a special lock that is owned by the CPU and holds the index
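
A note on the conversion helpers above: Xen publishes a per-VCPU
(tsc_to_system_mul, tsc_shift) pair, and down_shift()/mul_frac() apply it
as a scaled fixed-point multiply. Below is a minimal standalone sketch of
the same arithmetic -- not part of the patch, and the 2GHz figure is
invented for illustration.

#include <stdint.h>
#include <stdio.h>

/* Same helpers as in the patch, with kernel types swapped for stdint. */
static uint32_t down_shift(uint64_t time, int shift)
{
	if (shift < 0)
		return (uint32_t)(time >> -shift);
	return (uint32_t)((uint32_t)time << shift);
}

/* 32x32->64 multiply keeping the integer part: (a * b) >> 32. */
static uint32_t mul_frac(uint32_t multiplicand, uint32_t multiplier)
{
	return (uint32_t)(((uint64_t)multiplicand * multiplier) >> 32);
}

int main(void)
{
	/* Hypothetical 2GHz CPU: 0.5ns per cycle, expressed as the
	 * 0.32 fixed-point fraction 0.5 * 2^32. */
	uint32_t tsc_to_nsec_mul = 0x80000000u;
	int tsc_shift = 0;
	uint64_t tsc_delta = 4000000;	/* 4e6 cycles since last update */

	uint32_t ns = mul_frac(down_shift(tsc_delta, tsc_shift),
			       tsc_to_nsec_mul);
	printf("%u ns\n", (unsigned)ns);	/* prints 2000000, i.e. 2ms */
	return 0;
}
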
@@ -126,13 +342,20 @@ void do_gettimeofday(struct timeval *tv)
unsigned long seq;
unsigned long usec, sec;
unsigned long max_ntp_tick;
+ unsigned long flags;
+ s64 nsec;
+ unsigned int cpu;
+ struct shadow_time_info *shadow;
+
+ cpu = get_cpu();
+ shadow = &per_cpu(shadow_time, cpu);
do {
unsigned long lost;
seq = read_seqbegin(&xtime_lock);
- usec = cur_timer->get_offset();
+ usec = get_usec_offset(shadow);
lost = jiffies - wall_jiffies;
/*
@@ -151,11 +374,31 @@ void do_gettimeofday(struct timeval *tv)
usec += lost * (USEC_PER_SEC / HZ);
sec = xtime.tv_sec;
- usec += (xtime.tv_nsec / 1000);
+ usec += (xtime.tv_nsec / NSEC_PER_USEC);
+
+ nsec = shadow->system_timestamp - processed_system_time;
+ __normalize_time(&sec, &nsec);
+ usec += (long)nsec / NSEC_PER_USEC;
+
+ if (unlikely(!time_values_up_to_date(cpu))) {
+ /*
+ * We may have blocked for a long time,
+ * rendering our calculations invalid
+ * (e.g. the time delta may have
+ * overflowed). Detect that and recalculate
+ * with fresh values.
+ */
+ write_seqlock_irqsave(&xtime_lock, flags);
+ __get_time_values_from_xen();
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ continue;
+ }
} while (read_seqretry(&xtime_lock, seq));
- while (usec >= 1000000) {
- usec -= 1000000;
+ put_cpu();
+
+ while (usec >= USEC_PER_SEC) {
+ usec -= USEC_PER_SEC;
sec++;
}
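
The recalculation path above relies on the version handshake in
__get_time_values_from_xen(). Reduced to its essentials, the protocol
looks like the sketch below; the struct and field names are hypothetical
stand-ins for vcpu_time_info, and the rmb() barriers from the patch are
shown as comments.

struct snapshot {
	unsigned version2;		/* producer bumps before writing */
	unsigned long long payload;	/* stands in for the time fields */
	unsigned version1;		/* producer bumps after writing */
};

/* Consumer: copy, then check that no update raced with the copy. */
static void read_snapshot(volatile struct snapshot *src,
			  struct snapshot *dst)
{
	do {
		dst->version2 = src->version2;
		/* rmb(); */
		dst->payload = src->payload;
		/* rmb(); */
	} while (dst->version2 != src->version1);
}
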
@@ -168,21 +411,49 @@ EXPORT_SYMBOL(do_gettimeofday);
int do_settimeofday(struct timespec *tv)
{
time_t wtm_sec, sec = tv->tv_sec;
- long wtm_nsec, nsec = tv->tv_nsec;
+ long wtm_nsec;
+ s64 nsec;
+ struct timespec xentime;
+ unsigned int cpu;
+ struct shadow_time_info *shadow;
if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
+ if (!INDEPENDENT_WALLCLOCK())
+ return 0; /* Silent failure? */
+
+ cpu = get_cpu();
+ shadow = &per_cpu(shadow_time, cpu);
+
write_seqlock_irq(&xtime_lock);
+
+ /*
+ * We may get blocked for a long time, in which case our time delta can
+ * overflow and our shadow time values go stale. Detect that case and
+ * retry with fresh values.
+ */
+ again:
+ nsec = (s64)tv->tv_nsec - (s64)get_nsec_offset(shadow);
+ if (unlikely(!time_values_up_to_date(cpu))) {
+ __get_time_values_from_xen();
+ goto again;
+ }
+
+ __normalize_time(&sec, &nsec);
+ set_normalized_timespec(&xentime, sec, nsec);
+
/*
* This is revolting. We need to set "xtime" correctly. However, the
* value in this location is the value at the most recent update of
* wall time. Discover what correction gettimeofday() would have
* made, and then undo it!
*/
- nsec -= cur_timer->get_offset() * NSEC_PER_USEC;
nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
+ nsec -= (shadow->system_timestamp - processed_system_time);
+
+ __normalize_time(&sec, &nsec);
wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
@@ -193,13 +464,29 @@ int do_settimeofday(struct timespec *tv)
time_status |= STA_UNSYNC;
time_maxerror = NTP_PHASE_LIMIT;
time_esterror = NTP_PHASE_LIMIT;
- write_sequnlock_irq(&xtime_lock);
+
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ if (xen_start_info.flags & SIF_INITDOMAIN) {
+ dom0_op_t op;
+ op.cmd = DOM0_SETTIME;
+ op.u.settime.secs = xentime.tv_sec;
+ op.u.settime.usecs = xentime.tv_nsec / NSEC_PER_USEC;
+ op.u.settime.system_time = shadow->system_timestamp;
+ write_sequnlock_irq(&xtime_lock);
+ HYPERVISOR_dom0_op(&op);
+ } else
+#endif
+ write_sequnlock_irq(&xtime_lock);
+
+ put_cpu();
+
clock_was_set();
return 0;
}
EXPORT_SYMBOL(do_settimeofday);
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
static int set_rtc_mmss(unsigned long nowtime)
{
int retval;
@@ -216,9 +503,12 @@ static int set_rtc_mmss(unsigned long no
return retval;
}
-
-
-int timer_ack;
+#else
+static int set_rtc_mmss(unsigned long nowtime)
+{
+ return 0;
+}
+#endif
/* monotonic_clock(): returns # of nanoseconds passed since time_init()
* Note: This function is required to return accurate
@@ -226,10 +516,31 @@ int timer_ack;
*/
unsigned long long monotonic_clock(void)
{
- return cur_timer->monotonic_clock();
+ int cpu = get_cpu();
+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+ s64 off;
+ unsigned long flags;
+
+ for ( ; ; ) {
+ off = get_nsec_offset(shadow);
+ if (time_values_up_to_date(cpu))
+ break;
+ write_seqlock_irqsave(&xtime_lock, flags);
+ __get_time_values_from_xen();
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+ }
+
+ put_cpu();
+
+ return shadow->system_timestamp + off;
}
EXPORT_SYMBOL(monotonic_clock);
+unsigned long long sched_clock(void)
+{
+ return monotonic_clock();
+}
+
#if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
unsigned long profile_pc(struct pt_regs *regs)
{
@@ -250,37 +561,47 @@ EXPORT_SYMBOL(profile_pc);
static inline void do_timer_interrupt(int irq, void *dev_id,
struct pt_regs *regs)
{
-#ifdef CONFIG_X86_IO_APIC
- if (timer_ack) {
- /*
- * Subtle, when I/O APICs are used we have to ack timer IRQ
- * manually to reset the IRR bit for do_slow_gettimeoffset().
- * This will also deassert NMI lines for the watchdog if run
- * on an 82489DX-based system.
- */
- spin_lock(&i8259A_lock);
- outb(0x0c, PIC_MASTER_OCW3);
- /* Ack the IRQ; AEOI will end it automatically. */
- inb(PIC_MASTER_POLL);
- spin_unlock(&i8259A_lock);
- }
-#endif
+ s64 delta, delta_cpu;
+ int cpu = smp_processor_id();
+ struct shadow_time_info *shadow = &per_cpu(shadow_time, cpu);
+
+ do {
+ __get_time_values_from_xen();
- do_timer_interrupt_hook(regs);
+ delta = delta_cpu =
+ shadow->system_timestamp + get_nsec_offset(shadow);
+ delta -= processed_system_time;
+ delta_cpu -= per_cpu(processed_system_time, cpu);
+ }
+ while (!time_values_up_to_date(cpu));
+ if (unlikely(delta < 0) || unlikely(delta_cpu < 0)) {
+ printk("Timer ISR/%d: Time went backwards: "
+ "delta=%lld cpu_delta=%lld shadow=%lld "
+ "off=%lld processed=%lld cpu_processed=%lld\n",
+ cpu, delta, delta_cpu, shadow->system_timestamp,
+ (s64)get_nsec_offset(shadow),
+ processed_system_time,
+ per_cpu(processed_system_time, cpu));
+ for (cpu = 0; cpu < num_online_cpus(); cpu++)
+ printk(" %d: %lld\n", cpu,
+ per_cpu(processed_system_time, cpu));
+ return;
+ }
- if (MCA_bus) {
- /* The PS/2 uses level-triggered interrupts. You can't
- turn them off, nor would you want to (any attempt to
- enable edge-triggered interrupts usually gets intercepted by a
- special hardware circuit). Hence we have to acknowledge
- the timer interrupt. Through some incredibly stupid
- design idea, the reset for IRQ 0 is done by setting the
- high bit of the PPI port B (0x61). Note that some PS/2s,
- notably the 55SX, work fine if this is removed. */
+ /* System-wide jiffy work. */
+ while (delta >= NS_PER_TICK) {
+ delta -= NS_PER_TICK;
+ processed_system_time += NS_PER_TICK;
+ do_timer(regs);
+ }
- irq = inb_p( 0x61 ); /* read the current state */
- outb_p( irq|0x80, 0x61 ); /* reset the IRQ */
+ /* Local CPU jiffy work. */
+ while (delta_cpu >= NS_PER_TICK) {
+ delta_cpu -= NS_PER_TICK;
+ per_cpu(processed_system_time, cpu) += NS_PER_TICK;
+ update_process_times(user_mode(regs));
+ profile_tick(CPU_PROFILING, regs);
}
}
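
Since Xen delivers a single virtual timer event that may cover several
missed ticks, the handler above folds the elapsed nanoseconds into whole
ticks, once system-wide and once per CPU. A standalone model of that
accounting (not part of the patch; numbers invented for illustration):

#include <stdio.h>

#define HZ		100
#define NS_PER_TICK	(1000000000ULL / HZ)

int main(void)
{
	unsigned long long processed = 0;	/* ns accounted so far */
	unsigned long long now = 35000000;	/* 35ms of system time */
	long long delta = now - processed;
	unsigned long jiffies = 0;

	/* Same structure as the "System-wide jiffy work" loop above. */
	while (delta >= (long long)NS_PER_TICK) {
		delta -= NS_PER_TICK;
		processed += NS_PER_TICK;
		jiffies++;		/* stands in for do_timer() */
	}
	printf("%lu ticks, %lld ns left over\n", jiffies, delta);
	/* prints: 3 ticks, 5000000 ns left over */
	return 0;
}
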
@@ -299,11 +620,7 @@ irqreturn_t timer_interrupt(int irq, voi
* locally disabled. -arca
*/
write_seqlock(&xtime_lock);
-
- cur_timer->mark_offset();
-
do_timer_interrupt(irq, NULL, regs);
-
write_sequnlock(&xtime_lock);
return IRQ_HANDLED;
}
@@ -452,6 +769,14 @@ static void __init hpet_time_init(void)
}
#endif
+/* Dynamically-mapped IRQ. */
+static DEFINE_PER_CPU(int, timer_irq);
+
+static struct irqaction irq_timer = {
+ timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer0",
+ NULL, NULL
+};
+
void __init time_init(void)
{
#ifdef CONFIG_HPET_TIMER
@@ -464,13 +789,141 @@ void __init time_init(void)
return;
}
#endif
- xtime.tv_sec = get_cmos_time();
- xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
+ __get_time_values_from_xen();
+ xtime.tv_sec = shadow_tv.tv_sec;
+ xtime.tv_nsec = shadow_tv.tv_usec * NSEC_PER_USEC;
set_normalized_timespec(&wall_to_monotonic,
-xtime.tv_sec, -xtime.tv_nsec);
+ processed_system_time = per_cpu(shadow_time, 0).system_timestamp;
+ per_cpu(processed_system_time, 0) = processed_system_time;
- cur_timer = select_timer();
- printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
+ init_cpu_khz();
- time_init_hook();
+#if defined(__x86_64__)
+ vxtime.mode = VXTIME_TSC;
+ vxtime.quot = (1000000L << 32) / vxtime_hz;
+ vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+ vxtime.hz = vxtime_hz;
+ sync_core();
+ rdtscll(vxtime.last_tsc);
+#endif
+
+ per_cpu(timer_irq, 0) = bind_virq_to_irq(VIRQ_TIMER);
+ (void)setup_irq(per_cpu(timer_irq, 0), &irq_timer);
+}
+
+/* Convert jiffies to system time. */
+static inline u64 jiffies_to_st(unsigned long j)
+{
+ unsigned long seq;
+ long delta;
+ u64 st;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ delta = j - jiffies;
+ /* NB. The next check can trigger in some wrap-around cases,
+ * but that's ok: we'll just end up with a shorter timeout. */
+ if (delta < 1)
+ delta = 1;
+ st = processed_system_time + (delta * NS_PER_TICK);
+ } while (read_seqretry(&xtime_lock, seq));
+
+ return st;
}
+
+/*
+ * stop_hz_timer / start_hz_timer - enter/exit 'tickless mode' on an idle cpu
+ * These functions are based on implementations from arch/s390/kernel/time.c
+ */
+void stop_hz_timer(void)
+{
+ unsigned int cpu = smp_processor_id();
+ unsigned long j;
+
+ /* s390 does this /before/ checking rcu_pending(). We do the same. */
+ cpu_set(cpu, nohz_cpu_mask);
+
+ /* Leave ourselves in 'tick mode' if rcu or softirq pending. */
+ if (rcu_pending(cpu) || local_softirq_pending()) {
+ cpu_clear(cpu, nohz_cpu_mask);
+ j = jiffies + 1;
+ } else {
+ j = next_timer_interrupt();
+ }
+
+ BUG_ON(HYPERVISOR_set_timer_op(jiffies_to_st(j)) != 0);
+}
+
+void start_hz_timer(void)
+{
+ cpu_clear(smp_processor_id(), nohz_cpu_mask);
+}
+
+void time_suspend(void)
+{
+ /* nothing */
+}
+
+/* No locking required. We are only CPU running, and interrupts are off. */
+void time_resume(void)
+{
+ init_cpu_khz();
+
+ /* Get timebases for new environment. */
+ __get_time_values_from_xen();
+
+ /* Reset our own concept of passage of system time. */
+ processed_system_time =
+ per_cpu(shadow_time, smp_processor_id()).system_timestamp;
+ per_cpu(processed_system_time, 0) = processed_system_time;
+}
+
+#ifdef CONFIG_SMP
+static char timer_name[NR_CPUS][15];
+void local_setup_timer(void)
+{
+ int seq, cpu = smp_processor_id();
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ per_cpu(processed_system_time, cpu) =
+ per_cpu(shadow_time, cpu).system_timestamp;
+ } while (read_seqretry(&xtime_lock, seq));
+
+ per_cpu(timer_irq, cpu) = bind_virq_to_irq(VIRQ_TIMER);
+ sprintf(timer_name[cpu], "timer%d", cpu);
+ BUG_ON(request_irq(per_cpu(timer_irq, cpu), timer_interrupt,
+ SA_INTERRUPT, timer_name[cpu], NULL));
+}
+#endif
+
+/*
+ * /proc/sys/xen: this really belongs in another file, but it can stay
+ * here for now.
+ */
+static ctl_table xen_subtable[] = {
+ {1, "independent_wallclock", &independent_wallclock,
+ sizeof(independent_wallclock), 0644, NULL, proc_dointvec},
+ {0}
+};
+static ctl_table xen_table[] = {
+ {123, "xen", NULL, 0, 0555, xen_subtable},
+ {0}
+};
+static int __init xen_sysctl_init(void)
+{
+ (void)register_sysctl_table(xen_table, 0);
+ return 0;
+}
+__initcall(xen_sysctl_init);
+
+/*
+ * Local variables:
+ * c-file-style: "linux"
+ * indent-tabs-mode: t
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * tab-width: 8
+ * End:
+ */
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/timers/Makefile
linux-2.6-xen-sparse/arch/i386/kernel/timers/Makefile
--- pristine-linux-2.6.12/arch/i386/kernel/timers/Makefile 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/timers/Makefile 2005-07-28
13:17:07.000000000 -0700
@@ -2,8 +2,16 @@
# Makefile for x86 timers
#
-obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o
+XENARCH := $(subst ",,$(CONFIG_XENARCH))
-obj-$(CONFIG_X86_CYCLONE_TIMER) += timer_cyclone.o
-obj-$(CONFIG_HPET_TIMER) += timer_hpet.o
-obj-$(CONFIG_X86_PM_TIMER) += timer_pm.o
+obj-y := timer_tsc.o
+c-obj-y :=
+
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+ @ln -fsn $(srctree)/arch/i386/kernel/timers/$(notdir $@) $@
+
+obj-y += $(c-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/timers/timer_tsc.c
linux-2.6-xen-sparse/arch/i386/kernel/timers/timer_tsc.c
--- pristine-linux-2.6.12/arch/i386/kernel/timers/timer_tsc.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/timers/timer_tsc.c 2005-07-28
13:17:07.000000000 -0700
@@ -1,10 +1,6 @@
/*
* This code largely moved from arch/i386/kernel/time.c.
* See comments there for proper credits.
- *
- * 2004-06-25 Jesper Juhl
- * moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4
- * failing to inline.
*/
#include <linux/spinlock.h>
@@ -38,12 +34,9 @@ int tsc_disable __initdata = 0;
extern spinlock_t i8253_lock;
static int use_tsc;
-/* Number of usecs that the last interrupt was delayed */
-static int delay_at_last_interrupt;
-static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
-static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
static unsigned long long monotonic_base;
+static u32 monotonic_offset;
static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
/* convert from cycles(64bits) => nanoseconds (64bits)
@@ -74,8 +67,6 @@ static inline unsigned long long cycles_
return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
}
-static int count2; /* counter for mark_offset_tsc() */
-
/* Cached *multiplier* to convert TSC counts to microseconds.
* (see the equation below).
* Equal to 2^32 * (1 / (clocks per usec) ).
@@ -83,6 +74,9 @@ static int count2; /* counter for mark_o
*/
static unsigned long fast_gettimeoffset_quotient;
+extern u32 shadow_tsc_stamp;
+extern u64 shadow_system_time;
+
static unsigned long get_offset_tsc(void)
{
register unsigned long eax, edx;
@@ -92,7 +86,7 @@ static unsigned long get_offset_tsc(void
rdtsc(eax,edx);
/* .. relative to previous jiffy (32 bits is enough) */
- eax -= last_tsc_low; /* tsc_low delta */
+ eax -= shadow_tsc_stamp;
/*
* Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
@@ -109,7 +103,7 @@ static unsigned long get_offset_tsc(void
"0" (eax));
/* our adjusted time offset in microseconds */
- return delay_at_last_interrupt + edx;
+ return edx;
}
static unsigned long long monotonic_clock_tsc(void)
@@ -120,7 +114,7 @@ static unsigned long long monotonic_cloc
/* atomically read monotonic base & last_offset */
do {
seq = read_seqbegin(&monotonic_lock);
- last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
+ last_offset = monotonic_offset;
base = monotonic_base;
} while (read_seqretry(&monotonic_lock, seq));
@@ -155,6 +149,17 @@ unsigned long long sched_clock(void)
return cycles_2_ns(this_offset);
}
+
+static void mark_offset_tsc(void)
+{
+
+ /* update the monotonic base value */
+ write_seqlock(&monotonic_lock);
+ monotonic_base = shadow_system_time;
+ monotonic_offset = shadow_tsc_stamp;
+ write_sequnlock(&monotonic_lock);
+}
+
static void delay_tsc(unsigned long loops)
{
unsigned long bclock, now;
@@ -320,245 +325,39 @@ core_initcall(cpufreq_tsc);
static inline void cpufreq_delayed_get(void) { return; }
#endif
-int recalibrate_cpu_khz(void)
-{
-#ifndef CONFIG_SMP
- unsigned long cpu_khz_old = cpu_khz;
-
- if (cpu_has_tsc) {
- init_cpu_khz();
- cpu_data[0].loops_per_jiffy =
- cpufreq_scale(cpu_data[0].loops_per_jiffy,
- cpu_khz_old,
- cpu_khz);
- return 0;
- } else
- return -ENODEV;
-#else
- return -ENODEV;
-#endif
-}
-EXPORT_SYMBOL(recalibrate_cpu_khz);
-static void mark_offset_tsc(void)
+static int init_tsc(char* override)
{
- unsigned long lost,delay;
- unsigned long delta = last_tsc_low;
- int count;
- int countmp;
- static int count1 = 0;
- unsigned long long this_offset, last_offset;
- static int lost_count = 0;
-
- write_seqlock(&monotonic_lock);
- last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- /*
- * It is important that these two operations happen almost at
- * the same time. We do the RDTSC stuff first, since it's
- * faster. To avoid any inconsistencies, we need interrupts
- * disabled locally.
- */
-
- /*
- * Interrupts are just disabled locally since the timer irq
- * has the SA_INTERRUPT flag set. -arca
- */
-
- /* read Pentium cycle counter */
-
- rdtsc(last_tsc_low, last_tsc_high);
-
- spin_lock(&i8253_lock);
- outb_p(0x00, PIT_MODE); /* latch the count ASAP */
-
- count = inb_p(PIT_CH0); /* read the latched count */
- count |= inb(PIT_CH0) << 8;
-
- /*
- * VIA686a test code... reset the latch if count > max + 1
- * from timer_pit.c - cjb
- */
- if (count > LATCH) {
- outb_p(0x34, PIT_MODE);
- outb_p(LATCH & 0xff, PIT_CH0);
- outb(LATCH >> 8, PIT_CH0);
- count = LATCH - 1;
- }
-
- spin_unlock(&i8253_lock);
+ u64 __cpu_khz;
- if (pit_latch_buggy) {
- /* get center value of last 3 time lutch */
- if ((count2 >= count && count >= count1)
- || (count1 >= count && count >= count2)) {
- count2 = count1; count1 = count;
- } else if ((count1 >= count2 && count2 >= count)
- || (count >= count2 && count2 >= count1)) {
- countmp = count;count = count2;
- count2 = count1;count1 = countmp;
- } else {
- count2 = count1; count1 = count; count = count1;
- }
- }
+ __cpu_khz = HYPERVISOR_shared_info->cpu_freq;
+ do_div(__cpu_khz, 1000);
+ cpu_khz = (u32)__cpu_khz;
+ printk(KERN_INFO "Xen reported: %lu.%03lu MHz processor.\n",
+ cpu_khz / 1000, cpu_khz % 1000);
- /* lost tick compensation */
- delta = last_tsc_low - delta;
+ /* (10^6 * 2^32) / cpu_hz = (10^3 * 2^32) / cpu_khz =
+ (2^32 * 1 / (clocks/us)) */
{
- register unsigned long eax, edx;
- eax = delta;
- __asm__("mull %2"
- :"=a" (eax), "=d" (edx)
- :"rm" (fast_gettimeoffset_quotient),
- "0" (eax));
- delta = edx;
- }
- delta += delay_at_last_interrupt;
- lost = delta/(1000000/HZ);
- delay = delta%(1000000/HZ);
- if (lost >= 2) {
- jiffies_64 += lost-1;
-
- /* sanity check to ensure we're not always losing ticks */
- if (lost_count++ > 100) {
- printk(KERN_WARNING "Losing too many ticks!\n");
- printk(KERN_WARNING "TSC cannot be used as a
timesource. \n");
- printk(KERN_WARNING "Possible reasons for this are:\n");
- printk(KERN_WARNING " You're running with
Speedstep,\n");
- printk(KERN_WARNING " You don't have DMA enabled for
your hard disk (see hdparm),\n");
- printk(KERN_WARNING " Incorrect TSC synchronization on
an SMP system (see dmesg).\n");
- printk(KERN_WARNING "Falling back to a sane timesource
now.\n");
-
- clock_fallback();
- }
- /* ... but give the TSC a fair chance */
- if (lost_count > 25)
- cpufreq_delayed_get();
- } else
- lost_count = 0;
- /* update the monotonic base value */
- this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
- monotonic_base += cycles_2_ns(this_offset - last_offset);
- write_sequnlock(&monotonic_lock);
-
- /* calculate delay_at_last_interrupt */
- count = ((LATCH-1) - count) * TICK_SIZE;
- delay_at_last_interrupt = (count + LATCH/2) / LATCH;
-
- /* catch corner case where tick rollover occured
- * between tsc and pit reads (as noted when
- * usec delta is > 90% # of usecs/tick)
- */
- if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
- jiffies_64++;
-}
-
-static int __init init_tsc(char* override)
-{
-
- /* check clock override */
- if (override[0] && strncmp(override,"tsc",3)) {
-#ifdef CONFIG_HPET_TIMER
- if (is_hpet_enabled()) {
- printk(KERN_ERR "Warning: clock= override failed.
Defaulting to tsc\n");
- } else
-#endif
- {
- return -ENODEV;
- }
+ unsigned long eax=0, edx=1000;
+ __asm__("divl %2"
+ :"=a" (fast_gettimeoffset_quotient), "=d" (edx)
+ :"r" (cpu_khz),
+ "0" (eax), "1" (edx));
}
- /*
- * If we have APM enabled or the CPU clock speed is variable
- * (CPU stops clock on HLT or slows clock to save power)
- * then the TSC timestamps may diverge by up to 1 jiffy from
- * 'real time' but nothing will break.
- * The most frequent case is that the CPU is "woken" from a halt
- * state by the timer interrupt itself, so we get 0 error. In the
- * rare cases where a driver would "wake" the CPU and request a
- * timestamp, the maximum error is < 1 jiffy. But timestamps are
- * still perfectly ordered.
- * Note that the TSC counter will be reset if APM suspends
- * to disk; this won't break the kernel, though, 'cuz we're
- * smart. See arch/i386/kernel/apm.c.
- */
- /*
- * Firstly we have to do a CPU check for chips with
- * a potentially buggy TSC. At this point we haven't run
- * the ident/bugs checks so we must run this hook as it
- * may turn off the TSC flag.
- *
- * NOTE: this doesn't yet handle SMP 486 machines where only
- * some CPU's have a TSC. Thats never worked and nobody has
- * moaned if you have the only one in the world - you fix it!
- */
-
- count2 = LATCH; /* initialize counter for mark_offset_tsc() */
+ set_cyc2ns_scale(cpu_khz/1000);
- if (cpu_has_tsc) {
- unsigned long tsc_quotient;
-#ifdef CONFIG_HPET_TIMER
- if (is_hpet_enabled() && hpet_use_timer) {
- unsigned long result, remain;
- printk("Using TSC for gettimeofday\n");
- tsc_quotient = calibrate_tsc_hpet(NULL);
- timer_tsc.mark_offset = &mark_offset_tsc_hpet;
- /*
- * Math to calculate hpet to usec multiplier
- * Look for the comments at get_offset_tsc_hpet()
- */
- ASM_DIV64_REG(result, remain, hpet_tick,
- 0, KERNEL_TICK_USEC);
- if (remain > (hpet_tick >> 1))
- result++; /* rounding the result */
+ use_tsc = 1;
- hpet_usec_quotient = result;
- } else
-#endif
- {
- tsc_quotient = calibrate_tsc();
- }
-
- if (tsc_quotient) {
- fast_gettimeoffset_quotient = tsc_quotient;
- use_tsc = 1;
- /*
- * We could be more selective here I suspect
- * and just enable this for the next intel chips ?
- */
- /* report CPU clock rate in Hz.
- * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
- * clock/second. Our precision is about 100 ppm.
- */
- { unsigned long eax=0, edx=1000;
- __asm__("divl %2"
- :"=a" (cpu_khz), "=d" (edx)
- :"r" (tsc_quotient),
- "0" (eax), "1" (edx));
- printk("Detected %lu.%03lu MHz processor.\n",
cpu_khz / 1000, cpu_khz % 1000);
- }
- set_cyc2ns_scale(cpu_khz/1000);
- return 0;
- }
- }
- return -ENODEV;
+ return 0;
}
-#ifndef CONFIG_X86_TSC
-/* disable flag for tsc. Takes effect by clearing the TSC cpu flag
- * in cpu/common.c */
static int __init tsc_setup(char *str)
{
- tsc_disable = 1;
+ printk(KERN_WARNING "notsc: cannot disable TSC in Xen/Linux.\n");
return 1;
}
-#else
-static int __init tsc_setup(char *str)
-{
- printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
- "cannot disable TSC.\n");
- return 1;
-}
-#endif
__setup("notsc", tsc_setup);
@@ -566,7 +365,7 @@ __setup("notsc", tsc_setup);
/************************************************************/
/* tsc timer_opts struct */
-static struct timer_opts timer_tsc = {
+struct timer_opts timer_tsc = {
.name = "tsc",
.mark_offset = mark_offset_tsc,
.get_offset = get_offset_tsc,
@@ -574,7 +373,7 @@ static struct timer_opts timer_tsc = {
.delay = delay_tsc,
};
-struct init_timer_opts __initdata timer_tsc_init = {
+struct init_timer_opts timer_tsc_init = {
.init = init_tsc,
.opts = &timer_tsc,
};
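
The divl above computes fast_gettimeoffset_quotient = 2^32 / (TSC clocks
per usec), so a later (tsc_delta * quotient) >> 32 yields microseconds.
A standalone sketch, not part of the patch; the 1.024GHz clock is
invented so the fixed-point math comes out exact:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t cpu_khz = 1024000;	/* hypothetical 1.024GHz CPU */

	/* Same division the patch performs with "divl":
	 * (10^3 * 2^32) / cpu_khz = 2^32 / (clocks per usec). */
	uint32_t quotient = (uint32_t)((1000ULL << 32) / cpu_khz);

	uint32_t tsc_delta = 10240;	/* 10240 cycles = 10 usec here */
	uint32_t usec = (uint32_t)(((uint64_t)tsc_delta * quotient) >> 32);

	printf("quotient=%u usec=%u\n", (unsigned)quotient, (unsigned)usec);
	/* prints: quotient=4194304 usec=10 */
	return 0;
}
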
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/traps.c
linux-2.6-xen-sparse/arch/i386/kernel/traps.c
--- pristine-linux-2.6.12/arch/i386/kernel/traps.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/traps.c 2005-07-28
13:17:07.000000000 -0700
@@ -58,9 +58,6 @@
asmlinkage int system_call(void);
-struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
- { 0, 0 }, { 0, 0 } };
-
/* Do we ignore FPU interrupts ? */
char ignore_fpu_irq = 0;
@@ -88,7 +85,7 @@ asmlinkage void page_fault(void);
asmlinkage void coprocessor_error(void);
asmlinkage void simd_coprocessor_error(void);
asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
+asmlinkage void fixup_4gb_segment(void);
asmlinkage void machine_check(void);
static int kstack_depth_to_print = 24;
@@ -209,7 +206,7 @@ void show_registers(struct pt_regs *regs
esp = (unsigned long) (&regs->esp);
ss = __KERNEL_DS;
- if (regs->xcs & 3) {
+ if (regs->xcs & 2) {
in_kernel = 0;
esp = regs->esp;
ss = regs->xss & 0xffff;
@@ -265,7 +262,7 @@ static void handle_BUG(struct pt_regs *r
char c;
unsigned long eip;
- if (regs->xcs & 3)
+ if (regs->xcs & 2)
goto no_bug; /* Not in kernel */
eip = regs->eip;
@@ -353,7 +350,7 @@ void die(const char * str, struct pt_reg
static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
{
- if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs))
+ if (!(regs->eflags & VM_MASK) && !(2 & regs->xcs))
die(str, regs, err);
}
@@ -366,7 +363,7 @@ static void do_trap(int trapnr, int sign
goto trap_signal;
}
- if (!(regs->xcs & 3))
+ if (!(regs->xcs & 2))
goto kernel_trap;
trap_signal: {
@@ -446,49 +443,37 @@ DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
DO_ERROR_INFO( 6, SIGILL, "invalid operand", invalid_op, ILL_ILLOPN, regs->eip)
+DO_VM86_ERROR( 7, SIGSEGV, "device not available", device_not_available)
DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
+#ifdef CONFIG_X86_MCE
+DO_ERROR(18, SIGBUS, "machine check", machine_check)
+#endif
fastcall void do_general_protection(struct pt_regs * regs, long error_code)
{
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
- struct thread_struct *thread = &current->thread;
-
/*
- * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
- * invalid offset set (the LAZY one) and the faulting thread has
- * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
- * and we set the offset field correctly. Then we let the CPU to
- * restart the faulting instruction.
- */
- if (tss->io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
- thread->io_bitmap_ptr) {
- memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
- thread->io_bitmap_max);
- /*
- * If the previously set map was extending to higher ports
- * than the current one, pad extra space with 0xff (no access).
- */
- if (thread->io_bitmap_max < tss->io_bitmap_max)
- memset((char *) tss->io_bitmap +
- thread->io_bitmap_max, 0xff,
- tss->io_bitmap_max - thread->io_bitmap_max);
- tss->io_bitmap_max = thread->io_bitmap_max;
- tss->io_bitmap_base = IO_BITMAP_OFFSET;
- put_cpu();
- return;
+ * If we trapped on an LDT access then ensure that the default_ldt is
+ * loaded, if nothing else. We load default_ldt lazily because LDT
+ * switching costs time and many applications don't need it.
+ */
+ if (unlikely((error_code & 6) == 4)) {
+ unsigned long ldt;
+ __asm__ __volatile__ ("sldt %0" : "=r" (ldt));
+ if (ldt == 0) {
+ xen_set_ldt((unsigned long)&default_ldt[0], 5);
+ return;
+ }
}
- put_cpu();
if (regs->eflags & VM_MASK)
goto gp_in_vm86;
- if (!(regs->xcs & 3))
+ if (!(regs->xcs & 2))
goto gp_in_kernel;
current->thread.error_code = error_code;
@@ -624,6 +609,14 @@ fastcall void do_nmi(struct pt_regs * re
nmi_enter();
cpu = smp_processor_id();
+
+#ifdef CONFIG_HOTPLUG_CPU
+ if (!cpu_online(cpu)) {
+ nmi_exit();
+ return;
+ }
+#endif
+
++nmi_count(cpu);
if (!nmi_callback(regs, cpu))
@@ -682,14 +675,16 @@ fastcall void do_debug(struct pt_regs *
unsigned int condition;
struct task_struct *tsk = current;
- __asm__ __volatile__("movl %%db6,%0" : "=r" (condition));
+ condition = HYPERVISOR_get_debugreg(6);
if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
SIGTRAP) == NOTIFY_STOP)
return;
+#if 0
/* It's safe to allow irq's after DR6 has been saved */
if (regs->eflags & X86_EFLAGS_IF)
local_irq_enable();
+#endif
/* Mask out spurious debug traps due to lazy DR7 setting */
if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -713,7 +708,7 @@ fastcall void do_debug(struct pt_regs *
* check for kernel mode by just checking the CPL
* of CS.
*/
- if ((regs->xcs & 3) == 0)
+ if ((regs->xcs & 2) == 0)
goto clear_TF_reenable;
}
@@ -724,9 +719,7 @@ fastcall void do_debug(struct pt_regs *
* the signal is delivered.
*/
clear_dr7:
- __asm__("movl %0,%%db7"
- : /* no output */
- : "r" (0));
+ HYPERVISOR_set_debugreg(7, 0);
return;
debug_vm86:
@@ -878,15 +871,6 @@ fastcall void do_simd_coprocessor_error(
}
}
-fastcall void do_spurious_interrupt_bug(struct pt_regs * regs,
- long error_code)
-{
-#if 0
- /* No need to warn about this any longer. */
- printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
-#endif
-}
-
fastcall void setup_x86_bogus_stack(unsigned char * stk)
{
unsigned long *switch16_ptr, *switch32_ptr;
@@ -947,7 +931,7 @@ asmlinkage void math_state_restore(struc
struct thread_info *thread = current_thread_info();
struct task_struct *tsk = thread->task;
- clts(); /* Allow maths ops (or we recurse) */
+ /* NB. 'clts' is done for us by Xen during virtual trap. */
if (!tsk_used_math(tsk))
init_fpu(tsk);
restore_fpu(tsk);
@@ -980,100 +964,58 @@ void __init trap_init_f00f_bug(void)
}
#endif
-#define _set_gate(gate_addr,type,dpl,addr,seg) \
-do { \
- int __d0, __d1; \
- __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
- "movw %4,%%dx\n\t" \
- "movl %%eax,%0\n\t" \
- "movl %%edx,%1" \
- :"=m" (*((long *) (gate_addr))), \
- "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
- :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
- "3" ((char *) (addr)),"2" ((seg) << 16)); \
-} while (0)
-
-
-/*
- * This needs to use 'idt_table' rather than 'idt', and
- * thus use the _nonmapped_ version of the IDT, as the
- * Pentium F0 0F bugfix can have resulted in the mapped
- * IDT being write-protected.
- */
-void set_intr_gate(unsigned int n, void *addr)
-{
- _set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
-}
-
-/*
- * This routine sets up an interrupt gate at directory privilege level 3.
- */
-static inline void set_system_intr_gate(unsigned int n, void *addr)
-{
- _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
-}
-
-static void __init set_trap_gate(unsigned int n, void *addr)
-{
- _set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
-}
-
-static void __init set_system_gate(unsigned int n, void *addr)
-{
- _set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
-}
-
-static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
-{
- _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
-}
+/* NB. All these are "trap gates" (i.e. events_mask isn't cleared). */
+static trap_info_t trap_table[] = {
+ { 0, 0, __KERNEL_CS, (unsigned long)divide_error },
+ { 1, 0, __KERNEL_CS, (unsigned long)debug },
+ { 3, 3, __KERNEL_CS, (unsigned long)int3 },
+ { 4, 3, __KERNEL_CS, (unsigned long)overflow },
+ { 5, 3, __KERNEL_CS, (unsigned long)bounds },
+ { 6, 0, __KERNEL_CS, (unsigned long)invalid_op },
+ { 7, 0, __KERNEL_CS, (unsigned long)device_not_available },
+ { 9, 0, __KERNEL_CS, (unsigned long)coprocessor_segment_overrun },
+ { 10, 0, __KERNEL_CS, (unsigned long)invalid_TSS },
+ { 11, 0, __KERNEL_CS, (unsigned long)segment_not_present },
+ { 12, 0, __KERNEL_CS, (unsigned long)stack_segment },
+ { 13, 0, __KERNEL_CS, (unsigned long)general_protection },
+ { 14, 0, __KERNEL_CS, (unsigned long)page_fault },
+ { 15, 0, __KERNEL_CS, (unsigned long)fixup_4gb_segment },
+ { 16, 0, __KERNEL_CS, (unsigned long)coprocessor_error },
+ { 17, 0, __KERNEL_CS, (unsigned long)alignment_check },
+#ifdef CONFIG_X86_MCE
+ { 18, 0, __KERNEL_CS, (unsigned long)machine_check },
+#endif
+ { 19, 0, __KERNEL_CS, (unsigned long)simd_coprocessor_error },
+ { SYSCALL_VECTOR, 3, __KERNEL_CS, (unsigned long)system_call },
+ { 0, 0, 0, 0 }
+};
void __init trap_init(void)
{
-#ifdef CONFIG_EISA
- void __iomem *p = ioremap(0x0FFFD9, 4);
- if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
- EISA_bus = 1;
- }
- iounmap(p);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
- init_apic_mappings();
-#endif
-
- set_trap_gate(0,&divide_error);
- set_intr_gate(1,&debug);
- set_intr_gate(2,&nmi);
- set_system_intr_gate(3, &int3); /* int3-5 can be called from all */
- set_system_gate(4,&overflow);
- set_system_gate(5,&bounds);
- set_trap_gate(6,&invalid_op);
- set_trap_gate(7,&device_not_available);
- set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
- set_trap_gate(9,&coprocessor_segment_overrun);
- set_trap_gate(10,&invalid_TSS);
- set_trap_gate(11,&segment_not_present);
- set_trap_gate(12,&stack_segment);
- set_trap_gate(13,&general_protection);
- set_intr_gate(14,&page_fault);
- set_trap_gate(15,&spurious_interrupt_bug);
- set_trap_gate(16,&coprocessor_error);
- set_trap_gate(17,&alignment_check);
-#ifdef CONFIG_X86_MCE
- set_trap_gate(18,&machine_check);
-#endif
- set_trap_gate(19,&simd_coprocessor_error);
+ HYPERVISOR_set_trap_table(trap_table);
- set_system_gate(SYSCALL_VECTOR,&system_call);
+ /*
+ * default LDT is a single-entry callgate to lcall7 for iBCS
+ * and a callgate to lcall27 for Solaris/x86 binaries
+ */
+ make_lowmem_page_readonly(&default_ldt[0]);
/*
* Should be a barrier for any external CPU state.
*/
cpu_init();
+}
- trap_init_hook();
+void smp_trap_init(trap_info_t *trap_ctxt)
+{
+ trap_info_t *t = trap_table;
+
+ for (t = trap_table; t->address; t++) {
+ trap_ctxt[t->vector].flags = t->flags;
+ trap_ctxt[t->vector].cs = t->cs;
+ trap_ctxt[t->vector].address = t->address;
+ }
}
static int __init kstack_setup(char *s)
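
The repeated (regs->xcs & 3) to (regs->xcs & 2) changes in this file
deserve a note: a Xen guest kernel runs in ring 1, so a nonzero CPL no
longer implies user space. Testing bit 1 of the selector groups rings
0-1 as kernel and rings 2-3 as user. A sketch (hypothetical helper name,
not part of the patch):

#include <assert.h>

/* RPL is the low two bits of a segment selector. */
static int xen_user_mode(unsigned cs)
{
	return (cs & 2) != 0;	/* rings 2 and 3 => user */
}

int main(void)
{
	assert(!xen_user_mode(0));	/* ring 0: kernel on bare metal */
	assert(!xen_user_mode(1));	/* ring 1: Xen guest kernel */
	assert(xen_user_mode(3));	/* ring 3: user space */
	return 0;
}
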
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/kernel/vsyscall.S
linux-2.6-xen-sparse/arch/i386/kernel/vsyscall.S
--- pristine-linux-2.6.12/arch/i386/kernel/vsyscall.S 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/kernel/vsyscall.S 2005-07-28
13:17:07.000000000 -0700
@@ -4,12 +4,12 @@ __INITDATA
.globl vsyscall_int80_start, vsyscall_int80_end
vsyscall_int80_start:
- .incbin "arch/i386/kernel/vsyscall-int80.so"
+ .incbin "arch/xen/i386/kernel/vsyscall-int80.so"
vsyscall_int80_end:
.globl vsyscall_sysenter_start, vsyscall_sysenter_end
vsyscall_sysenter_start:
- .incbin "arch/i386/kernel/vsyscall-sysenter.so"
+ .incbin "arch/xen/i386/kernel/vsyscall-sysenter.so"
vsyscall_sysenter_end:
__FINIT
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/mach-default/Makefile
linux-2.6-xen-sparse/arch/i386/mach-default/Makefile
--- pristine-linux-2.6.12/arch/i386/mach-default/Makefile 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mach-default/Makefile 2005-07-28
13:17:07.000000000 -0700
@@ -2,4 +2,11 @@
# Makefile for the linux kernel.
#
-obj-y := setup.o topology.o
+c-obj-y := topology.o
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y)):
+ @ln -fsn $(srctree)/arch/i386/mach-default/$(notdir $@) $@
+
+obj-y += $(c-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/Makefile linux-2.6-xen-sparse/arch/i386/Makefile
--- pristine-linux-2.6.12/arch/i386/Makefile 2005-06-17 12:48:29.000000000
-0700
+++ linux-2.6-xen-sparse/arch/i386/Makefile 2005-07-28 13:17:07.000000000
-0700
@@ -17,15 +17,19 @@
# 20050320 Kianusch Sayah Karadji <kianusch@xxxxxxxxxxx>
# Added support for GEODE CPU
+XENARCH := $(subst ",,$(CONFIG_XENARCH))
+
LDFLAGS := -m elf_i386
-OBJCOPYFLAGS := -O binary -R .note -R .comment -S
LDFLAGS_vmlinux :=
-CHECKFLAGS += -D__i386__
+CHECK := $(CHECK) -D__i386__=1
+
+CFLAGS += -m32
+AFLAGS += -m32
CFLAGS += -pipe -msoft-float
# prevent gcc from keeping the stack 16 byte aligned
-CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2)
+CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2,)
align := $(cc-option-align)
cflags-$(CONFIG_M386) += -march=i386
@@ -59,116 +63,46 @@ cflags-$(CONFIG_MGEODEGX1) += $(call cc
# -mregparm=3 works ok on gcc-3.0 and later
#
-GCC_VERSION := $(call cc-version)
+GCC_VERSION := $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
cflags-$(CONFIG_REGPARM) += $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;)
# Disable unit-at-a-time mode, it makes gcc use a lot more stack
# due to the lack of sharing of stacklots.
-CFLAGS += $(call cc-option,-fno-unit-at-a-time)
+CFLAGS += $(call cc-option,-fno-unit-at-a-time,)
CFLAGS += $(cflags-y)
-# Default subarch .c files
-mcore-y := mach-default
-
-# Voyager subarch support
-mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-i386/mach-voyager
-mcore-$(CONFIG_X86_VOYAGER) := mach-voyager
-
-# VISWS subarch support
-mflags-$(CONFIG_X86_VISWS) := -Iinclude/asm-i386/mach-visws
-mcore-$(CONFIG_X86_VISWS) := mach-visws
-
-# NUMAQ subarch support
-mflags-$(CONFIG_X86_NUMAQ) := -Iinclude/asm-i386/mach-numaq
-mcore-$(CONFIG_X86_NUMAQ) := mach-default
-
-# BIGSMP subarch support
-mflags-$(CONFIG_X86_BIGSMP) := -Iinclude/asm-i386/mach-bigsmp
-mcore-$(CONFIG_X86_BIGSMP) := mach-default
-
-#Summit subarch support
-mflags-$(CONFIG_X86_SUMMIT) := -Iinclude/asm-i386/mach-summit
-mcore-$(CONFIG_X86_SUMMIT) := mach-default
-
-# generic subarchitecture
-mflags-$(CONFIG_X86_GENERICARCH) := -Iinclude/asm-i386/mach-generic
-mcore-$(CONFIG_X86_GENERICARCH) := mach-default
-core-$(CONFIG_X86_GENERICARCH) += arch/i386/mach-generic/
-
-# ES7000 subarch support
-mflags-$(CONFIG_X86_ES7000) := -Iinclude/asm-i386/mach-es7000
-mcore-$(CONFIG_X86_ES7000) := mach-default
-core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/
-
-# default subarch .h files
-mflags-y += -Iinclude/asm-i386/mach-default
-
-head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
+head-y := arch/xen/i386/kernel/head.o arch/xen/i386/kernel/init_task.o
libs-y += arch/i386/lib/
-core-y += arch/i386/kernel/ \
- arch/i386/mm/ \
- arch/i386/$(mcore-y)/ \
+core-y += arch/xen/i386/kernel/ \
+ arch/xen/i386/mm/ \
+ arch/xen/i386/mach-default/ \
arch/i386/crypto/
+# \
+# arch/xen/$(mcore-y)/
drivers-$(CONFIG_MATH_EMULATION) += arch/i386/math-emu/
-drivers-$(CONFIG_PCI) += arch/i386/pci/
+drivers-$(CONFIG_PCI) += arch/xen/i386/pci/
# must be linked after kernel/
drivers-$(CONFIG_OPROFILE) += arch/i386/oprofile/
drivers-$(CONFIG_PM) += arch/i386/power/
-CFLAGS += $(mflags-y)
-AFLAGS += $(mflags-y)
-
-boot := arch/i386/boot
-
-.PHONY: zImage bzImage compressed zlilo bzlilo \
- zdisk bzdisk fdimage fdimage144 fdimage288 install kernel_install
-
-all: bzImage
-
-# KBUILD_IMAGE specify target image being built
- KBUILD_IMAGE := $(boot)/bzImage
-zImage zlilo zdisk: KBUILD_IMAGE := arch/i386/boot/zImage
+# for clean
+obj- += kernel/ mm/ pci/
+#obj- += ../../i386/lib/ ../../i386/mm/
+#../../i386/$(mcore-y)/
+#obj- += ../../i386/pci/ ../../i386/oprofile/ ../../i386/power/
+
+xenflags-y += -Iinclude/asm-xen/asm-i386/mach-xen \
+ -Iinclude/asm-i386/mach-default
+CFLAGS += $(xenflags-y)
+AFLAGS += $(xenflags-y)
-zImage bzImage: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
+prepare: include/asm-$(XENARCH)/asm_offsets.h
+CLEAN_FILES += include/asm-$(XENARCH)/asm_offsets.h
-compressed: zImage
+arch/$(XENARCH)/kernel/asm-offsets.s: include/asm include/.asm-ignore \
+ include/linux/version.h include/config/MARKER
-zlilo bzlilo: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zlilo
-
-zdisk bzdisk: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) zdisk
-
-fdimage fdimage144 fdimage288: vmlinux
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) $@
-
-install: vmlinux
-install kernel_install:
- $(Q)$(MAKE) $(build)=$(boot) BOOTIMAGE=$(KBUILD_IMAGE) install
-
-prepare: include/asm-$(ARCH)/asm_offsets.h
-CLEAN_FILES += include/asm-$(ARCH)/asm_offsets.h
-
-arch/$(ARCH)/kernel/asm-offsets.s: include/asm include/linux/version.h \
- include/config/MARKER
-
-include/asm-$(ARCH)/asm_offsets.h: arch/$(ARCH)/kernel/asm-offsets.s
+include/asm-$(XENARCH)/asm_offsets.h: arch/$(XENARCH)/kernel/asm-offsets.s
$(call filechk,gen-asm-offsets)
-
-archclean:
- $(Q)$(MAKE) $(clean)=arch/i386/boot
-
-define archhelp
- echo '* bzImage - Compressed kernel image (arch/$(ARCH)/boot/bzImage)'
- echo ' install - Install kernel using'
- echo ' (your) ~/bin/installkernel or'
- echo ' (distribution) /sbin/installkernel or'
- echo ' install to $$(INSTALL_PATH) and run lilo'
- echo ' bzdisk - Create a boot floppy in /dev/fd0'
- echo ' fdimage - Create a boot floppy image'
-endef
-
-CLEAN_FILES += arch/$(ARCH)/boot/fdimage arch/$(ARCH)/boot/mtools.conf
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/mm/fault.c
linux-2.6-xen-sparse/arch/i386/mm/fault.c
--- pristine-linux-2.6.12/arch/i386/mm/fault.c 2005-06-17 12:48:29.000000000
-0700
+++ linux-2.6-xen-sparse/arch/i386/mm/fault.c 2005-07-28 13:17:07.000000000
-0700
@@ -21,6 +21,7 @@
#include <linux/vt_kern.h> /* For unblank_screen() */
#include <linux/highmem.h>
#include <linux/module.h>
+#include <linux/percpu.h>
#include <asm/system.h>
#include <asm/uaccess.h>
@@ -29,6 +30,8 @@
extern void die(const char *,struct pt_regs *,long);
+DEFINE_PER_CPU(pgd_t *, cur_pgd);
+
/*
* Unlock any spinlocks which will prevent us from getting the
* message out
@@ -77,7 +80,7 @@ static inline unsigned long get_segment_
u32 seg_ar, seg_limit, base, *desc;
/* The standard kernel/user address space limit. */
- *eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg;
+ *eip_limit = (seg & 2) ? USER_DS.seg : KERNEL_DS.seg;
/* Unlikely, but must come before segment checks. */
if (unlikely((regs->eflags & VM_MASK) != 0))
@@ -107,7 +110,7 @@ static inline unsigned long get_segment_
desc = (void *)desc + (seg & ~7);
} else {
/* Must disable preemption while reading the GDT. */
- desc = (u32 *)&per_cpu(cpu_gdt_table, get_cpu());
+ desc = (u32 *)get_cpu_gdt_table(get_cpu());
desc = (void *)desc + (seg & ~7);
}
@@ -211,25 +214,30 @@ fastcall void do_invalid_op(struct pt_re
* bit 1 == 0 means read, 1 means write
* bit 2 == 0 means kernel, 1 means user-mode
*/
-fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code)
+fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code,
+ unsigned long address)
{
struct task_struct *tsk;
struct mm_struct *mm;
struct vm_area_struct * vma;
- unsigned long address;
unsigned long page;
int write;
siginfo_t info;
- /* get the address */
- __asm__("movl %%cr2,%0":"=r" (address));
+ /* Set the "privileged fault" bit to something sane. */
+ error_code &= 3;
+ error_code |= (regs->xcs & 2) << 1;
+ if (regs->eflags & X86_EFLAGS_VM)
+ error_code |= 4;
if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
SIGSEGV) == NOTIFY_STOP)
return;
+#if 0
/* It's safe to allow irq's after cr2 has been saved */
if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
local_irq_enable();
+#endif
tsk = current;
@@ -446,9 +454,10 @@ no_context:
printk(" at virtual address %08lx\n",address);
printk(KERN_ALERT " printing eip:\n");
printk("%08lx\n", regs->eip);
- asm("movl %%cr3,%0":"=r" (page));
- page = ((unsigned long *) __va(page))[address >> 22];
- printk(KERN_ALERT "*pde = %08lx\n", page);
+ page = ((unsigned long *) per_cpu(cur_pgd, smp_processor_id()))[address >> 22];
+ printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page,
+ machine_to_phys(page));
/*
* We must not directly access the pte in the highpte
* case, the page table might be allocated in highmem.
@@ -459,8 +468,10 @@ no_context:
if (page & 1) {
page &= PAGE_MASK;
address &= 0x003ff000;
+ page = machine_to_phys(page);
page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
- printk(KERN_ALERT "*pte = %08lx\n", page);
+ printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page,
+ machine_to_phys(page));
}
#endif
die("Oops", regs, error_code);
@@ -514,14 +525,12 @@ vmalloc_fault:
* an interrupt in the middle of a task switch..
*/
int index = pgd_index(address);
- unsigned long pgd_paddr;
pgd_t *pgd, *pgd_k;
pud_t *pud, *pud_k;
pmd_t *pmd, *pmd_k;
pte_t *pte_k;
- asm("movl %%cr3,%0":"=r" (pgd_paddr));
- pgd = index + (pgd_t *)__va(pgd_paddr);
+ pgd = index + per_cpu(cur_pgd, smp_processor_id());
pgd_k = init_mm.pgd + index;
if (!pgd_present(*pgd_k))
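
On the error_code rewrite at the top of do_page_fault() above: Xen passes
the faulting address as an argument (no cr2 read), and the handler
resynthesizes the i386 error-code bits from saved state -- bit 0
present/protection, bit 1 write, bit 2 user mode. The same logic as a
self-contained function (hypothetical name, not part of the patch):

static unsigned long xen_fault_error_code(unsigned long error_code,
					  unsigned long xcs,
					  int vm86_mode)
{
	error_code &= 3;		/* keep present/write bits */
	error_code |= (xcs & 2) << 1;	/* ring 2/3 CS => user-mode bit */
	if (vm86_mode)
		error_code |= 4;	/* vm86 counts as user mode too */
	return error_code;
}
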
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/mm/highmem.c
linux-2.6-xen-sparse/arch/i386/mm/highmem.c
--- pristine-linux-2.6.12/arch/i386/mm/highmem.c 2005-06-17
12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mm/highmem.c 2005-07-28 13:17:07.000000000
-0700
@@ -25,7 +25,7 @@ void kunmap(struct page *page)
* However when holding an atomic kmap it is not legal to sleep, so atomic
* kmaps are appropriate for short, tight code paths only.
*/
-void *kmap_atomic(struct page *page, enum km_type type)
+static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot)
{
enum fixed_addresses idx;
unsigned long vaddr;
@@ -41,12 +41,23 @@ void *kmap_atomic(struct page *page, enu
if (!pte_none(*(kmap_pte-idx)))
BUG();
#endif
- set_pte(kmap_pte-idx, mk_pte(page, kmap_prot));
+ set_pte(kmap_pte-idx, mk_pte(page, prot));
__flush_tlb_one(vaddr);
return (void*) vaddr;
}
+void *kmap_atomic(struct page *page, enum km_type type)
+{
+ return __kmap_atomic(page, type, kmap_prot);
+}
+
+/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */
+void *kmap_atomic_pte(struct page *page, enum km_type type)
+{
+ return __kmap_atomic(page, type, PAGE_KERNEL_RO);
+}
+
void kunmap_atomic(void *kvaddr, enum km_type type)
{
#ifdef CONFIG_DEBUG_HIGHMEM
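
kmap_atomic_pte() exists because Xen requires live page tables to be
mapped read-only; a caller inspecting a highmem page-table page therefore
maps it with PAGE_KERNEL_RO instead of kmap_prot. A kernel-context sketch
of such a caller (hypothetical function, not part of the patch):

/* Read one entry from a highmem page-table page without writing it. */
static unsigned long read_pte_entry(struct page *pte_page, int idx)
{
	unsigned long *ptep, val;

	ptep = (unsigned long *)kmap_atomic_pte(pte_page, KM_PTE0);
	val = ptep[idx];	/* read-only access is always safe */
	kunmap_atomic(ptep, KM_PTE0);
	return val;
}
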
diff -x mkbuildtree -x include -x xen -x SCCS -urPp
pristine-linux-2.6.12/arch/i386/mm/hypervisor.c
linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c
--- pristine-linux-2.6.12/arch/i386/mm/hypervisor.c 1969-12-31
16:00:00.000000000 -0800
+++ linux-2.6-xen-sparse/arch/i386/mm/hypervisor.c 2005-07-28
13:17:07.000000000 -0700
@@ -0,0 +1,363 @@
+/******************************************************************************
+ * mm/hypervisor.c
+ *
+ * Update page tables via the hypervisor.
+ *
+ * Copyright (c) 2002-2004, K A Fraser
+ *
+ * This file may be distributed separately from the Linux kernel, or
+ * incorporated into other software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/balloon.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#endif
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)
+#define pte_offset_kernel pte_offset
+#define pud_t pgd_t
+#define pud_offset(d, va) d
+#elif defined(CONFIG_X86_64)
+#define pmd_val_ma(v) (v).pmd
+#else
+#ifdef CONFIG_X86_PAE
+# define pmd_val_ma(v) ((v).pmd)
+# define pud_val_ma(v) ((v).pgd.pgd)
+#else
+# define pmd_val_ma(v) ((v).pud.pgd.pgd)
+#endif
+#endif
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+void xen_l1_entry_update(pte_t *ptr, pte_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = pte_val_ma(val);
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_l2_entry_update(pmd_t *ptr, pmd_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = pmd_val_ma(val);
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+#ifdef CONFIG_X86_PAE
+void xen_l3_entry_update(pud_t *ptr, pud_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = pud_val_ma(val);
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void xen_l3_entry_update(pud_t *ptr, pud_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = val.pud;
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_l4_entry_update(pgd_t *ptr, pgd_t val)
+{
+ mmu_update_t u;
+ u.ptr = virt_to_machine(ptr);
+ u.val = val.pgd;
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_XEN_SHADOW_MODE */
+
+void xen_machphys_update(unsigned long mfn, unsigned long pfn)
+{
+ mmu_update_t u;
+ u.ptr = (mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE;
+ u.val = pfn;
+ BUG_ON(HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pt_switch(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_NEW_BASEPTR;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_new_user_pt(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_NEW_USER_BASEPTR;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_tlb_flush(void)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_invlpg(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_INVLPG_LOCAL;
+ op.linear_addr = ptr & PAGE_MASK;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#ifdef CONFIG_SMP
+
+void xen_tlb_flush_all(void)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_TLB_FLUSH_ALL;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_tlb_flush_mask(cpumask_t *mask)
+{
+ struct mmuext_op op;
+ if ( cpus_empty(*mask) )
+ return;
+ op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+ op.vcpumask = mask->bits;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_invlpg_all(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_INVLPG_ALL;
+ op.linear_addr = ptr & PAGE_MASK;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_invlpg_mask(cpumask_t *mask, unsigned long ptr)
+{
+ struct mmuext_op op;
+ if ( cpus_empty(*mask) )
+ return;
+ op.cmd = MMUEXT_INVLPG_MULTI;
+ op.vcpumask = mask->bits;
+ op.linear_addr = ptr & PAGE_MASK;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#endif /* CONFIG_SMP */
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+void xen_pgd_pin(unsigned long ptr)
+{
+ struct mmuext_op op;
+#ifdef CONFIG_X86_64
+ op.cmd = MMUEXT_PIN_L4_TABLE;
+#elif defined(CONFIG_X86_PAE)
+ op.cmd = MMUEXT_PIN_L3_TABLE;
+#else
+ op.cmd = MMUEXT_PIN_L2_TABLE;
+#endif
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pgd_unpin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_UNPIN_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pte_pin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_PIN_L1_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pte_unpin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_UNPIN_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+#ifdef CONFIG_X86_64
+void xen_pud_pin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_PIN_L3_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pud_unpin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_UNPIN_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pmd_pin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_PIN_L2_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_pmd_unpin(unsigned long ptr)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_UNPIN_TABLE;
+ op.mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+#endif /* CONFIG_X86_64 */
+#endif /* CONFIG_XEN_SHADOW_MODE */
+
+void xen_set_ldt(unsigned long ptr, unsigned long len)
+{
+ struct mmuext_op op;
+ op.cmd = MMUEXT_SET_LDT;
+ op.linear_addr = ptr;
+ op.nr_ents = len;
+ BUG_ON(HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0);
+}
+
+void xen_contig_memory(unsigned long vstart, unsigned int order)
+{
+ /*
+ * Ensure multi-page extents are contiguous in machine memory. This code
+ * could be cleaned up some, and the number of hypercalls reduced.
+ */
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long mfn, i, flags;
+
+ scrub_pages(vstart, 1 << order);
+
+ balloon_lock(flags);
+
+ /* 1. Zap current PTEs, giving away the underlying pages. */
+ for (i = 0; i < (1<<order); i++) {
+ pgd = pgd_offset_k(vstart + (i*PAGE_SIZE));
+ pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE)));
+ pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE)));
+ pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
+ mfn = pte_mfn(*pte);
+ HYPERVISOR_update_va_mapping(
+ vstart + (i*PAGE_SIZE), __pte_ma(0), 0);
+ phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
+ INVALID_P2M_ENTRY;
+ BUG_ON(HYPERVISOR_dom_mem_op(
+ MEMOP_decrease_reservation, &mfn, 1, 0) != 1);
+ }
+
+ /* 2. Get a new contiguous memory extent. */
+ BUG_ON(HYPERVISOR_dom_mem_op(
+ MEMOP_increase_reservation, &mfn, 1, order) != 1);
+
+ /* 3. Map the new extent in place of old pages. */
+ for (i = 0; i < (1<<order); i++) {
+ HYPERVISOR_update_va_mapping(
+ vstart + (i*PAGE_SIZE),
+ __pte_ma(((mfn+i)<<PAGE_SHIFT)|__PAGE_KERNEL), 0);
+ xen_machphys_update(mfn+i, (__pa(vstart)>>PAGE_SHIFT)+i);
+ phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] = mfn+i;
+ }
+
+ flush_tlb_all();
+
+ balloon_unlock(flags);
+}
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+
+unsigned long allocate_empty_lowmem_region(unsigned long pages)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ unsigned long *pfn_array;
+ unsigned long vstart;
+ unsigned long i;
+ unsigned int order = get_order(pages*PAGE_SIZE);
+
+ vstart = __get_free_pages(GFP_KERNEL, order);
+ if ( vstart == 0 )
+ return 0UL;
+
+ scrub_pages(vstart, 1 << order);
+
+ pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
+ if ( pfn_array == NULL )
+ BUG();
+
+ for ( i = 0; i < (1<<order); i++ )
+ {
+ pgd = pgd_offset_k( (vstart + (i*PAGE_SIZE)));
+ pud = pud_offset(pgd, (vstart + (i*PAGE_SIZE)));
+ pmd = pmd_offset(pud, (vstart + (i*PAGE_SIZE)));
+ pte = pte_offset_kernel(pmd, (vstart + (i*PAGE_SIZE)));
+ pfn_array[i] = pte_mfn(*pte);
+#ifdef CONFIG_X86_64
+ xen_l1_entry_update(pte, __pte(0));
+#else
+ HYPERVISOR_update_va_mapping(vstart + (i*PAGE_SIZE), __pte_ma(0), 0);
+#endif
+ phys_to_machine_mapping[(__pa(vstart)>>PAGE_SHIFT)+i] =
+ INVALID_P2M_ENTRY;
+ }
+
+ flush_tlb_all();
+
+ balloon_put_pages(pfn_array, 1 << order);
+
+ vfree(pfn_array);
+
+ return vstart;
+}
+
+#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
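A note on the wrappers above: each one fills a single struct mmuext_op and
issues one hypercall, so a sequence of pins costs a hypercall apiece.
HYPERVISOR_mmuext_op() already takes an array and a count, so callers that
need several operations could batch them. A minimal sketch under that
assumption (batch_pin_and_flush is a hypothetical helper, not part of this
patch; it reuses only the commands and the hypercall signature shown above):

	/* Hedged sketch: pin a PTE page and flush the TLB in one hypercall
	 * rather than two. Assumes the mmuext_op layout used above. */
	static void batch_pin_and_flush(unsigned long ptr)
	{
		struct mmuext_op op[2];

		op[0].cmd = MMUEXT_PIN_L1_TABLE;
		op[0].mfn = pfn_to_mfn(ptr >> PAGE_SHIFT);
		op[1].cmd = MMUEXT_TLB_FLUSH_ALL;

		BUG_ON(HYPERVISOR_mmuext_op(op, 2, NULL, DOMID_SELF) < 0);
	}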
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/init.c linux-2.6-xen-sparse/arch/i386/mm/init.c
--- pristine-linux-2.6.12/arch/i386/mm/init.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mm/init.c 2005-07-28 13:17:07.000000000 -0700
@@ -39,6 +39,7 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
+#include <asm-xen/hypervisor.h>
unsigned int __VMALLOC_RESERVE = 128 << 20;
@@ -56,9 +57,10 @@ static pmd_t * __init one_md_table_init(
{
pud_t *pud;
pmd_t *pmd_table;
-
+
#ifdef CONFIG_X86_PAE
pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+ make_page_readonly(pmd_table);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
if (pmd_table != pmd_offset(pud, 0))
@@ -79,6 +81,7 @@ static pte_t * __init one_page_table_ini
{
if (pmd_none(*pmd)) {
 pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+ make_page_readonly(page_table);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
if (page_table != pte_offset_kernel(pmd, 0))
BUG();
@@ -119,7 +122,7 @@ static void __init page_table_range_init
pud = pud_offset(pgd, vaddr);
pmd = pmd_offset(pud, vaddr);
 for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
- if (pmd_none(*pmd))
+ if (vaddr < HYPERVISOR_VIRT_START && pmd_none(*pmd))
one_page_table_init(pmd);
vaddr += PMD_SIZE;
@@ -148,16 +151,36 @@ static void __init kernel_physical_mappi
pte_t *pte;
int pgd_idx, pmd_idx, pte_ofs;
+ unsigned long max_ram_pfn = xen_start_info.nr_pages;
+ if (max_ram_pfn > max_low_pfn)
+ max_ram_pfn = max_low_pfn;
+
pgd_idx = pgd_index(PAGE_OFFSET);
pgd = pgd_base + pgd_idx;
pfn = 0;
+ pmd_idx = pmd_index(PAGE_OFFSET);
+ pte_ofs = pte_index(PAGE_OFFSET);
for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+#ifdef CONFIG_XEN
+ /*
+ * Native Linux does not have PAE paging enabled yet at this
+ * point. When running as a Xen domain we are already in PAE
+ * mode, so we can't simply hook in an empty pmd; that would
+ * kill the mappings we are currently using ...
+ */
+ pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
+#else
pmd = one_md_table_init(pgd);
+#endif
if (pfn >= max_low_pfn)
continue;
- for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
+ pmd += pmd_idx;
+ for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
+ if (address >= HYPERVISOR_VIRT_START)
+ continue;
/* Map with big pages if possible, otherwise create
normal page tables. */
if (cpu_has_pse) {
@@ -171,14 +194,20 @@ static void __init kernel_physical_mappi
} else {
pte = one_page_table_init(pmd);
- for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
+ pte += pte_ofs;
+ for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) {
+ /* XEN: Only map initial RAM allocation. */
+ if ((pfn >= max_ram_pfn) || pte_present(*pte))
+ continue;
 if (is_kernel_text(address))
 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC));
 else
 set_pte(pte, pfn_pte(pfn, PAGE_KERNEL));
}
+ pte_ofs = 0;
}
}
+ pmd_idx = 0;
}
}
@@ -271,7 +300,8 @@ void __init one_highpage_init(struct pag
ClearPageReserved(page);
set_bit(PG_highmem, &page->flags);
set_page_count(page, 1);
- __free_page(page);
+ if (pfn < xen_start_info.nr_pages)
+ __free_page(page);
totalhigh_pages++;
} else
SetPageReserved(page);
@@ -308,6 +338,7 @@ static void __init pagetable_init (void)
{
unsigned long vaddr;
pgd_t *pgd_base = swapper_pg_dir;
+ pgd_t *old_pgd = (pgd_t *)xen_start_info.pt_base;
#ifdef CONFIG_X86_PAE
int i;
@@ -328,6 +359,45 @@ static void __init pagetable_init (void)
__PAGE_KERNEL_EXEC |= _PAGE_GLOBAL;
}
+ /*
+ * Switch to the proper init_mm page directory. Initialise it from the
+ * current page directory, write-protect the new page directory, then
+ * switch to it. We clean up by write-enabling and then freeing the old
+ * page dir.
+ */
+#ifndef CONFIG_X86_PAE
+ memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t));
+ make_page_readonly(pgd_base);
+ xen_pgd_pin(__pa(pgd_base));
+ load_cr3(pgd_base);
+ xen_pgd_unpin(__pa(old_pgd));
+ make_page_writable(old_pgd);
+ __flush_tlb_all();
+ free_bootmem(__pa(old_pgd), PAGE_SIZE);
+#else
+ {
+ pud_t *old_pud = pud_offset(old_pgd+3, PAGE_OFFSET);
+ pmd_t *old_pmd = pmd_offset(old_pud, PAGE_OFFSET);
+ pmd_t *new_pmd = alloc_bootmem_low_pages(PAGE_SIZE);
+
+ memcpy(new_pmd, old_pmd, PAGE_SIZE);
+ memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t));
+ set_pgd(&pgd_base[3], __pgd(__pa(new_pmd) | _PAGE_PRESENT));
+
+ make_page_readonly(new_pmd);
+ make_page_readonly(pgd_base);
+ xen_pgd_pin(__pa(pgd_base));
+ load_cr3(pgd_base);
+ xen_pgd_unpin(__pa(old_pgd));
+ make_page_writable(old_pgd);
+ make_page_writable(old_pmd);
+ __flush_tlb_all();
+
+ free_bootmem(__pa(old_pgd), PAGE_SIZE);
+ free_bootmem(__pa(old_pmd), PAGE_SIZE);
+ }
+#endif
+
+ init_mm.context.pinned = 1;
kernel_physical_mapping_init(pgd_base);
remap_numa_kva();
@@ -340,7 +410,7 @@ static void __init pagetable_init (void)
permanent_kmaps_init(pgd_base);
-#ifdef CONFIG_X86_PAE
+#if 0 /* def CONFIG_X86_PAE */
/*
* Add low memory identity-mappings - SMP needs it when
* starting up on an AP from real-mode. In the non-PAE
@@ -348,7 +418,7 @@ static void __init pagetable_init (void)
* All user-space mappings are explicitly cleared after
* SMP startup.
*/
- pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
+ set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]);
#endif
}
@@ -383,7 +453,7 @@ void zap_low_mappings (void)
* us, because pgd_clear() is a no-op on i386.
*/
for (i = 0; i < USER_PTRS_PER_PGD; i++)
-#ifdef CONFIG_X86_PAE
+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
set_pgd(swapper_pg_dir+i, __pgd(0));
@@ -470,6 +540,10 @@ out:
*/
void __init paging_init(void)
{
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+ int i;
+#endif
+
#ifdef CONFIG_X86_PAE
set_nx();
if (nx_enabled)
@@ -478,12 +552,12 @@ void __init paging_init(void)
pagetable_init();
- load_cr3(swapper_pg_dir);
-
-#ifdef CONFIG_X86_PAE
+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
/*
* We will bail out later - printk doesn't work right now so
* the user would just see a hanging kernel.
+ * When running as a Xen domain we are already in PAE mode at
+ * this point.
*/
if (cpu_has_pae)
set_in_cr4(X86_CR4_PAE);
@@ -491,6 +565,22 @@ void __init paging_init(void)
__flush_tlb_all();
kmap_init();
+
+ /* Switch to the real shared_info page, and clear the dummy page. */
+ set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info);
+ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO);
+ memset(empty_zero_page, 0, sizeof(empty_zero_page));
+
+#ifdef CONFIG_XEN_PHYSDEV_ACCESS
+ /* Set up the mapping of the lower 1MB (ISA space). */
+ for (i = 0; i < NR_FIX_ISAMAPS; i++)
+ if (xen_start_info.flags & SIF_PRIVILEGED)
+ set_fixmap(FIX_ISAMAP_BEGIN - i, i * PAGE_SIZE);
+ else
+ __set_fixmap(FIX_ISAMAP_BEGIN - i,
+ virt_to_machine(empty_zero_page),
+ PAGE_KERNEL_RO);
+#endif
}
/*
@@ -539,6 +629,7 @@ void __init mem_init(void)
int codesize, reservedpages, datasize, initsize;
int tmp;
int bad_ppro;
+ unsigned long pfn;
#ifndef CONFIG_DISCONTIGMEM
if (!mem_map)
@@ -564,9 +655,18 @@ void __init mem_init(void)
#else
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
#endif
-
+ printk("vmalloc area: %lx-%lx, maxmem %lx\n",
+ VMALLOC_START,VMALLOC_END,MAXMEM);
+ BUG_ON(VMALLOC_START > VMALLOC_END);
+
/* this will put all low memory onto the freelists */
totalram_pages += free_all_bootmem();
+ /* XEN: init and count low-mem pages outside initial allocation. */
+ for (pfn = xen_start_info.nr_pages; pfn < max_low_pfn; pfn++) {
+ ClearPageReserved(&mem_map[pfn]);
+ set_page_count(&mem_map[pfn], 1);
+ totalram_pages++;
+ }
reservedpages = 0;
for (tmp = 0; tmp < max_low_pfn; tmp++)
@@ -630,11 +730,16 @@ void __init pgtable_cache_init(void)
panic("pgtable_cache_init(): cannot create pmd cache");
}
pgd_cache = kmem_cache_create("pgd",
+#if 0 /* How does this work in native Linux? */
PTRS_PER_PGD*sizeof(pgd_t),
PTRS_PER_PGD*sizeof(pgd_t),
+#else
+ PAGE_SIZE,
+ PAGE_SIZE,
+#endif
0,
pgd_ctor,
- PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
+ pgd_dtor);
if (!pgd_cache)
panic("pgtable_cache_init(): Cannot create pgd cache");
}
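One remark on the pgd_cache change just above: under Xen the pgd is
write-protected and pinned as a whole page (see the pgd_free()/mm_pin()
changes in the pgtable.c diff below), so a pgd cannot share its page with
other slab objects; forcing the object size and alignment to PAGE_SIZE
guarantees that. A trivial sketch of the invariant this buys
(check_pgd_page is a hypothetical debug helper, not in this patch):

	/* Hedged sketch: with PAGE_SIZE-sized, PAGE_SIZE-aligned slab
	 * objects, every pgd owns its page outright, so making it
	 * read-only cannot affect unrelated allocations. */
	static void check_pgd_page(pgd_t *pgd)
	{
		BUG_ON(((unsigned long)pgd & ~PAGE_MASK) != 0);
	}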
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/ioremap.c linux-2.6-xen-sparse/arch/i386/mm/ioremap.c
--- pristine-linux-2.6.12/arch/i386/mm/ioremap.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mm/ioremap.c 2005-07-28 13:17:07.000000000 -0700
@@ -11,91 +11,54 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/module.h>
#include <asm/io.h>
#include <asm/fixmap.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
-#define ISA_START_ADDRESS 0xa0000
-#define ISA_END_ADDRESS 0x100000
+#ifndef CONFIG_XEN_PHYSDEV_ACCESS
-static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, unsigned long phys_addr, unsigned long flags)
+void * __ioremap(unsigned long phys_addr, unsigned long size,
+ unsigned long flags)
{
- pte_t *pte;
- unsigned long pfn;
-
- pfn = phys_addr >> PAGE_SHIFT;
- pte = pte_alloc_kernel(&init_mm, pmd, addr);
- if (!pte)
- return -ENOMEM;
- do {
- BUG_ON(!pte_none(*pte));
- set_pte(pte, pfn_pte(pfn, __pgprot(_PAGE_PRESENT | _PAGE_RW |
- _PAGE_DIRTY | _PAGE_ACCESSED | flags)));
- pfn++;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- return 0;
+ return NULL;
}
-static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, unsigned long phys_addr, unsigned long flags)
+void *ioremap_nocache (unsigned long phys_addr, unsigned long size)
{
- pmd_t *pmd;
- unsigned long next;
-
- phys_addr -= addr;
- pmd = pmd_alloc(&init_mm, pud, addr);
- if (!pmd)
- return -ENOMEM;
- do {
- next = pmd_addr_end(addr, end);
- if (ioremap_pte_range(pmd, addr, next, phys_addr + addr, flags))
- return -ENOMEM;
- } while (pmd++, addr = next, addr != end);
- return 0;
+ return NULL;
}
-static inline int ioremap_pud_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, unsigned long phys_addr, unsigned long flags)
+void iounmap(volatile void __iomem *addr)
{
- pud_t *pud;
- unsigned long next;
+}
- phys_addr -= addr;
- pud = pud_alloc(&init_mm, pgd, addr);
- if (!pud)
- return -ENOMEM;
- do {
- next = pud_addr_end(addr, end);
- if (ioremap_pmd_range(pud, addr, next, phys_addr + addr, flags))
- return -ENOMEM;
- } while (pud++, addr = next, addr != end);
- return 0;
+void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
+{
+ return NULL;
}
-static int ioremap_page_range(unsigned long addr,
- unsigned long end, unsigned long phys_addr, unsigned long flags)
+void __init bt_iounmap(void *addr, unsigned long size)
{
- pgd_t *pgd;
- unsigned long next;
- int err;
+}
- BUG_ON(addr >= end);
- flush_cache_all();
- phys_addr -= addr;
- pgd = pgd_offset_k(addr);
- spin_lock(&init_mm.page_table_lock);
- do {
- next = pgd_addr_end(addr, end);
- err = ioremap_pud_range(pgd, addr, next, phys_addr+addr, flags);
- if (err)
- break;
- } while (pgd++, addr = next, addr != end);
- spin_unlock(&init_mm.page_table_lock);
- flush_tlb_all();
- return err;
+#else
+
+/*
+ * Does @address reside within a non-highmem page that is local to this virtual
+ * machine (i.e., not an I/O page, nor a memory page belonging to another VM)?
+ * See the comment that accompanies pte_pfn() in pgtable-2level.h to understand
+ * why this works.
+ */
+static inline int is_local_lowmem(unsigned long address)
+{
+ extern unsigned long max_low_pfn;
+ unsigned long mfn = address >> PAGE_SHIFT;
+ unsigned long pfn = mfn_to_pfn(mfn);
+ return ((pfn < max_low_pfn) && (pfn_to_mfn(pfn) == mfn));
}
/*
@@ -116,31 +79,36 @@ void __iomem * __ioremap(unsigned long p
void __iomem * addr;
struct vm_struct * area;
unsigned long offset, last_addr;
+ domid_t domid = DOMID_IO;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
- if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
- return (void __iomem *) phys_to_virt(phys_addr);
+ if (phys_addr >= 0x0 && last_addr < 0x100000)
+ return isa_bus_to_virt(phys_addr);
+#endif
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- if (phys_addr <= virt_to_phys(high_memory - 1)) {
+ if (is_local_lowmem(phys_addr)) {
char *t_addr, *t_end;
struct page *page;
- t_addr = __va(phys_addr);
+ t_addr = bus_to_virt(phys_addr);
t_end = t_addr + (size - 1);
 for(page = virt_to_page(t_addr); page <= virt_to_page(t_end); page++)
if(!PageReserved(page))
return NULL;
+
+ domid = DOMID_SELF;
}
/*
@@ -158,8 +126,10 @@ void __iomem * __ioremap(unsigned long p
return NULL;
area->phys_addr = phys_addr;
addr = (void __iomem *) area->addr;
- if (ioremap_page_range((unsigned long) addr,
- (unsigned long) addr + size, phys_addr, flags)) {
+ if (direct_remap_area_pages(&init_mm, (unsigned long) addr, phys_addr,
+ size, __pgprot(_PAGE_PRESENT | _PAGE_RW |
+ _PAGE_DIRTY | _PAGE_ACCESSED
+ | flags), domid)) {
vunmap((void __force *) addr);
return NULL;
}
@@ -199,8 +169,8 @@ void __iomem *ioremap_nocache (unsigned
/* Guaranteed to be > phys_addr, as per __ioremap() */
last_addr = phys_addr + size - 1;
- if (last_addr < virt_to_phys(high_memory) - 1) {
- struct page *ppage = virt_to_page(__va(phys_addr));
+ if (is_local_lowmem(last_addr)) {
+ struct page *ppage = virt_to_page(bus_to_virt(phys_addr));
unsigned long npages;
phys_addr &= PAGE_MASK;
@@ -227,32 +197,24 @@ void iounmap(volatile void __iomem *addr
{
struct vm_struct *p;
if ((void __force *) addr <= high_memory)
+ return;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ if ((unsigned long) addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
return;
-
- /*
- * __ioremap special-cases the PCI/ISA range by not instantiating a
- * vm_area and by simply returning an address into the kernel mapping
- * of ISA space. So handle that here.
- */
- if (addr >= phys_to_virt(ISA_START_ADDRESS) &&
- addr < phys_to_virt(ISA_END_ADDRESS))
- return;
-
- write_lock(&vmlist_lock);
- p = __remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr));
+#endif
+ p = remove_vm_area((void *) (PAGE_MASK & (unsigned long __force) addr));
if (!p) {
- printk("iounmap: bad address %p\n", addr);
- goto out_unlock;
+ printk("__iounmap: bad address %p\n", addr);
+ return;
}
- if ((p->flags >> 20) && p->phys_addr < virt_to_phys(high_memory) - 1) {
- change_page_attr(virt_to_page(__va(p->phys_addr)),
- p->size >> PAGE_SHIFT,
- PAGE_KERNEL);
+ if ((p->flags >> 20) && is_local_lowmem(p->phys_addr)) {
+ /* p->size includes the guard page, but cpa doesn't like that */
+ change_page_attr(virt_to_page(bus_to_virt(p->phys_addr)),
+ (p->size - PAGE_SIZE) >> PAGE_SHIFT,
+ PAGE_KERNEL);
global_flush_tlb();
}
-out_unlock:
- write_unlock(&vmlist_lock);
kfree(p);
}
@@ -267,11 +229,13 @@ void __init *bt_ioremap(unsigned long ph
if (!size || last_addr < phys_addr)
return NULL;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
- if (phys_addr >= ISA_START_ADDRESS && last_addr < ISA_END_ADDRESS)
- return phys_to_virt(phys_addr);
+ if (phys_addr >= 0x0 && last_addr < 0x100000)
+ return isa_bus_to_virt(phys_addr);
+#endif
/*
* Mappings have to be page-aligned
@@ -310,6 +274,10 @@ void __init bt_iounmap(void *addr, unsig
virt_addr = (unsigned long)addr;
if (virt_addr < fix_to_virt(FIX_BTMAP_BEGIN))
return;
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ if (virt_addr >= fix_to_virt(FIX_ISAMAP_BEGIN))
+ return;
+#endif
offset = virt_addr & ~PAGE_MASK;
nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
@@ -320,3 +288,155 @@ void __init bt_iounmap(void *addr, unsig
--nrpages;
}
}
+
+#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
+
+/* These hacky macros avoid phys->machine translations. */
+#define __direct_pte(x) ((pte_t) { (x) } )
+#define __direct_mk_pte(page_nr,pgprot) \
+ __direct_pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot))
+#define direct_mk_pte_phys(physpage, pgprot) \
+ __direct_mk_pte((physpage) >> PAGE_SHIFT, pgprot)
+
+static inline void direct_remap_area_pte(pte_t *pte,
+ unsigned long address,
+ unsigned long size,
+ mmu_update_t **v)
+{
+ unsigned long end;
+
+ address &= ~PMD_MASK;
+ end = address + size;
+ if (end > PMD_SIZE)
+ end = PMD_SIZE;
+ if (address >= end)
+ BUG();
+
+ do {
+ (*v)->ptr = virt_to_machine(pte);
+ (*v)++;
+ address += PAGE_SIZE;
+ pte++;
+ } while (address && (address < end));
+}
+
+static inline int direct_remap_area_pmd(struct mm_struct *mm,
+ pmd_t *pmd,
+ unsigned long address,
+ unsigned long size,
+ mmu_update_t **v)
+{
+ unsigned long end;
+
+ address &= ~PGDIR_MASK;
+ end = address + size;
+ if (end > PGDIR_SIZE)
+ end = PGDIR_SIZE;
+ if (address >= end)
+ BUG();
+ do {
+ pte_t *pte = (mm == &init_mm) ?
+ pte_alloc_kernel(mm, pmd, address) :
+ pte_alloc_map(mm, pmd, address);
+ if (!pte)
+ return -ENOMEM;
+ direct_remap_area_pte(pte, address, end - address, v);
+ pte_unmap(pte);
+ address = (address + PMD_SIZE) & PMD_MASK;
+ pmd++;
+ } while (address && (address < end));
+ return 0;
+}
+
+int __direct_remap_area_pages(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long size,
+ mmu_update_t *v)
+{
+ pgd_t * dir;
+ unsigned long end = address + size;
+ int error;
+
+ dir = pgd_offset(mm, address);
+ if (address >= end)
+ BUG();
+ spin_lock(&mm->page_table_lock);
+ do {
+ pud_t *pud;
+ pmd_t *pmd;
+
+ error = -ENOMEM;
+ pud = pud_alloc(mm, dir, address);
+ if (!pud)
+ break;
+ pmd = pmd_alloc(mm, pud, address);
+ if (!pmd)
+ break;
+ error = 0;
+ direct_remap_area_pmd(mm, pmd, address, end - address, &v);
+ address = (address + PGDIR_SIZE) & PGDIR_MASK;
+ dir++;
+
+ } while (address && (address < end));
+ spin_unlock(&mm->page_table_lock);
+ return error;
+}
+
+
+int direct_remap_area_pages(struct mm_struct *mm,
+ unsigned long address,
+ unsigned long machine_addr,
+ unsigned long size,
+ pgprot_t prot,
+ domid_t domid)
+{
+ int i;
+ unsigned long start_address;
+#define MAX_DIRECTMAP_MMU_QUEUE 130
+ mmu_update_t u[MAX_DIRECTMAP_MMU_QUEUE], *v = u;
+
+ start_address = address;
+
+ flush_cache_all();
+
+ for (i = 0; i < size; i += PAGE_SIZE) {
+ if ((v - u) == MAX_DIRECTMAP_MMU_QUEUE) {
+ /* Fill in the PTE pointers. */
+ __direct_remap_area_pages(mm,
+ start_address,
+ address-start_address,
+ u);
+
+ if (HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0)
+ return -EFAULT;
+ v = u;
+ start_address = address;
+ }
+
+ /*
+ * Fill in the machine address: PTE ptr is done later by
+ * __direct_remap_area_pages().
+ */
+ v->val = (machine_addr & PAGE_MASK) | pgprot_val(prot);
+
+ machine_addr += PAGE_SIZE;
+ address += PAGE_SIZE;
+ v++;
+ }
+
+ if (v != u) {
+ /* Get the PTE pointers filled in. */
+ __direct_remap_area_pages(mm,
+ start_address,
+ address-start_address,
+ u);
+ if (unlikely(HYPERVISOR_mmu_update(u, v - u, NULL, domid) < 0))
+ return -EFAULT;
+ }
+
+ flush_tlb_all();
+
+ return 0;
+}
+
+EXPORT_SYMBOL(direct_remap_area_pages);
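direct_remap_area_pages() is the exported workhorse here; the reworked
__ioremap() above is the canonical caller. For reviewers, a condensed usage
sketch for mapping a machine (or foreign-domain) address range, mirroring
what __ioremap() does; map_machine_range is a hypothetical helper, error
handling and page-alignment checks are abbreviated, and it assumes the
stock 2.6.12 get_vm_area()/vunmap() interfaces:

	/* Hedged sketch: map size bytes at machine_addr into kernel
	 * virtual space on behalf of domain domid. */
	static void __iomem *map_machine_range(unsigned long machine_addr,
					       unsigned long size, domid_t domid)
	{
		struct vm_struct *area = get_vm_area(size, VM_IOREMAP);
		if (area == NULL)
			return NULL;
		if (direct_remap_area_pages(&init_mm, (unsigned long)area->addr,
					    machine_addr, size,
					    __pgprot(_PAGE_PRESENT | _PAGE_RW |
						     _PAGE_DIRTY | _PAGE_ACCESSED),
					    domid)) {
			vunmap(area->addr);
			return NULL;
		}
		return (void __iomem *)area->addr;
	}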
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/Makefile linux-2.6-xen-sparse/arch/i386/mm/Makefile
--- pristine-linux-2.6.12/arch/i386/mm/Makefile 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mm/Makefile 2005-07-28 13:17:07.000000000 -0700
@@ -2,9 +2,23 @@
# Makefile for the linux i386-specific parts of the memory manager.
#
-obj-y := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o mmap.o
+XENARCH := $(subst ",,$(CONFIG_XENARCH))
-obj-$(CONFIG_DISCONTIGMEM) += discontig.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+CFLAGS += -Iarch/$(XENARCH)/mm
+
+obj-y := init.o pgtable.o fault.o ioremap.o hypervisor.o
+c-obj-y := extable.o mmap.o pageattr.o
+
+c-obj-$(CONFIG_DISCONTIGMEM) += discontig.o
+c-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
obj-$(CONFIG_HIGHMEM) += highmem.o
-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
+c-obj-$(CONFIG_BOOT_IOREMAP) += boot_ioremap.o
+
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+ @ln -fsn $(srctree)/arch/i386/mm/$(notdir $@) $@
+
+obj-y += $(c-obj-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/mm/pgtable.c linux-2.6-xen-sparse/arch/i386/mm/pgtable.c
--- pristine-linux-2.6.12/arch/i386/mm/pgtable.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/mm/pgtable.c 2005-07-28 13:17:07.000000000 -0700
@@ -21,6 +21,10 @@
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <asm/mmu_context.h>
+
+#include <asm-xen/foreign_page.h>
void show_mem(void)
{
@@ -93,6 +97,44 @@ static void set_pte_pfn(unsigned long va
}
/*
+ * Associate a virtual page frame with a given physical page frame
+ * and protection flags for that frame.
+ */
+static void set_pte_pfn_ma(unsigned long vaddr, unsigned long pfn,
+ pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = swapper_pg_dir + pgd_index(vaddr);
+ if (pgd_none(*pgd)) {
+ BUG();
+ return;
+ }
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ BUG();
+ return;
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ BUG();
+ return;
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+ /* <pfn,flags> stored as-is, to permit clearing entries */
+ set_pte(pte, pfn_pte_ma(pfn, flags));
+
+ /*
+ * It's enough to flush this one mapping.
+ * (PGE mappings get flushed as well)
+ */
+ __flush_tlb_one(vaddr);
+}
+
+/*
* Associate a large virtual page frame with a given physical page frame
* and protection flags for that frame. pfn is for the base of the page,
* vaddr is what the page gets mapped to - both must be properly aligned.
@@ -135,12 +177,26 @@ void __set_fixmap (enum fixed_addresses
BUG();
return;
}
- set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+ switch (idx) {
+ case FIX_WP_TEST:
+ case FIX_VSYSCALL:
+#ifdef CONFIG_X86_F00F_BUG
+ case FIX_F00F_IDT:
+#endif
+ set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
+ break;
+ default:
+ set_pte_pfn_ma(address, phys >> PAGE_SHIFT, flags);
+ break;
+ }
}
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
- return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+ pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+ if (pte)
+ make_page_readonly(pte);
+ return pte;
}
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -151,10 +207,29 @@ struct page *pte_alloc_one(struct mm_str
pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+ if (pte) {
+ SetPageForeign(pte, pte_free);
+ set_page_count(pte, 1);
+ }
#endif
+
return pte;
}
+void pte_free(struct page *pte)
+{
+ unsigned long va = (unsigned long)__va(page_to_pfn(pte)<<PAGE_SHIFT);
+
+ if (!pte_write(*virt_to_ptep(va)))
+ HYPERVISOR_update_va_mapping(
+ va, pfn_pte(page_to_pfn(pte), PAGE_KERNEL), 0);
+
+ ClearPageForeign(pte);
+ set_page_count(pte, 1);
+
+ __free_page(pte);
+}
+
void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
{
memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
@@ -199,14 +274,14 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
{
unsigned long flags;
- if (PTRS_PER_PMD == 1)
+ if (!HAVE_SHARED_KERNEL_PMD)
spin_lock_irqsave(&pgd_lock, flags);
memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
swapper_pg_dir + USER_PTRS_PER_PGD,
(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
- if (PTRS_PER_PMD > 1)
+ if (HAVE_SHARED_KERNEL_PMD)
return;
pgd_list_add(pgd);
@@ -214,11 +289,13 @@ void pgd_ctor(void *pgd, kmem_cache_t *c
memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
}
-/* never called when PTRS_PER_PMD > 1 */
void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
unsigned long flags; /* can be called from interrupt context */
+ if (HAVE_SHARED_KERNEL_PMD)
+ return;
+
spin_lock_irqsave(&pgd_lock, flags);
pgd_list_del(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
@@ -226,12 +303,30 @@ void pgd_dtor(void *pgd, kmem_cache_t *c
pgd_t *pgd_alloc(struct mm_struct *mm)
{
- int i;
+ int i = 0;
pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
if (PTRS_PER_PMD == 1 || !pgd)
return pgd;
+ if (!HAVE_SHARED_KERNEL_PMD) {
+ /* alloc and copy kernel pmd */
+ unsigned long flags;
+ pgd_t *copy_pgd = pgd_offset_k(PAGE_OFFSET);
+ pud_t *copy_pud = pud_offset(copy_pgd, PAGE_OFFSET);
+ pmd_t *copy_pmd = pmd_offset(copy_pud, PAGE_OFFSET);
+ pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+ if (0 == pmd)
+ goto out_oom;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ memcpy(pmd, copy_pmd, PAGE_SIZE);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+ make_page_readonly(pmd);
+ set_pgd(&pgd[USER_PTRS_PER_PGD], __pgd(1 + __pa(pmd)));
+ }
+
+ /* alloc user pmds */
for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
if (!pmd)
@@ -250,11 +345,207 @@ out_oom:
void pgd_free(pgd_t *pgd)
{
int i;
+ pte_t *ptep = virt_to_ptep(pgd);
+
+ if (!pte_write(*ptep)) {
+ xen_pgd_unpin(__pa(pgd));
+ HYPERVISOR_update_va_mapping(
+ (unsigned long)pgd,
+ pfn_pte(virt_to_phys(pgd)>>PAGE_SHIFT, PAGE_KERNEL),
+ 0);
+ }
/* in the PAE case user pgd entries are overwritten before usage */
- if (PTRS_PER_PMD > 1)
- for (i = 0; i < USER_PTRS_PER_PGD; ++i)
- kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+ if (PTRS_PER_PMD > 1) {
+ for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+ pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1);
+ kmem_cache_free(pmd_cache, pmd);
+ }
+ if (!HAVE_SHARED_KERNEL_PMD) {
+ pmd_t *pmd = (void *)__va(pgd_val(pgd[USER_PTRS_PER_PGD])-1);
+ make_page_writable(pmd);
+ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+ kmem_cache_free(pmd_cache, pmd);
+ }
+ }
/* in the non-PAE case, free_pgtables() clears user pgd entries */
kmem_cache_free(pgd_cache, pgd);
}
+
+#ifndef CONFIG_XEN_SHADOW_MODE
+void make_lowmem_page_readonly(void *va)
+{
+ pte_t *pte = virt_to_ptep(va);
+ set_pte(pte, pte_wrprotect(*pte));
+}
+
+void make_lowmem_page_writable(void *va)
+{
+ pte_t *pte = virt_to_ptep(va);
+ set_pte(pte, pte_mkwrite(*pte));
+}
+
+void make_page_readonly(void *va)
+{
+ pte_t *pte = virt_to_ptep(va);
+ set_pte(pte, pte_wrprotect(*pte));
+ if ( (unsigned long)va >= (unsigned long)high_memory )
+ {
+ unsigned long phys;
+ phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK);
+#ifdef CONFIG_HIGHMEM
+ if ( (phys >> PAGE_SHIFT) < highstart_pfn )
+#endif
+ make_lowmem_page_readonly(phys_to_virt(phys));
+ }
+}
+
+void make_page_writable(void *va)
+{
+ pte_t *pte = virt_to_ptep(va);
+ set_pte(pte, pte_mkwrite(*pte));
+ if ( (unsigned long)va >= (unsigned long)high_memory )
+ {
+ unsigned long phys;
+ phys = machine_to_phys(*(unsigned long *)pte & PAGE_MASK);
+#ifdef CONFIG_HIGHMEM
+ if ( (phys >> PAGE_SHIFT) < highstart_pfn )
+#endif
+ make_lowmem_page_writable(phys_to_virt(phys));
+ }
+}
+
+void make_pages_readonly(void *va, unsigned int nr)
+{
+ while ( nr-- != 0 )
+ {
+ make_page_readonly(va);
+ va = (void *)((unsigned long)va + PAGE_SIZE);
+ }
+}
+
+void make_pages_writable(void *va, unsigned int nr)
+{
+ while ( nr-- != 0 )
+ {
+ make_page_writable(va);
+ va = (void *)((unsigned long)va + PAGE_SIZE);
+ }
+}
+#endif /* CONFIG_XEN_SHADOW_MODE */
+
+LIST_HEAD(mm_unpinned);
+DEFINE_SPINLOCK(mm_unpinned_lock);
+
+static inline void mm_walk_set_prot(void *pt, pgprot_t flags)
+{
+ struct page *page = virt_to_page(pt);
+ unsigned long pfn = page_to_pfn(page);
+
+ if (PageHighMem(page))
+ return;
+ HYPERVISOR_update_va_mapping(
+ (unsigned long)__va(pfn << PAGE_SHIFT),
+ pfn_pte(pfn, flags), 0);
+}
+
+static void mm_walk(struct mm_struct *mm, pgprot_t flags)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int g,u,m;
+
+ pgd = mm->pgd;
+ for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) {
+ if (pgd_none(*pgd))
+ continue;
+ pud = pud_offset(pgd, 0);
+ if (PTRS_PER_PUD > 1) /* not folded */
+ mm_walk_set_prot(pud,flags);
+ for (u = 0; u < PTRS_PER_PUD; u++, pud++) {
+ if (pud_none(*pud))
+ continue;
+ pmd = pmd_offset(pud, 0);
+ if (PTRS_PER_PMD > 1) /* not folded */
+ mm_walk_set_prot(pmd,flags);
+ for (m = 0; m < PTRS_PER_PMD; m++, pmd++) {
+ if (pmd_none(*pmd))
+ continue;
+ pte = pte_offset_kernel(pmd,0);
+ mm_walk_set_prot(pte,flags);
+ }
+ }
+ }
+}
+
+void mm_pin(struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+
+ mm_walk(mm, PAGE_KERNEL_RO);
+ HYPERVISOR_update_va_mapping(
+ (unsigned long)mm->pgd,
+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL_RO),
+ UVMF_TLB_FLUSH);
+ xen_pgd_pin(__pa(mm->pgd));
+ mm->context.pinned = 1;
+ spin_lock(&mm_unpinned_lock);
+ list_del(&mm->context.unpinned);
+ spin_unlock(&mm_unpinned_lock);
+
+ spin_unlock(&mm->page_table_lock);
+}
+
+void mm_unpin(struct mm_struct *mm)
+{
+ spin_lock(&mm->page_table_lock);
+
+ xen_pgd_unpin(__pa(mm->pgd));
+ HYPERVISOR_update_va_mapping(
+ (unsigned long)mm->pgd,
+ pfn_pte(virt_to_phys(mm->pgd)>>PAGE_SHIFT, PAGE_KERNEL), 0);
+ mm_walk(mm, PAGE_KERNEL);
+ xen_tlb_flush();
+ mm->context.pinned = 0;
+ spin_lock(&mm_unpinned_lock);
+ list_add(&mm->context.unpinned, &mm_unpinned);
+ spin_unlock(&mm_unpinned_lock);
+
+ spin_unlock(&mm->page_table_lock);
+}
+
+void mm_pin_all(void)
+{
+ while (!list_empty(&mm_unpinned))
+ mm_pin(list_entry(mm_unpinned.next, struct mm_struct,
+ context.unpinned));
+}
+
+void _arch_exit_mmap(struct mm_struct *mm)
+{
+ struct task_struct *tsk = current;
+
+ task_lock(tsk);
+
+ /*
+ * We aggressively remove the defunct pgd from cr3: unmap_vmas() executes
+ * *much* faster this way, as avoiding TLB flushes allows bigger
+ * write-protect batches.
+ */
+ if ( tsk->active_mm == mm )
+ {
+ tsk->active_mm = &init_mm;
+ atomic_inc(&init_mm.mm_count);
+
+ switch_mm(mm, &init_mm, tsk);
+
+ atomic_dec(&mm->mm_count);
+ BUG_ON(atomic_read(&mm->mm_count) == 0);
+ }
+
+ task_unlock(tsk);
+
+ if ( mm->context.pinned && (atomic_read(&mm->mm_count) == 1) )
+ mm_unpin(mm);
+}
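The pin/unpin machinery above keeps every not-yet-pinned mm on the
mm_unpinned list so that mm_pin_all() can pin them wholesale, and mm_pin()
makes all of an mm's page-table pages read-only before handing the pgd to
the hypervisor. The expected caller is not part of this diff; a plausible
sketch of switch-time lazy pinning, under that assumption:

	/* Hedged sketch: hypothetical lazy-pinning hook on the context
	 * switch path (not in this patch). The hypervisor only accepts a
	 * pgd as a base table once all of its page-table pages are
	 * read-only, which is exactly what mm_pin() above arranges. */
	static inline void ensure_pinned(struct mm_struct *next)
	{
		if (!next->context.pinned)
			mm_pin(next);
	}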
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/pci/irq.c linux-2.6-xen-sparse/arch/i386/pci/irq.c
--- pristine-linux-2.6.12/arch/i386/pci/irq.c 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/pci/irq.c 2005-07-28 13:17:07.000000000 -0700
@@ -68,7 +68,8 @@ static struct irq_routing_table * __init
int i;
u8 sum;
- for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) {
+#ifdef CONFIG_XEN_PRIVILEGED_GUEST
+ for(addr = (u8 *) isa_bus_to_virt(0xf0000); addr < (u8 *) isa_bus_to_virt(0x100000); addr += 16) {
rt = (struct irq_routing_table *) addr;
if (rt->signature != PIRQ_SIGNATURE ||
rt->version != PIRQ_VERSION ||
@@ -83,6 +84,8 @@ static struct irq_routing_table * __init
return rt;
}
}
+#endif
+
return NULL;
}
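The irq.c change above matters because, under Xen, __va() translates
pseudophysical addresses, which do not cover the machine BIOS region; a
privileged guest instead reaches the real ISA space through the fixmap
established in paging_init() earlier in this patch, hence isa_bus_to_virt().
A small sketch of scanning that window (bios_region_sum is a hypothetical
helper; only isa_bus_to_virt() is from the patch):

	/* Hedged sketch: byte-sum a range of machine ISA space through
	 * the privileged guest's ISA fixmap, as the PIRQ table probe
	 * above does for its checksum. */
	static u8 bios_region_sum(unsigned long isa_addr, unsigned int len)
	{
		u8 *p = (u8 *)isa_bus_to_virt(isa_addr);
		u8 sum = 0;

		while (len--)
			sum += *p++;
		return sum;
	}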
diff -x mkbuildtree -x include -x xen -x SCCS -urPp pristine-linux-2.6.12/arch/i386/pci/Makefile linux-2.6-xen-sparse/arch/i386/pci/Makefile
--- pristine-linux-2.6.12/arch/i386/pci/Makefile 2005-06-17 12:48:29.000000000 -0700
+++ linux-2.6-xen-sparse/arch/i386/pci/Makefile 2005-07-28 13:17:07.000000000 -0700
@@ -1,14 +1,32 @@
-obj-y := i386.o
+XENARCH := $(subst ",,$(CONFIG_XENARCH))
-obj-$(CONFIG_PCI_BIOS) += pcbios.o
-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
-obj-$(CONFIG_PCI_DIRECT) += direct.o
+CFLAGS += -Iarch/$(XENARCH)/pci
-pci-y := fixup.o
-pci-$(CONFIG_ACPI_PCI) += acpi.o
-pci-y += legacy.o irq.o
+c-obj-y := i386.o
-pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
-pci-$(CONFIG_X86_NUMAQ) := numa.o irq.o
+c-obj-$(CONFIG_PCI_BIOS) += pcbios.o
+c-obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o
+c-obj-$(CONFIG_PCI_DIRECT) += direct.o
-obj-y += $(pci-y) common.o
+c-pci-y := fixup.o
+c-pci-$(CONFIG_ACPI_PCI) += acpi.o
+c-pci-y += legacy.o
+# Make sure irq.o gets linked in after legacy.o
+l-pci-y += irq.o
+
+c-pci-$(CONFIG_X86_VISWS) := visws.o fixup.o
+pci-$(CONFIG_X86_VISWS) :=
+c-pci-$(CONFIG_X86_NUMAQ) := numa.o
+pci-$(CONFIG_X86_NUMAQ) := irq.o
+
+obj-y += $(pci-y)
+c-obj-y += $(c-pci-y) common.o
+
+c-link :=
+
+$(patsubst %.o,$(obj)/%.c,$(c-obj-y) $(c-link)):
+ @ln -fsn $(srctree)/arch/i386/pci/$(notdir $@) $@
+
+obj-y += $(c-obj-y) $(l-pci-y)
+
+clean-files += $(patsubst %.o,%.c,$(c-obj-y) $(c-obj-) $(c-link))
_______________________________________________
Xen-merge mailing list
Xen-merge@xxxxxxxxxxxxxxxxxxx
http://lists.xensource.com/xen-merge